While working through the tutorial on this site, the following code spat out an error.
import tensorflow as tf
import numpy as np
import random, json, string
import keras
import keras.layers
import keras.models
import keras.optimizers
from keras.layers.wrappers import TimeDistributed
import keras.layers.embeddings
import keras.preprocessing.text
import keras.preprocessing.sequence
import keras.callbacks
mscoco = json.load(open('annotations/captions_train2014.json'))
captionStrings = ['[START] ' + entry['caption'] for entry in mscoco['annotations']]
print('Number of sentences', len(captionStrings))
print('First sentence in the list', captionStrings[0])
vocabularySize = 1000 # vocabulary size.
# Split sentences into words, and define a vocabulary with the most common words.
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = vocabularySize, \
filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(captionStrings)
# Convert the sentences into sequences of word ids using our vocabulary.
captionSequences = tokenizer.texts_to_sequences(captionStrings)
# Keep dictionaries that map ids -> words, and words -> ids.
word2id = tokenizer.word_index
id2word = {idx: word for (word, idx) in word2id.items()}
maxSequenceLength = max([len(seq) for seq in captionSequences]) # Find the sentence with most words.
# Print some output to verify the above.
print('Original string', captionStrings[0])
print('Sequence of Word Ids', captionSequences[0])
print('Word Ids back to Words', string.join([id2word[idx] for idx in captionSequences[0]], " "))  # string.join() is Python 2 only -- this is the line that fails
print('Max Sequence Length', maxSequenceLength)
The cause is the last print: the string module's join() was removed in Python 3, so it has to be written with the str method " ".join(...) instead. With that change the cell runs:
vocabularySize = 1000 # vocabulary size.
# Split sentences into words, and define a vocabulary with the most common words.
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = vocabularySize, \
filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(captionStrings)
# Convert the sentences into sequences of word ids using our vocabulary.
captionSequences = tokenizer.texts_to_sequences(captionStrings)
# Keep dictionaries that map ids -> words, and words -> ids.
word2id = tokenizer.word_index
id2word = {idx: word for (word, idx) in word2id.items()}
maxSequenceLength = max([len(seq) for seq in captionSequences]) # Find the sentence with most words.
# Print some output to verify the above.
print('Original string', captionStrings[0])
print('Sequence of Word Ids', captionSequences[0])
print('Word Ids back to Words', " ".join([id2word[idx] for idx in captionSequences[0]]))
print('Max Sequence Length', maxSequenceLength)
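For reference, if you are running Keras 2 or tf.keras instead of the Keras 1.x API the tutorial targets, roughly the same preprocessing can be written as the sketch below (untested here; nb_words was renamed to num_words in Keras 2, and the annotations path is the same assumption as above):
import json
from tensorflow.keras.preprocessing.text import Tokenizer

mscoco = json.load(open('annotations/captions_train2014.json'))
captionStrings = ['[START] ' + entry['caption'] for entry in mscoco['annotations']]

# Keras 2 / tf.keras: the argument is num_words, not nb_words.
tokenizer = Tokenizer(num_words = 1000,
                      filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(captionStrings)
captionSequences = tokenizer.texts_to_sequences(captionStrings)

id2word = {idx: word for (word, idx) in tokenizer.word_index.items()}
print('Word Ids back to Words', ' '.join(id2word[idx] for idx in captionSequences[0]))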
Incidentally, if you don't put a space between the quotes and write "".join(...) instead of " ".join(...), the words run together and the output turns into gibberish, as in the listing below (a small toy comparison follows it).
vocabularySize = 1000 # vocabulary size.
# Split sentences into words, and define a vocabulary with the most common words.
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = vocabularySize, \
filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(captionStrings)
# Convert the sentences into sequences of word ids using our vocabulary.
captionSequences = tokenizer.texts_to_sequences(captionStrings)
# Keep dictionaries that map ids -> words, and words -> ids.
word2id = tokenizer.word_index
id2word = {idx: word for (word, idx) in word2id.items()}
maxSequenceLength = max([len(seq) for seq in captionSequences]) # Find the sentence with most words.
# Print some output to verify the above.
print('Original string', captionStrings[0])
print('Sequence of Word Ids', captionSequences[0])
print('Word Ids back to Words', "".join([id2word[idx] for idx in captionSequences[0]]))
print('Max Sequence Length', maxSequenceLength)
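The difference is easy to see with a toy example (hypothetical tokens, not the actual MS COCO output):
words = ['start', 'a', 'man', 'riding', 'a', 'bike']
print(' '.join(words))   # -> start a man riding a bike   (readable)
print(''.join(words))    # -> startamanridingabike        (runs together)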