While working through the tutorial on this site, the following code spat out an error.
import tensorflow as tf
import numpy as np
import random, json, string
import keras
import keras.layers
import keras.models
import keras.optimizers
from keras.layers.wrappers import TimeDistributed
import keras.layers.embeddings
import keras.preprocessing.text
import keras.preprocessing.sequence
import keras.callbacks
mscoco = json.load(open('annotations/captions_train2014.json'))
captionStrings = ['[START] ' + entry['caption'] for entry in mscoco['annotations']]
print('Number of sentences', len(captionStrings))
print('First sentence in the list', captionStrings[0])
vocabularySize = 1000 # vocabulary size.
# Split sentences into words, and define a vocabulary with the most common words.
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = vocabularySize, \
filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(captionStrings)
# Convert the sentences into sequences of word ids using our vocabulary.
captionSequences = tokenizer.texts_to_sequences(captionStrings)
# Keep dictionaries that map ids -> words, and words -> ids.
word2id = tokenizer.word_index
id2word = {idx: word for (word, idx) in word2id.items()}
maxSequenceLength = max([len(seq) for seq in captionSequences]) # Find the sentence with most words.
# Print some output to verify the above.
print('Original string', captionStrings[0])
print('Sequence of Word Ids', captionSequences[0])
print('Word Ids back to Words', string.join([id2word[idx] for idx in captionSequences[0]], " "))  # string.join() is Python 2 only -- this is the line that fails
print('Max Sequence Length', maxSequenceLength)
The cause is the last print: the string module's join() was removed in Python 3, so it has to be written with the str method " ".join(...) instead. With that change the cell runs:
vocabularySize = 1000 # vocabulary size.
# Split sentences into words, and define a vocabulary with the most common words.
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = vocabularySize, \
filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(captionStrings)
# Convert the sentences into sequences of word ids using our vocabulary.
captionSequences = tokenizer.texts_to_sequences(captionStrings)
# Keep dictionaries that map ids -> words, and words -> ids.
word2id = tokenizer.word_index
id2word = {idx: word for (word, idx) in word2id.items()}
maxSequenceLength = max([len(seq) for seq in captionSequences]) # Find the sentence with most words.
# Print some output to verify the above.
print('Original string', captionStrings[0])
print('Sequence of Word Ids', captionSequences[0])
print('Word Ids back to Words', " ".join([id2word[idx] for idx in captionSequences[0]]))
print('Max Sequence Length', maxSequenceLength)
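For reference, if you are running Keras 2 or tf.keras instead of the Keras 1.x API the tutorial targets, roughly the same preprocessing can be written as the sketch below (untested here; nb_words was renamed to num_words in Keras 2, and the annotations path is the same assumption as above):
import json
from tensorflow.keras.preprocessing.text import Tokenizer

mscoco = json.load(open('annotations/captions_train2014.json'))
captionStrings = ['[START] ' + entry['caption'] for entry in mscoco['annotations']]

# Keras 2 / tf.keras: the argument is num_words, not nb_words.
tokenizer = Tokenizer(num_words = 1000,
                      filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(captionStrings)
captionSequences = tokenizer.texts_to_sequences(captionStrings)

id2word = {idx: word for (word, idx) in tokenizer.word_index.items()}
print('Word Ids back to Words', ' '.join(id2word[idx] for idx in captionSequences[0]))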
Incidentally, if you don't put a space between the quotes and write "".join(...) instead of " ".join(...), the words run together and the output turns into gibberish, as in the listing below (a small toy comparison follows it).
vocabularySize = 1000 # vocabulary size.
# Split sentences into words, and define a vocabulary with the most common words.
tokenizer = keras.preprocessing.text.Tokenizer(nb_words = vocabularySize, \
filters = '!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
tokenizer.fit_on_texts(captionStrings)
# Convert the sentences into sequences of word ids using our vocabulary.
captionSequences = tokenizer.texts_to_sequences(captionStrings)
# Keep dictionaries that map ids -> words, and words -> ids.
word2id = tokenizer.word_index
id2word = {idx: word for (word, idx) in word2id.items()}
maxSequenceLength = max([len(seq) for seq in captionSequences]) # Find the sentence with most words.
# Print some output to verify the above.
print('Original string', captionStrings[0])
print('Sequence of Word Ids', captionSequences[0])
print('Word Ids back to Words', "".join([id2word[idx] for idx in captionSequences[0]]))
print('Max Sequence Length', maxSequenceLength)
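The difference is easy to see with a toy example (hypothetical tokens, not the actual MS COCO output):
words = ['start', 'a', 'man', 'riding', 'a', 'bike']
print(' '.join(words))   # -> start a man riding a bike   (readable)
print(''.join(words))    # -> startamanridingabike        (runs together)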