また別のタイプのトレーニングシーケンスは、こんな感じです。
a, 10111000 a, 10111000 aa, 1011100010111000 (many lines deleted) zythia, 111011101010001110101110111000111000101010100010100010111000 zythum, 111011101010001110101110111000111000101010100010101110001110111000 zyzomys, 11101110101000111010111011100011101110101000111011101110001110111000111010111011100010101000 zyzzogeton, 11101110101000111010111011100011101110101000111011101010001110111011100011101110100010001110001110111011100011101000
“000”は、文字間のスペースを表しています。
11101011101000111011101110001110101110100010111000 coca coca 1110101110100010111000101010001000 case case 111010111010001011100010111010001110101000 card card 101010001011101110001010001110111000 swim swim 11101110100010111000101011101000101011101000 gaff gaff 11101011101000101110001010111010001010101000 cafh caln 1010101000101000111010001110101000 hind hind
さらに、別のタイプです。
a, 1 111 aa, 1 111 1 111 aal, 1 111 1 111 1 111 1 1 aalii, 1 111 1 111 1 111 1 1 1 1 1 1 (many lines deleted) zythia, 111 111 1 1 111 1 111 111 111 1 1 1 1 1 1 1 111 zythum, 111 111 1 1 111 1 111 111 111 1 1 1 1 1 1 111 111 111 zyzomys, 111 111 1 1 111 1 111 111 111 111 1 1 111 111 111 111 111 111 1 111 111 1 1 1 zyzzogeton, 111 111 1 1 111 1 111 111 111 111 1 1 111 111 1 1 111 111 111 111 111 1 1 111 111 111 111 111 1
この方が、あなたには読みやすいですか。
1 1 111 1 1 1 111 1 1 111 111 111 fido fido 1 111 111 1 1 1 111 1 1 adai adai 1 111 1 1 111 111 1 1 1 111 rada rada 1 111 1 111 1 1 1 111 111 alem alem 1 111 111 1 1 1 111 1 111 1 1 pice pice 1 111 111 1 111 111 1 111 1 111 111 eggy egcy 1 111 111 1 1 111 1 111 1 1 1 pale pale
以下の例では、4文字未満の単語も含まれています。
1 111 1 1 ai ai 111 1 111 111 111 111 111 1 1 111 you you 111 1 1 111 111 1 1 111 tutu tutu 1 1 111 1 111 111 111 111 1 1 111 111 1 111 111 foxy foxy 1 1 1 1 111 111 111 111 1 1 hoti hott 111 1 111 1 1 1 111 111 1 1 111 cepa cepa 111 111 1 1 1 111 111 gut gut
"""Seq2seq Morse decoder demo: an encoder/decoder LSTM that maps fixed-width
Morse-like bit strings (e.g. "1110101110...") back to 4-letter words."""

import numpy as np


class CharTable(object):
    """Bidirectional mapping between characters and one-hot vector indices."""

    def __init__(self, chars):
        # Deduplicate and sort so index assignment is deterministic.
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

    def encode(self, token, num_rows):
        """One-hot encode `token` into a (num_rows, vocab_size) matrix.

        Rows beyond len(token) stay all-zero (padding). Raises KeyError if
        `token` contains a character not in the table.
        """
        x = np.zeros((num_rows, len(self.chars)))
        for i, c in enumerate(token):
            # BUG FIX: original indexed with the whole dict
            # (x[i, self.char_indices] = 1); look up the column for `c`.
            x[i, self.char_indices[c]] = 1
        return x

    def decode(self, x, calc_argmax=True):
        """Invert `encode`: map a one-hot row (or a whole matrix) to text."""
        if calc_argmax:
            x = x.argmax(axis=-1)
        # atleast_1d lets a single row (scalar argmax) and a full matrix
        # (1-D argmax vector) share the same join loop.
        return ''.join(self.indices_char[int(v)] for v in np.atleast_1d(x))


def main():
    """Load word/Morse pairs, train the LSTM decoder, print predictions,
    and plot accuracy/loss curves."""
    # Heavy ML/plotting imports are local so this module (and CharTable)
    # can be imported without TensorFlow or matplotlib installed.
    from keras.models import Sequential
    from keras import layers
    import matplotlib.pyplot as plt

    word_len = 4
    # Widest letter is 15 bits; "00" inter-letter gap between word_len letters.
    max_len_x = 15 * word_len + 2 * (word_len - 1)
    max_len_y = word_len

    input_list = []
    output_list = []
    fin = 'words_morse1only.txt'
    with open(fin, 'r') as file:
        for line in file.read().splitlines():
            word, morse = line.split(", ")
            # Right-pad the Morse string with spaces to a fixed width.
            morse = morse + ' ' * (max_len_x - len(morse))
            # Keep only words that fit the fixed output length.
            if len(word) <= word_len:
                word = word + ' ' * (word_len - len(word))
                input_list.append(morse)
                output_list.append(word)
    print("input_list = ", input_list[:5])
    print("output_list = ", output_list[:5])

    # chars_in = '10 '
    chars_in = '1 '
    chars_out = 'abcdefghijklmnopqrstuvwxyz '
    ctable_in = CharTable(chars_in)
    ctable_out = CharTable(chars_out)

    x = np.zeros((len(input_list), max_len_x, len(chars_in)))
    y = np.zeros((len(output_list), max_len_y, len(chars_out)))
    for i, token in enumerate(input_list):
        x[i] = ctable_in.encode(token, max_len_x)
    for i, token in enumerate(output_list):
        y[i] = ctable_out.encode(token, max_len_y)

    # Shuffle, then hold out the last 100 samples for validation.
    indices = np.arange(len(y))
    np.random.shuffle(indices)
    x = x[indices]
    y = y[indices]
    m = len(x) - 100
    (x_train, x_val) = x[:m], x[m:]
    (y_train, y_val) = y[:m], y[m:]

    hidden_size = 64
    batch_size = 128
    nlayers = 1
    epochs = 600

    # Encoder LSTM -> repeated context vector -> decoder LSTM(s) ->
    # per-timestep softmax over the output alphabet.
    model = Sequential()
    model.add(layers.LSTM(hidden_size, input_shape=(max_len_x, len(chars_in))))
    model.add(layers.RepeatVector(word_len))
    for _ in range(nlayers):
        model.add(layers.LSTM(hidden_size, return_sequences=True))
    model.add(layers.TimeDistributed(
        layers.Dense(len(chars_out), activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                     verbose=2, validation_data=(x_val, y_val))

    # Sequential.predict_classes() was removed in TF 2.6; take the argmax
    # of the softmax output instead.
    predict = np.argmax(model.predict(x_val), axis=-1)
    for i in range(len(x_val)):
        print("".join([ctable_in.decode(code) for code in x_val[i]]),
              "".join([ctable_out.decode(code) for code in y_val[i]]),
              end=" ")
        for j in range(word_len):
            print(ctable_out.indices_char[predict[i][j]], end="")
        print()

    # Keras renamed the history key 'acc' -> 'accuracy'; support both.
    acc_key = 'accuracy' if 'accuracy' in hist.history else 'acc'
    plt.figure(figsize=(16, 5))
    plt.subplot(121)
    plt.plot(hist.history[acc_key])
    plt.plot(hist.history['val_' + acc_key])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.subplot(122)
    plt.plot(hist.history['loss'])
    plt.plot(hist.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()


if __name__ == '__main__':
    main()