Wu Yuxiong -- Neural Networks and Deep Learning in Practice with Python + Keras + TensorFlow: NLP Word Embedding (Word Vectorization)

import numpy as np
samples = ['The cat jumped over the dog', 'The dog ate my homework']

# First, assign every distinct word an integer index in a dictionary
token_index = {}
for sample in samples:
    # Split each sentence into individual words
    for word in sample.split():
        if word not in token_index:
            # Indices start at 1; index 0 is left unused
            token_index[word] = len(token_index) + 1

# Maximum number of words per sentence
max_length = 10
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.
        print("{0} -> {1}".format(word, results[i, j]))

Keras ships the same functionality in its Tokenizer class, which builds the word-to-index mapping for us:

from keras.preprocessing.text import Tokenizer

def oneHotEncode(samples):
    # Keep only the 1000 most frequently used words
    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(samples)
    # Convert each sentence into a list of integer word indices
    # (despite the function name, these are index sequences, not one-hot vectors)
    sequences = tokenizer.texts_to_sequences(samples)
    return sequences

samples = ['The cat jumped over the dog', 'The dog ate my homework']
vecs = oneHotEncode(samples)
print(vecs)
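
The integers in the returned sequences come from the Tokenizer's internal word index, a 1-based mapping ordered by word frequency. A minimal sketch to inspect that mapping; it rebuilds a Tokenizer here only because oneHotEncode does not return its own:

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
# word_index maps each (lowercased) word to its integer id, most frequent words first
print(tokenizer.word_index)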

The next example trains a small sentiment classifier, so we can watch the word vectors of an Embedding layer change during training.

from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
# Define a set of short texts; the first half are positive, the second half negative
docs = ['Well done',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent',
        'Weak',
        'Poor effort',
        'quite bad',
        'it is terrible',
        'like a shit']
# Positive texts are labeled 1, negative texts are labeled 0
labels = array([1,1,1,1,1,0,0,0,0,0])
# Assume a vocabulary of 50 words
vocab_size = 50

encoded_docs = oneHotEncode(docs)
print(encoded_docs)
# Fix every text at 4 words; shorter texts are padded with 0
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
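
As a concrete illustration (the exact ids depend on the Tokenizer's frequency-based ordering), a two-word text encodes and pads like this:

# 'Well done'  -> [3, 4]        after texts_to_sequences
# post-padding -> [3, 4, 0, 0]  index 0 is reserved as the padding id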

model = Sequential()
'''
The Embedding layer is essentially a matrix with vocab_size rows and 8 columns,
where each row is the vector for one word. Since each text is fixed at 4 words
and each word maps to an 8-element vector, feeding a 4-element text vector into
the Embedding layer yields a 4x8 matrix: 4 for the number of words in the input
text, 8 for the dimensionality of each word vector.
'''
embedding_layer = Embedding(vocab_size, 8, input_length=max_length)
model.add(embedding_layer)

# Row 0 of the embedding matrix corresponds to the padding index, not a specific word
print("embedding matrix row 0 before training:")
print(embedding_layer.get_weights()[0][0])

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

model.summary()
# Train the network
model.fit(padded_docs, labels, epochs=50, verbose=0)

print("embedding matrix row 0 after training:")
print(embedding_layer.get_weights()[0][0])
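
To read the trained vector of a specific word rather than row 0, we need the word-to-index mapping. A minimal sketch, assuming that refitting a Tokenizer the same way oneHotEncode does reproduces the same indices (the helper does not return its tokenizer):

lookup = Tokenizer(num_words=1000)
lookup.fit_on_texts(docs)
well_id = lookup.word_index['well']   # Tokenizer lowercases words by default
print("trained vector for 'well':")
print(embedding_layer.get_weights()[0][well_id])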

Finally, we project the trained word vectors into two dimensions with t-SNE to see whether words of the same sentiment cluster together.

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Build the mapping between words and their integer indices
word2Num = {}
for idx, doc in enumerate(docs):
    words = doc.split()
    for i, word in enumerate(words):
        print("{0} => {1}".format(word, encoded_docs[idx][i]))
        word2Num[word] = encoded_docs[idx][i]

embeddings = embedding_layer.get_weights()[0]
# Build the mapping between words and their embedding vectors
vectors = []
words = []
for word, num in word2Num.items():
    print("{0} => {1}".format(word, embeddings[num]))
    words.append(word)
    vectors.append(embeddings[num])
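
One way to check whether training separated the two sentiment classes is to compare word vectors directly, for instance with cosine similarity. A minimal sketch using the word2Num map built above; the expectation (not a guarantee on such a tiny corpus) is that opposite-class pairs score lower than same-class pairs:

import numpy as np

def cosine(u, v):
    # cosine similarity between two embedding vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

print("cos(Good, Great) =", cosine(embeddings[word2Num['Good']], embeddings[word2Num['Great']]))
print("cos(Good, Poor)  =", cosine(embeddings[word2Num['Good']], embeddings[word2Num['Poor']]))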
    

# perplexity must be smaller than the number of points; we only have ~18 words here
tsne_model = TSNE(perplexity=5, n_components=2, init='pca', n_iter=2500, random_state=23)
new_values = tsne_model.fit_transform(np.array(vectors))

x = []
y = []
for value in new_values:
    x.append(value[0])
    y.append(value[1])

plt.figure(figsize=(16, 16))
for i in range(len(x)):
    plt.scatter(x[i], y[i])
    plt.annotate(words[i], xy=(x[i], y[i]), xytext=(5,2), textcoords='offset points',
                ha='right',va='bottom')
plt.show()
