This article uses the 20 Newsgroups dataset as the corpus. A user types a keyword query and gets back the news articles most relevant to it, which amounts to a simple search engine over text:
1. Load the dataset
from sklearn.datasets import fetch_20newsgroups

newsgroups = fetch_20newsgroups()
print(f'Number of documents: {len(newsgroups.data)}')
print(f'Sample document:\n{newsgroups.data[0]}')
2. Vectorize the words
import math

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def show_vocabulary(vectorizer):
    words = vectorizer.get_feature_names_out()
    print(f'Vocabulary size: {len(words)} words')
    # print roughly 10 words per line
    for l in np.array_split(words, math.ceil(len(words) / 10)):
        print(''.join([f'{x:<15}' for x in l]))

count = CountVectorizer()
count.fit(newsgroups.data)
show_vocabulary(count)
print(f'Size of vocabulary: {len(count.get_feature_names_out())}')
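To see what the bag-of-words representation produced by CountVectorizer actually looks like, here is a minimal sketch on a toy corpus (the three sentences below are made up for illustration and are not part of 20 Newsgroups). Each row is a document, each column a vocabulary word, and each cell the count of that word in that document:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus, purely for illustration
toy_docs = [
    'the cat sat on the mat',
    'the dog chased the cat',
    'dogs and cats are pets',
]

toy_count = CountVectorizer()
toy_bow = toy_count.fit_transform(toy_docs)   # sparse matrix of word counts

# One row per document, one column per vocabulary word
print(pd.DataFrame(toy_bow.toarray(), columns=toy_count.get_feature_names_out()))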
3. Search engine
from sklearn.metrics.pairwise import cosine_similarity

# Transform the corpus into bag-of-words (BoW) vectors
corpus_bow = count.transform(newsgroups.data)

# Read a query from the user and transform it into a BoW vector as well
query = input("Type your query: ")
query_bow = count.transform([query])

# Compare the query against every document in the corpus
similarity_matrix = cosine_similarity(corpus_bow, query_bow)
print(f'Similarity Matrix Shape: {similarity_matrix.shape}')
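For reference, cosine_similarity scores each document by the cosine of the angle between its count vector and the query's count vector: the dot product of the two vectors divided by the product of their norms. A small sketch (document 0 is chosen arbitrarily here) recomputes one score by hand and compares it with the value from sklearn:

import numpy as np

# Recompute the cosine similarity for one document manually
doc_vec = corpus_bow[0].toarray().ravel()
q_vec = query_bow.toarray().ravel()

denom = np.linalg.norm(doc_vec) * np.linalg.norm(q_vec)
manual_score = doc_vec @ q_vec / denom if denom else 0.0

print(f'Manual cosine similarity:  {manual_score:.6f}')
print(f'sklearn cosine similarity: {similarity_matrix[0, 0]:.6f}')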
import pandas as pd

# Rank the documents: top_10 holds the 10 highest similarity scores and their indices
similarities = pd.Series(similarity_matrix[:, 0])
top_10 = similarities.nlargest(10)
print(top_10)

print('Best document:')
print(newsgroups.data[top_10.index[0]])
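Putting the steps together, one convenient way (a sketch, not part of the original article) is to wrap the pipeline in a small helper so queries can be issued repeatedly; the function name search and the parameter top_k are hypothetical:

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def search(query, top_k=10):
    """Return the top_k documents most similar to the query (hypothetical helper)."""
    query_bow = count.transform([query])                     # reuse the fitted CountVectorizer
    scores = cosine_similarity(corpus_bow, query_bow)[:, 0]  # one score per document
    top = pd.Series(scores).nlargest(top_k)                  # indices into newsgroups.data
    return [(idx, score, newsgroups.data[idx]) for idx, score in top.items()]

# Example usage
for idx, score, doc in search('space shuttle launch', top_k=3):
    print(f'doc #{idx}  score={score:.3f}')
    print(doc[:200], '...\n')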