主页 > 互联网  > 

计算文本相似度,输出相似度最高的n个

计算文本相似度,输出相似度最高的n个
# TF-IDF
"""Rank example sentences by TF-IDF cosine similarity to a query sentence."""
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example corpus of sentences to search.
sentences = [
    "今天天气真好,阳光明媚。",
    "关键字匹配是一种常见的文本处理任务。",
    "计算机不认识人类语言,要转成词向量。",
    "富强、民主、文明、和谐、自由、平等、公正、法治、爱国、敬业、诚信、友善。",
    "中文分词工具对文本处理很有帮助。",
]

# The query sentence whose nearest neighbours we want.
query_sentence = "关键字匹配和文本处理任务"

# Hold the corpus in a DataFrame so results can be looked up by position.
data = {"Sentence": sentences}
df = pd.DataFrame(data)


def preprocess(text):
    """Segment *text* with jieba and return the tokens joined by spaces.

    TfidfVectorizer splits on whitespace/word boundaries, so Chinese text
    has to be segmented into space-separated words before vectorising.
    """
    return " ".join(jieba.lcut(text))


df["Preprocessed_Sentence"] = df["Sentence"].apply(preprocess)
# Keep the raw query untouched so it can be reported verbatim below.
preprocessed_query = preprocess(query_sentence)

# Fit on corpus + query together so both share one vocabulary and IDF weights.
# NOTE(review): the default token_pattern only keeps tokens of two or more
# characters, so single-character words produced by jieba are ignored; pass
# token_pattern=r"(?u)\b\w+\b" if those words should contribute.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(
    df["Preprocessed_Sentence"].tolist() + [preprocessed_query]
)

# Only the query-vs-corpus row is needed, so compare the last row against the
# corpus rows instead of building the full pairwise matrix.
corpus_size = len(df)
query_similarity = cosine_similarity(
    tfidf_matrix[corpus_size:], tfidf_matrix[:corpus_size]
)[0]

# Pick the n highest-scoring sentences (fewer if the corpus is smaller).
n = 10
top_indices = query_similarity.argsort()[-n:][::-1]

# argsort yields positions, so index positionally with iloc.
similar_sentences = df["Sentence"].iloc[top_indices].tolist()
similarity_scores = query_similarity[top_indices].tolist()
result_data = {
    "Similar_Sentence": similar_sentences,
    "Similarity_Score": similarity_scores,
}
result_df = pd.DataFrame(result_data)

print("查询句子:", query_sentence)
print("\n相似度最高的句子:")
# Bare expression: rendered as the cell output when run in a notebook.
result_df

# Spacy
"""Rank example sentences by spaCy document similarity to a target sentence."""
import warnings

# NOTE(review): this blanket filter also hides any warning spaCy may emit from
# Doc.similarity when the loaded model has no word vectors. Confirm the scores
# are meaningful for "zh_core_web_sm", or switch to a model with vectors.
warnings.filterwarnings("ignore")

import spacy
import pandas as pd

# Load the spaCy Chinese pipeline.
nlp = spacy.load("zh_core_web_sm")

# Example corpus of Chinese sentences.
data = {
    "Sentences": [
        "今天天气真好,阳光明媚。",
        "关键字匹配是一种常见的文本处理任务。",
        "计算机不认识人类语言,要转成词向量。",
        "富强、民主、文明、和谐、自由、平等、公正、法治、爱国、敬业、诚信、友善。",
        "中文分词工具对文本处理很有帮助。",
    ]
}
df = pd.DataFrame(data)

# The sentence every corpus entry is compared against.
target_sentence = "关键字匹配和文本处理任务"

# Parse the target once; it does not change between comparisons.
target_doc = nlp(target_sentence)

# Score every corpus sentence against the target.
similarity_scores = [
    target_doc.similarity(nlp(sentence)) for sentence in df["Sentences"]
]

# Attach the scores to the DataFrame.
df["Similarity Score"] = similarity_scores

# Sort by score, descending, and keep the n most similar sentences.
n = 10
top_n_similar_sentences = df.sort_values(by="Similarity Score", ascending=False).head(n)

# Bare expression: rendered as the cell output when run in a notebook.
top_n_similar_sentences

# Bert
"""Rank example sentences by cosine similarity of mean-pooled BERT embeddings."""
import warnings

warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the BERT model and tokenizer from a local directory.
model_name = "model/bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example corpus of Chinese sentences.
data = {
    "Sentences": [
        "今天天气真好,阳光明媚。",
        "关键字匹配是一种常见的文本处理任务。",
        "计算机不认识人类语言,要转成词向量。",
        "富强、民主、文明、和谐、自由、平等、公正、法治、爱国、敬业、诚信、友善。",
        "中文分词工具对文本处理很有帮助。",
    ]
}
df = pd.DataFrame(data)

# The sentence every corpus entry is compared against.
target_sentence = "关键字匹配和文本处理任务"


def _embed(text):
    """Return a (1, hidden_size) numpy embedding for *text*.

    The sentence is encoded on its own and the last hidden states are
    averaged over the token axis (mean pooling).
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        pooled = model(**encoded).last_hidden_state.mean(dim=1)
    return pooled.numpy()


# Embed the target once; it does not change between comparisons.
target_embedding = _embed(target_sentence)

# Cosine similarity between the target and every corpus sentence.
similarity_scores = [
    cosine_similarity(target_embedding, _embed(sentence))[0][0]
    for sentence in df["Sentences"]
]

# Attach the scores to the DataFrame.
df["Similarity Score"] = similarity_scores

# Sort by score, descending, and keep the n most similar sentences.
n = 10
top_n_similar_sentences = df.sort_values(by="Similarity Score", ascending=False).head(n)

# Bare expression: rendered as the cell output when run in a notebook.
top_n_similar_sentences

标签:

《计算文本相似度,输出相似度最高的n个》由讯客互联互联网栏目发布。感谢您对讯客互联的认可,以及对我们原创作品和文章的青睐。非常欢迎各位朋友分享到个人网站或者朋友圈,但转载请说明文章出处:“计算文本相似度,输出相似度最高的n个”。