-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_similarity_checker.py
executable file
·69 lines (56 loc) · 3.25 KB
/
text_similarity_checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python
# TODO: It would be interesting to see how this TfidfVectorizer + cosine_similarity method compares with using difflib's SequenceMatcher + ratio methods:
# https://docs.python.org/3/library/difflib.html#sequencematcher-objects
# See also:
# https://docs.python.org/3/library/difflib.html#difflib.get_close_matches
# Return a list of the best “good enough” matches. word is a sequence for which close matches are desired (typically a string), and possibilities is a list of sequences against which to match word (typically a list of strings).
# TODO: This ChatGPT chat has some examples of how to calculate this sort of thing in JavaScript:
# https://chatgpt.com/c/7fef26fd-0531-4079-b508-43904ff3e089
# See also:
# https://github.com/NaturalNode/natural/
# https://naturalnode.github.io/natural/
# https://blog.logrocket.com/natural-language-processing-node-js/
# https://winkjs.org/
# https://winkjs.org/wink-nlp/bm25-vectorizer.html
# BM25 is a major improvement over the classical TF-IDF based algorithms. The weight for a specific term (i.e. token) is computed using the BM25 algorithm.
# https://github.com/winkjs/wink-nlp
# https://github.com/winkjs/wink-nlp-utils
# https://winkjs.org/wink-nlp-utils/
# https://github.com/winkjs/wink-distance
import argparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
def read_file(file_path):
    """Return the entire contents of *file_path* as text decoded from UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
def calculate_similarities(main_file, other_files):
    """Score how similar each file in *other_files* is to *main_file*.

    All documents are vectorized together with TF-IDF, and every other
    document is then compared with the main document by cosine similarity.

    Args:
        main_file: Path of the reference document.
        other_files: Iterable of paths to compare against the reference.

    Returns:
        A list of ``(path, similarity)`` tuples in the same order as
        *other_files*. When *other_files* is empty, an empty list is
        returned without reading any file.
    """
    # Materialize once so one-shot iterables work and emptiness can be tested.
    other_files = list(other_files)
    if not other_files:
        # Nothing to compare against; scikit-learn raises on a comparison
        # matrix with zero rows, so stop before vectorizing.
        return []

    documents = [read_file(main_file)] + [read_file(f) for f in other_files]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    # Row 0 is the main document; the remaining rows are the other files.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return list(zip(other_files, similarities.flatten()))
def main():
    """Command-line entry point: rank files by similarity to a main file.

    Parses ``main_file`` and one or more ``other_files`` from the command
    line, checks that every path is an existing file, and prints the other
    files ordered from most to least similar to the main file.

    Exits with status 1 and a message on stderr when a path is not an
    existing file, when no file is left to compare against, or when reading
    or vectorizing the documents fails.
    """
    parser = argparse.ArgumentParser(description="Calculate cosine similarity between a main file and a list of other files.")
    parser.add_argument("main_file", type=str, help="The main file to compare.")
    parser.add_argument("other_files", nargs='+', type=str, help="A list of other files to compare against the main file.")
    args = parser.parse_args()

    # Drop the main file if it was also listed among other_files. Paths are
    # resolved first so that e.g. "./a.txt" and "a.txt" are treated as equal.
    main_path = os.path.realpath(args.main_file)
    filtered_files = [f for f in args.other_files if os.path.realpath(f) != main_path]

    # Validate the main file first, then the others, stopping at the first
    # bad path. parser.exit writes the message to stderr and sets the status.
    for file_path in [args.main_file, *filtered_files]:
        if not os.path.isfile(file_path):
            parser.exit(1, f"Error: '{file_path}' does not exist or is not a file.\n")

    if not filtered_files:
        parser.exit(1, "Error: no other files left to compare against the main file.\n")

    try:
        results = calculate_similarities(args.main_file, filtered_files)
    except (OSError, ValueError) as exc:
        # OSError: unreadable file. ValueError: undecodable text
        # (UnicodeDecodeError) or a vectorizer failure.
        parser.exit(1, f"Error: could not compute similarities: {exc}\n")

    print(f"Comparing against: {args.main_file}")
    for other_file, similarity in sorted(results, key=lambda x: x[1], reverse=True):
        print(f"{other_file}: {similarity:.4f}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()