From a393119ee80b7dd070295b4f67c40161bcdf09e1 Mon Sep 17 00:00:00 2001 From: Eve Date: Sat, 23 Dec 2023 18:18:45 +1100 Subject: [PATCH 1/3] Add wmd_match and testing --- src/harmony/matching/wmd_matcher.py | 40 +++++++++++++++++++++ src/harmony_test.py | 56 +++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 src/harmony/matching/wmd_matcher.py create mode 100644 src/harmony_test.py diff --git a/src/harmony/matching/wmd_matcher.py b/src/harmony/matching/wmd_matcher.py new file mode 100644 index 0000000..10ce55a --- /dev/null +++ b/src/harmony/matching/wmd_matcher.py @@ -0,0 +1,40 @@ +from wmd import WMD +import numpy as np +import math + +def euclidean_dist(point1, point2): + if len(point1) != len(point2): + raise ValueError("Points must have the same number of dimensions") + + squared_distance = sum((p1 - p2) ** 2 for p1, p2 in zip(point1, point2)) + distance = math.sqrt(squared_distance) + return distance + +def par_to_vecs(par,vectorisation_function): + return [vectorisation_function(sent) for sent in par] + +def dist(vecs1,vecs2): + vec_union = list(vecs1 + vecs2) + n1,n2 = len(vecs1),len(vecs2) + n = len(vec_union) + dist_ = np.zeros((n,n)) + for i in range(n): + for j in range(i): + dist_[i,j] = dist_[j,i] = euclidean_dist(vec_union[i],vec_union[j]) + + nw1 = [1 for i in range(n1)]+[0 for i in range(n2)] + nw2 = [1 for i in range(n2)]+[0 for i in range(n1)] + return dist_,nw1,nw2 + + +def pars_dist_emd_emdrelaxed(par1,par2,vectorisation_function): + relax_cache = libwmdrelax.emd_relaxed_cache_init(1e5) + cache = libwmdrelax.emd_cache_init(1e5) + + vecs1,vecs2 = par_to_vecs(par1),par_to_vecs(par2) + dist_,nw1,nw2 = dist(vecs1,vecs2) + emd = libwmdrelax.emd(nw1,nw2,dist_,) + emd_relaxed = libwmrelax.emd_relaxed(nw1,nw2,dist_,relax_cache) + + + diff --git a/src/harmony_test.py b/src/harmony_test.py new file mode 100644 index 0000000..c9a2a08 --- /dev/null +++ b/src/harmony_test.py @@ -0,0 +1,56 @@ +import harmony +import numpy as np +#harmony.download_models() +from harmony import match_instruments +#from harmony.schemes.requests.text import Instrument +#from harmony import match_instruments +# +import json + +def import_(): + instruments = [] + with open("mhc_data/mhc_questions.json", "r", encoding="utf-8") as f: + for l in f: + instrument = json.loads(l) + instruments.append(instrument) + return instruments + +def texts_similarity_matrix_benchmark(text_vectors): + # Create numpy array of texts vectors + # Get similarity with polarity + vectors_pos,vectors_neg = harmony.matching.matcher.vectors_pos_neg(text_vectors) + if vectors_pos.any(): + pos_pairwise_similarity = harmony.matching.matcher_utils.cosine_similarity(vectors_pos, vectors_pos) + return pos_pairwise_similarity + +def test_similarity(): + questions = ["I was bothered by things that usually don’t bother me.","I did not feel like eating; my appetite was poor.","I felt that I could not shake off the blues even with help from my family or friends.","I felt I was just as good as other people."] + questions = ["lost my key", "found my car"] + vectorisation_function = harmony.matching.default_matcher.convert_texts_to_vector + text_vectors = harmony.matching.matcher.process_questions(questions) + print(text_vectors) + text_vectors = harmony.matching.matcher.vectorise_texts(text_vectors,vectorisation_function) + print(texts_similarity_matrix_benchmark(text_vectors)) +# pip install harmonydata + +def test_match_instruments_with_function(): + instruments = import_() + print(instruments[0]) + query = "Lost much sleep over worry?" + vectorisation_function = harmony.matching.default_matcher.convert_texts_to_vector + all_questions, similarity_with_polarity, query_similarity, new_vectors_dict=harmony.matching.matcher.match_instruments_with_function(instruments[1:10],query,vectorisation_function,[],[],np.zeros((0, 0)),{}) + print(all_questions) + print(similarity_with_polarity) +# print(query_similarity) +# print(new_vectors_dict) + np.savetxt("sim_with_polarity.txt", similarity_with_polarity, fmt='%d', delimiter='\t') + + + + +test_match_instruments_with_function() + + + + + From 7fbdaf5ea2e8c98babced486bfac2ca8775d32a9 Mon Sep 17 00:00:00 2001 From: Eve Date: Sun, 4 Feb 2024 14:44:18 +1100 Subject: [PATCH 2/3] -a --- src/harmony/matching/wmd_matcher.py | 20 ++++++++++++-------- src/harmony_test.py | 16 +++++++++++----- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/harmony/matching/wmd_matcher.py b/src/harmony/matching/wmd_matcher.py index 10ce55a..e0a7708 100644 --- a/src/harmony/matching/wmd_matcher.py +++ b/src/harmony/matching/wmd_matcher.py @@ -1,6 +1,7 @@ from wmd import WMD import numpy as np import math +import libwmdrelax def euclidean_dist(point1, point2): if len(point1) != len(point2): @@ -22,19 +23,22 @@ def dist(vecs1,vecs2): for j in range(i): dist_[i,j] = dist_[j,i] = euclidean_dist(vec_union[i],vec_union[j]) - nw1 = [1 for i in range(n1)]+[0 for i in range(n2)] - nw2 = [1 for i in range(n2)]+[0 for i in range(n1)] - return dist_,nw1,nw2 + nw1 = [1. for i in range(n1)]+[0. for i in range(n2)] + nw2 = [1. for i in range(n2)]+[0. for i in range(n1)] + return np.array(dist_,dtype=np.float32),np.array(nw1,dtype=np.float32),np.array(nw2,dtype=np.float32) def pars_dist_emd_emdrelaxed(par1,par2,vectorisation_function): - relax_cache = libwmdrelax.emd_relaxed_cache_init(1e5) - cache = libwmdrelax.emd_cache_init(1e5) + relax_cache = libwmdrelax.emd_relaxed_cache_init(int(100)) + cache = libwmdrelax.emd_cache_init(int(100)) - vecs1,vecs2 = par_to_vecs(par1),par_to_vecs(par2) + vecs1,vecs2 = par_to_vecs(par1,vectorisation_function),par_to_vecs(par2,vectorisation_function) dist_,nw1,nw2 = dist(vecs1,vecs2) - emd = libwmdrelax.emd(nw1,nw2,dist_,) - emd_relaxed = libwmrelax.emd_relaxed(nw1,nw2,dist_,relax_cache) + emd = libwmdrelax.emd(nw1,nw2,dist_,cache) + emd_relaxed = libwmdrelax.emd_relaxed(nw1,nw2,dist_,relax_cache) + return emd,emd_relaxed + + diff --git a/src/harmony_test.py b/src/harmony_test.py index c9a2a08..1d72acb 100644 --- a/src/harmony_test.py +++ b/src/harmony_test.py @@ -1,11 +1,9 @@ import harmony import numpy as np -#harmony.download_models() from harmony import match_instruments -#from harmony.schemes.requests.text import Instrument -#from harmony import match_instruments -# import json +import harmony.matching.wmd_matcher +from wmd import WMD def import_(): instruments = [] @@ -46,9 +44,17 @@ def test_match_instruments_with_function(): np.savetxt("sim_with_polarity.txt", similarity_with_polarity, fmt='%d', delimiter='\t') +def test_wwd(): + vectorisation_function = harmony.matching.default_matcher.convert_texts_to_vector + par1 = ["I want to go outside","oh outside is nice"] + par2 = ["Who wants to go outside","oh the dog wants to go outside"] + emd,emd_relaxed = harmony.matching.wmd_matcher.pars_dist_emd_emdrelaxed(par1,par2,vectorisation_function) + print(emd) + print(emd_relaxed) +test_wwd() + -test_match_instruments_with_function() From 1b023059b112db0c7741afa391d166222256a452 Mon Sep 17 00:00:00 2001 From: Eve Date: Sun, 4 Feb 2024 14:54:54 +1100 Subject: [PATCH 3/3] -a --- src/harmony/matching/wmd_matcher.py | 2 +- src/harmony_test.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/harmony/matching/wmd_matcher.py b/src/harmony/matching/wmd_matcher.py index e0a7708..4fa829f 100644 --- a/src/harmony/matching/wmd_matcher.py +++ b/src/harmony/matching/wmd_matcher.py @@ -24,7 +24,7 @@ def dist(vecs1,vecs2): dist_[i,j] = dist_[j,i] = euclidean_dist(vec_union[i],vec_union[j]) nw1 = [1. for i in range(n1)]+[0. for i in range(n2)] - nw2 = [1. for i in range(n2)]+[0. for i in range(n1)] + nw2 = [0. for i in range(n1)] +[1. for i in range(n2)] return np.array(dist_,dtype=np.float32),np.array(nw1,dtype=np.float32),np.array(nw2,dtype=np.float32) diff --git a/src/harmony_test.py b/src/harmony_test.py index 1d72acb..2e8085b 100644 --- a/src/harmony_test.py +++ b/src/harmony_test.py @@ -47,8 +47,12 @@ def test_match_instruments_with_function(): def test_wwd(): vectorisation_function = harmony.matching.default_matcher.convert_texts_to_vector par1 = ["I want to go outside","oh outside is nice"] - par2 = ["Who wants to go outside","oh the dog wants to go outside"] - emd,emd_relaxed = harmony.matching.wmd_matcher.pars_dist_emd_emdrelaxed(par1,par2,vectorisation_function) + par2 = ["I want to go outside maybe","oh outside is nice"] + par3 = ["You are a dog", "I love dogs"] + par4 = ["I am sad","are you sad"] + +# par2 = ["Who wants to go outside","oh the dog wants to go outside"] + emd,emd_relaxed = harmony.matching.wmd_matcher.pars_dist_emd_emdrelaxed(par4,par3,vectorisation_function) print(emd) print(emd_relaxed)