Merge pull request #32 from EveWCheng/main

WWD
harmonydata · Apr 10, 2024 · b22aee0 · b22aee0
2 parents 554fac4 + 1b02305
commit b22aee0
Show file tree

Hide file tree

Showing 2 changed files with 110 additions and 0 deletions.
diff --git a/src/harmony/matching/wmd_matcher.py b/src/harmony/matching/wmd_matcher.py
@@ -0,0 +1,44 @@
+from wmd import WMD
+import numpy as np
+import math
+import libwmdrelax
+
+def euclidean_dist(point1, point2):
+    """Return the Euclidean (L2) distance between two points.
+
+    Both points are iterated coordinate by coordinate, so any sized
+    sequences of numbers work.
+
+    Raises:
+        ValueError: if the two points differ in number of dimensions.
+    """
+    if len(point1) != len(point2):
+        raise ValueError("Points must have the same number of dimensions")
+
+    # Accumulate the squared per-coordinate differences in order.
+    total = 0
+    for coord1, coord2 in zip(point1, point2):
+        total += (coord1 - coord2) ** 2
+    return math.sqrt(total)
+
+def par_to_vecs(par,vectorisation_function):
+    """Apply ``vectorisation_function`` to every sentence of ``par``.
+
+    Returns a list with one result per sentence, in the original order.
+    """
+    return list(map(vectorisation_function, par))
+
+def dist(vecs1,vecs2):
+    """Build the inputs of an earth mover's distance between two vector sets.
+
+    Args:
+        vecs1: sequence of vectors for the first paragraph.
+        vecs2: sequence of vectors for the second paragraph.
+
+    Returns:
+        A tuple ``(distances, weights1, weights2)`` of float32 numpy arrays.
+        ``distances`` is the symmetric ``(n1 + n2) x (n1 + n2)`` matrix of
+        pairwise Euclidean distances over ``vecs1`` followed by ``vecs2``.
+        ``weights1`` spreads a total mass of 1 uniformly over the first
+        ``n1`` positions and ``weights2`` over the last ``n2`` positions;
+        all other entries are 0. An empty side keeps all-zero weights.
+
+    Raises:
+        ValueError: propagated from ``euclidean_dist`` when two vectors
+            differ in number of dimensions.
+    """
+    # Concatenate explicitly: ``vecs1 + vecs2`` would add element-wise
+    # instead of concatenating if the caller passes numpy arrays.
+    all_vecs = list(vecs1) + list(vecs2)
+    n1, n2 = len(vecs1), len(vecs2)
+    total = len(all_vecs)
+
+    distances = np.zeros((total, total), dtype=np.float32)
+    for i in range(total):
+        for j in range(i):
+            # The matrix is symmetric, so fill both triangles at once.
+            distances[i, j] = distances[j, i] = euclidean_dist(all_vecs[i], all_vecs[j])
+
+    # Both distributions must carry the same total mass for the transport
+    # problem to be well-posed, so normalise each side to sum to 1.
+    weights1 = np.zeros(total, dtype=np.float32)
+    weights2 = np.zeros(total, dtype=np.float32)
+    if n1:
+        weights1[:n1] = 1.0 / n1
+    if n2:
+        weights2[n1:] = 1.0 / n2
+    return distances, weights1, weights2
+
+
+def pars_dist_emd_emdrelaxed(par1,par2,vectorisation_function):
+    """Compute the exact and relaxed earth mover's distance of two paragraphs.
+
+    Each paragraph is a sequence of sentences; every sentence is turned into
+    a vector with ``vectorisation_function`` and the distances are computed
+    with ``libwmdrelax`` over the pairwise Euclidean distance matrix.
+
+    Args:
+        par1: sentences of the first paragraph.
+        par2: sentences of the second paragraph.
+        vectorisation_function: callable mapping one sentence to a vector.
+
+    Returns:
+        A tuple ``(emd, emd_relaxed)`` as returned by ``libwmdrelax.emd``
+        and ``libwmdrelax.emd_relaxed``.
+    """
+    vecs1 = par_to_vecs(par1, vectorisation_function)
+    vecs2 = par_to_vecs(par2, vectorisation_function)
+    dist_, nw1, nw2 = dist(vecs1, vecs2)
+
+    # Size the caches to the problem instead of a fixed 100 so larger inputs
+    # fit; 100 is kept as the minimum to match the previous allocation.
+    # NOTE(review): assumes the cache size must be >= len(weights) - confirm.
+    cache_size = max(100, len(nw1))
+
+    # The caches are native allocations; release them even when a solver
+    # call raises.
+    relax_cache = libwmdrelax.emd_relaxed_cache_init(cache_size)
+    try:
+        cache = libwmdrelax.emd_cache_init(cache_size)
+        try:
+            emd = libwmdrelax.emd(nw1, nw2, dist_, cache)
+            emd_relaxed = libwmdrelax.emd_relaxed(nw1, nw2, dist_, relax_cache)
+        finally:
+            libwmdrelax.emd_cache_fini(cache)
+    finally:
+        libwmdrelax.emd_relaxed_cache_fini(relax_cache)
+    return emd, emd_relaxed
+
+
+
+
+
diff --git a/src/harmony_test.py b/src/harmony_test.py
@@ -0,0 +1,66 @@
+import harmony
+import numpy as np
+from harmony import match_instruments
+import json
+import harmony.matching.wmd_matcher
+from wmd import WMD
+
+def import_():
+    """Load the instruments stored one JSON document per line.
+
+    Reads ``mhc_data/mhc_questions.json`` (relative to the working
+    directory) and returns the decoded objects in file order.
+    """
+    with open("mhc_data/mhc_questions.json", "r", encoding="utf-8") as f:
+        return [json.loads(line) for line in f]
+
+def texts_similarity_matrix_benchmark(text_vectors):
+    """Return the pairwise cosine similarity of the positive text vectors.
+
+    The positive vectors are obtained from
+    ``harmony.matching.matcher.vectors_pos_neg``; the negative ones are
+    not used here.
+
+    Returns:
+        The matrix produced by ``cosine_similarity(vectors_pos, vectors_pos)``,
+        or an all-zero square matrix with one row per positive vector when
+        ``vectors_pos.any()`` is false (no vectors, or only zero vectors).
+    """
+    vectors_pos, _ = harmony.matching.matcher.vectors_pos_neg(text_vectors)
+    if not vectors_pos.any():
+        # Nothing meaningful to compare; return a defined result rather
+        # than falling through to an unassigned local.
+        # NOTE(review): assumes vectors_pos is at least 1-D - confirm.
+        size = len(vectors_pos)
+        return np.zeros((size, size))
+    return harmony.matching.matcher_utils.cosine_similarity(vectors_pos, vectors_pos)
+
+def test_similarity():
+    """Smoke test: print the similarity matrix of two short questions.
+
+    The questions are processed and vectorised with the default matcher,
+    then passed to ``texts_similarity_matrix_benchmark``. Results are only
+    printed, not asserted.
+    """
+    questions = ["lost my key", "found my car"]
+    vectorisation_function = harmony.matching.default_matcher.convert_texts_to_vector
+    processed_questions = harmony.matching.matcher.process_questions(questions)
+    print(processed_questions)
+    text_vectors = harmony.matching.matcher.vectorise_texts(processed_questions,vectorisation_function)
+    print(texts_similarity_matrix_benchmark(text_vectors))
+# Requires the harmonydata package: pip install harmonydata
+
+def test_match_instruments_with_function():
+    """Smoke test: match a query against a slice of the loaded instruments.
+
+    Loads the instruments with ``import_``, runs
+    ``match_instruments_with_function`` on instruments 1..9 with the default
+    vectoriser, prints the questions and the similarity matrix, and writes
+    the matrix to ``sim_with_polarity.txt`` (tab separated).
+    """
+    instruments = import_()
+    print(instruments[0])
+    query = "Lost much sleep over worry?"
+    vectorisation_function = harmony.matching.default_matcher.convert_texts_to_vector
+    # The query similarity and the vector cache are returned but unused here.
+    all_questions, similarity_with_polarity, _query_similarity, _new_vectors_dict = (
+        harmony.matching.matcher.match_instruments_with_function(
+            instruments[1:10], query, vectorisation_function, [], [], np.zeros((0, 0)), {}
+        )
+    )
+    print(all_questions)
+    print(similarity_with_polarity)
+    # Write with a float format: "%d" would truncate every score to an integer.
+    # NOTE(review): assumes the scores are fractional similarities - confirm.
+    np.savetxt("sim_with_polarity.txt", similarity_with_polarity, fmt='%.6f', delimiter='\t')
+
+
+def test_wwd():
+    """Smoke test: print the exact and relaxed EMD of two sample paragraphs.
+
+    Uses the default matcher's vectoriser; results are only printed.
+    """
+    vectorisation_function = harmony.matching.default_matcher.convert_texts_to_vector
+
+    # Sample paragraphs; only par4 and par3 are compared below.
+    par1 = ["I want to go outside","oh outside is nice"]
+    par2 = ["I want to go outside maybe","oh outside is nice"]
+    par3 = ["You are a dog", "I love dogs"]
+    par4 = ["I am sad","are you sad"]
+
+    distances = harmony.matching.wmd_matcher.pars_dist_emd_emdrelaxed(par4,par3,vectorisation_function)
+    exact_distance, relaxed_distance = distances
+    print(exact_distance)
+    print(relaxed_distance)
+
+# Run the WMD smoke test only when executed as a script, so that importing
+# this module (for example during test collection) does not trigger it.
+if __name__ == "__main__":
+    test_wwd()
+
+
+
+
+
+
+