test analyze_wortsalat

petra-viola · Dec 21, 2023 · 63a1069
1 parent 2a4da0f
commit 63a1069
Show file tree

Hide file tree

Showing 7 changed files with 46 additions and 36 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "wortsalat"
-version = "0.0.1"
+version = "0.0.2"
 description = "An NLP python library for analyzing the german language."
 keywords = ["nlp", "german nlp"]
 readme = "README.md"

diff --git a/src/wortsalat/analyze_wortsalat.py b/src/wortsalat/analyze_wortsalat.py
@@ -1,7 +1,7 @@
 from wortsalat.preprocess import tokenize_words, split_sentences
-from wortsalat.identify_tags import identify_tags
-from wortsalat.identify_words import identify_words
-from wortsalat.count import count_total_words, count_total_sentences, count_average_word_length, count_average_words_per_sentence, count_words_with_tag, count_identified_words
+from wortsalat.identify_tags import identify_tags, count_words_with_tag
+from wortsalat.identify_words import identify_words, count_identified_words
+from wortsalat.count import count_total_words, count_total_sentences, count_average_word_length, count_average_words_per_sentence
 from wortsalat.wrapper import calculate_flesch_score, calculate_wiener_sachtextformel
 from wortsalat.lix import calculate_lix
 
@@ -40,29 +40,29 @@ def analyze_wortsalat (text: str) -> dict:
     words = tokenize_words(text)
     sentences = split_sentences(text)
 
-    words_with_tag = identify_tags("ADJA", text, int = 0)
-    identified_words = identify_words(type, text)
+    words_with_tag = identify_tags("ADJA", text)
+    identified_words = identify_words("ich.txt", text)
 
-    num_total_words = count_total_words(words)
-    num_total_sentences = count_total_sentences(words)
-    length_average_word = count_average_word_length(words)
-    length_average_sentence = count_average_words_per_sentence(sentences)
+    num_total_words = count_total_words(text)
+    num_total_sentences = count_total_sentences(text)
+    length_average_word = count_average_word_length(text)
+    length_average_sentence = count_average_words_per_sentence(text)
     num_words_with_tag = count_words_with_tag(words_with_tag)
     num_identified_words = count_identified_words(identified_words)
 
     flesch_kincaid = calculate_flesch_score(text)
     wiener_sachtextformel = calculate_wiener_sachtextformel(text)
     lix = calculate_lix(text)
 
-    adjektive = identify_tags('ADJ', words, 0)
-    adverbien = identify_tags('ADV', words, 0)
-    artikel = identify_tags('ART', words, 0)
-    modalverben = identify_tags('VM', words, 0)
-    nomen = identify_tags('NN', words, 0)
-    praepositionen = identify_words('APPO', 'APPR', 'APPRART', 'APPZR', words, 0)
-    pronomen = identify_words('PPER', words, 0)
-    verben = identify_words("VA(FIN)", "VA(IMP)", "VA(INF)", "VM(FIN)", "VM(INF)", "VM(PP)," "VV(FIN)", "VV(IMP)", "VV(INF)", "VV(IZU)", "VV(PP)," words, 0)
-    emojis = identify_words(emojis, words)
+    adjektive = identify_tags('ADJ', text)
+    adverbien = identify_tags('ADV', text)
+    artikel = identify_tags('ART', text)
+    modalverben = identify_tags('VM', text)
+    nomen = identify_tags('NN', text)
+    praepositionen = identify_tags('APPO', text)
+    pronomen = identify_tags('PPER', text)
+    verben = identify_tags("VA", text)
+    emojis = identify_words("emojis.txt", text)
 
     num_adjektive = len(adjektive)
     num_adverbien = len(adverbien)
@@ -84,9 +84,9 @@ def analyze_wortsalat (text: str) -> dict:
     ratio_verben = len(verben)/ num_total_words
     ratio_emojis = len(emojis)/ num_total_words
 
-    ich = identify_words(ich, words)
-    wir = identify_words(wir, words)
-    ich_wir_verhältnis = ich/ wir
+    ich = identify_words("ich.txt", text)
+    wir = identify_words("wir.txt", text)
+    ich_wir_verhältnis = len(ich) / len(wir)
 
     analysis_small = {
         "total number of words": num_total_words,
@@ -144,8 +144,7 @@ def print_wortsalat_small(text: str) -> dict:
     Returns:
     - dict: A dictionary containing all metrics.
     """
-    text = input()
-    analysis_small = analyze_wortsalat(text)
+    analysis_small, analysis_big = analyze_wortsalat(text)
     for key, value in analysis_small.items():
         print(key, ":", value)
 
@@ -159,7 +158,6 @@ def print_wortsalat_big(text: str) -> dict:
     Returns:
     - dict: A dictionary containing all metrics.
     """
-    text = input()
-    analysis_big = analyze_wortsalat(text)
+    analysis_big, analysis_small = analyze_wortsalat(text)
     for key, value in analysis_big.items():
         print(key, ":", value)
diff --git a/src/wortsalat/count.py b/src/wortsalat/count.py
@@ -62,6 +62,7 @@ def count_average_words_per_sentence(text: str) -> float:
     - float: The average number of words per sentence in the input text.
     """
     sentences = split_sentences(text)
+    print(sentences)
     total_words = sum(len(sentence.split()) for sentence in sentences)
     length_average_sentence = total_words / len(sentences)
     return length_average_sentence
diff --git a/src/wortsalat/identify_tags.py b/src/wortsalat/identify_tags.py
@@ -4,7 +4,7 @@
 
 tagger = ht.HanoverTagger('morphmodel_ger.pgz')
 
-def identify_tags(tag: str, text: str, taglevel: int = 1) -> Dict[str, List[str]]:
+def identify_tags(tag: str, text: str) -> Dict[str, List[str]]:
     """
     This function tags the words using the HanTa library, and then identifies the words that match the specified POS tag.
 
@@ -30,11 +30,14 @@ def identify_tags(tag: str, text: str, taglevel: int = 1) -> Dict[str, List[str]
     - Dict[str, List[str]]: A dictionary where each key is a POS tag and each value is a list of words that were assigned that tag.
     """
     words = tokenize_words(text)
-    tagged_words = tagger.tag_sent(words, taglevel=taglevel)
+    tagged_words = tagger.tag_sent(words, taglevel=2)
 
     words_with_tag = list()
 
+    print(tagged_words)
+
     for word in tagged_words:
+        print(word)
         if word[2] == tag:
             words_with_tag.append(word)
 

diff --git a/src/wortsalat/identify_words.py b/src/wortsalat/identify_words.py
@@ -1,7 +1,7 @@
 from importlib import resources as impresources
 from . import data
 
-def identify_words(type: str ,text: str) -> list[str]:
+def identify_words(data_type: str ,text: str) -> list[str]:
     """
     Identify words in a given text that match a specific word list.
 
@@ -12,7 +12,7 @@ def identify_words(type: str ,text: str) -> list[str]:
     Returns:
     - list: The words in the input text that match the specified word list.
     """
-    inp_file = (impresources.files(data) / type)
+    inp_file = (impresources.files(data) / str(data_type))
     with inp_file.open("rt") as f:
         dictionary = f.readlines()
         dictionary = [line.strip("\n") for line in dictionary]

diff --git a/src/wortsalat/preprocess.py b/src/wortsalat/preprocess.py
@@ -37,6 +37,6 @@ def split_sentences(text: str) -> tuple[list[str]]:
     tuple: A tuple containing:
     - list: A list of sentences extracted from the input text.
     """
-    sentences = sent_tokenize(text)
+    sentences = sent_tokenize(text, language="german")
 
     return sentences
diff --git a/tests/test_analyze_wortsalat.py b/tests/test_analyze_wortsalat.py
@@ -1,5 +1,13 @@
-#from wortsalat import analyze_wortsalat
-#import pytest
-#
-#def test_analyze_wortsalat():
-#    print(analyze_wortsalat("kannst du damit aufhören?"))
+from wortsalat.analyze_wortsalat import analyze_wortsalat, print_wortsalat_small, print_wortsalat_big 
+
+def test_analyze_words():
+    text = "Hallo leute wir sind heute auf einem Bauernhof. Alle Tiere sind in Ordnung nur eins ist doof. Das Rapphuhn. Das rappt nun."
+    analyze_wortsalat(text)
+
+def test_print_wortsalat_small():
+    text = "Hallo leute wir sind heute auf einem Bauernhof. Alle Tiere sind in Ordnung nur eins ist doof. Das Rapphuhn. Das rappt nun."
+    print_wortsalat_small(text)
+
+def test_print_wortsalat_big():
+    text = "Hallo leute wir sind heute auf einem Bauernhof. Alle Tiere sind in Ordnung nur eins ist doof. Das Rapphuhn. Das rappt nun."
+    print_wortsalat_big(text)