diff --git a/pyproject.toml b/pyproject.toml
index 781e373..c006c60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "wortsalat"
-version = "0.0.1"
+version = "0.0.2"
 description = "An NLP python library for analyzing the german language."
 keywords = ["nlp", "german nlp"]
 readme = "README.md"
diff --git a/src/wortsalat/analyze_wortsalat.py b/src/wortsalat/analyze_wortsalat.py
index 09e5f67..a04ac1e 100644
--- a/src/wortsalat/analyze_wortsalat.py
+++ b/src/wortsalat/analyze_wortsalat.py
@@ -1,7 +1,7 @@
 from wortsalat.preprocess import tokenize_words, split_sentences
-from wortsalat.identify_tags import identify_tags
-from wortsalat.identify_words import identify_words
-from wortsalat.count import count_total_words, count_total_sentences, count_average_word_length, count_average_words_per_sentence, count_words_with_tag, count_identified_words
+from wortsalat.identify_tags import identify_tags, count_words_with_tag
+from wortsalat.identify_words import identify_words, count_identified_words
+from wortsalat.count import count_total_words, count_total_sentences, count_average_word_length, count_average_words_per_sentence
 from wortsalat.wrapper import calculate_flesch_score, calculate_wiener_sachtextformel
 from wortsalat.lix import calculate_lix
 
@@ -40,13 +40,13 @@ def analyze_wortsalat (text: str) -> dict:
     words = tokenize_words(text)
     sentences = split_sentences(text)
 
-    words_with_tag = identify_tags("ADJA", text, int = 0)
-    identified_words = identify_words(type, text)
+    words_with_tag = identify_tags("ADJA", text)
+    identified_words = identify_words("ich.txt", text)
 
-    num_total_words = count_total_words(words)
-    num_total_sentences = count_total_sentences(words)
-    length_average_word = count_average_word_length(words)
-    length_average_sentence = count_average_words_per_sentence(sentences)
+    num_total_words = count_total_words(text)
+    num_total_sentences = count_total_sentences(text)
+    length_average_word = count_average_word_length(text)
+    length_average_sentence = count_average_words_per_sentence(text)
 
     num_words_with_tag = count_words_with_tag(words_with_tag)
     num_identified_words = count_identified_words(identified_words)
@@ -54,15 +54,15 @@ def analyze_wortsalat (text: str) -> dict:
     wiener_sachtextformel = calculate_wiener_sachtextformel(text)
     lix = calculate_lix(text)
 
-    adjektive = identify_tags('ADJ', words, 0)
-    adverbien = identify_tags('ADV', words, 0)
-    artikel = identify_tags('ART', words, 0)
-    modalverben = identify_tags('VM', words, 0)
-    nomen = identify_tags('NN', words, 0)
-    praepositionen = identify_words('APPO', 'APPR', 'APPRART', 'APPZR', words, 0)
-    pronomen = identify_words('PPER', words, 0)
-    verben = identify_words("VA(FIN)", "VA(IMP)", "VA(INF)", "VM(FIN)", "VM(INF)", "VM(PP)," "VV(FIN)", "VV(IMP)", "VV(INF)", "VV(IZU)", "VV(PP)," words, 0)
-    emojis = identify_words(emojis, words)
+    adjektive = identify_tags('ADJ', text)
+    adverbien = identify_tags('ADV', text)
+    artikel = identify_tags('ART', text)
+    modalverben = identify_tags('VM', text)
+    nomen = identify_tags('NN', text)
+    praepositionen = identify_tags('APPO', text)
+    pronomen = identify_tags('PPER', text)
+    verben = identify_tags("VA", text)
+    emojis = identify_words("emojis.txt", text)
 
     num_adjektive = len(adjektive)
     num_adverbien = len(adverbien)
@@ -84,9 +84,9 @@ def analyze_wortsalat (text: str) -> dict:
     ratio_verben = len(verben)/ num_total_words
     ratio_emojis = len(emojis)/ num_total_words
 
-    ich = identify_words(ich, words)
-    wir = identify_words(wir, words)
-    ich_wir_verhältnis = ich/ wir
+    ich = identify_words("ich.txt", text)
+    wir = identify_words("wir.txt", text)
+    ich_wir_verhältnis = len(ich) / len(wir)
 
     analysis_small = {
         "total number of words": num_total_words,
@@ -144,8 +144,7 @@ def print_wortsalat_small(text: str) -> dict:
     Returns:
     - dict: A dictionary containing all metrics.
     """
-    text = input()
-    analysis_small = analyze_wortsalat(text)
+    analysis_small, analysis_big = analyze_wortsalat(text)
 
     for key, value in analysis_small.items():
         print(key, ":", value)
@@ -159,7 +158,6 @@ def print_wortsalat_big(text: str) -> dict:
     Returns:
     - dict: A dictionary containing all metrics.
     """
-    text = input()
-    analysis_big = analyze_wortsalat(text)
+    analysis_big, analysis_small = analyze_wortsalat(text)
     for key, value in analysis_big.items():
         print(key, ":", value)
\ No newline at end of file
diff --git a/src/wortsalat/count.py b/src/wortsalat/count.py
index 4782a42..41df058 100644
--- a/src/wortsalat/count.py
+++ b/src/wortsalat/count.py
@@ -62,6 +62,7 @@ def count_average_words_per_sentence(text: str) -> float:
     - float: The average number of words per sentence in the input text.
     """
     sentences = split_sentences(text)
+    print(sentences)
     total_words = sum(len(sentence.split()) for sentence in sentences)
     length_average_sentence = total_words / len(sentences)
     return length_average_sentence
\ No newline at end of file
diff --git a/src/wortsalat/identify_tags.py b/src/wortsalat/identify_tags.py
index 6084fb3..0c4e033 100644
--- a/src/wortsalat/identify_tags.py
+++ b/src/wortsalat/identify_tags.py
@@ -4,7 +4,7 @@
 
 tagger = ht.HanoverTagger('morphmodel_ger.pgz')
 
 
-def identify_tags(tag: str, text: str, taglevel: int = 1) -> Dict[str, List[str]]:
+def identify_tags(tag: str, text: str) -> Dict[str, List[str]]:
     """
     This function tags the words using the HanTa library, and then identifies the words that match the specified POS tag.
@@ -30,11 +30,14 @@ def identify_tags(tag: str, text: str, taglevel: int = 1) -> Dict[str, List[str]
     """
     - Dict[str, List[str]]: A dictionary where each key is a POS tag and each value is a list of words that were assigned that tag.
     """
     words = tokenize_words(text)
-    tagged_words = tagger.tag_sent(words, taglevel=taglevel)
+    tagged_words = tagger.tag_sent(words, taglevel=2)
 
     words_with_tag = list()
 
+    print(tagged_words)
+
     for word in tagged_words:
+        print(word)
         if word[2] == tag:
             words_with_tag.append(word)
diff --git a/src/wortsalat/identify_words.py b/src/wortsalat/identify_words.py
index 87ae438..5713121 100644
--- a/src/wortsalat/identify_words.py
+++ b/src/wortsalat/identify_words.py
@@ -1,7 +1,7 @@
 from importlib import resources as impresources
 from . import data
 
 
-def identify_words(type: str ,text: str) -> list[str]:
+def identify_words(data_type: str ,text: str) -> list[str]:
     """
     Identify words in a given text that match a specific word list.
@@ -12,7 +12,7 @@ def identify_words(type: str ,text: str) -> list[str]:
     Returns:
     - list: The words in the input text that match the specified word list.
     """
-    inp_file = (impresources.files(data) / type)
+    inp_file = (impresources.files(data) / str(data_type))
     with inp_file.open("rt") as f:
         dictionary = f.readlines()
         dictionary = [line.strip("\n") for line in dictionary]
diff --git a/src/wortsalat/preprocess.py b/src/wortsalat/preprocess.py
index 37a9d1f..139dccb 100644
--- a/src/wortsalat/preprocess.py
+++ b/src/wortsalat/preprocess.py
@@ -37,6 +37,6 @@ def split_sentences(text: str) -> tuple[list[str]]:
     tuple: A tuple containing:
     - list: A list of sentences extracted from the input text.
     """
 
-    sentences = sent_tokenize(text)
+    sentences = sent_tokenize(text, language="german")
     return sentences
\ No newline at end of file
diff --git a/tests/test_analyze_wortsalat.py b/tests/test_analyze_wortsalat.py
index beedb18..0fe81a5 100644
--- a/tests/test_analyze_wortsalat.py
+++ b/tests/test_analyze_wortsalat.py
@@ -1,5 +1,13 @@
-#from wortsalat import analyze_wortsalat
-#import pytest
-#
-#def test_analyze_wortsalat():
-#    print(analyze_wortsalat("kannst du damit aufhören?"))
\ No newline at end of file
+from wortsalat.analyze_wortsalat import analyze_wortsalat, print_wortsalat_small, print_wortsalat_big
+
+def test_analyze_words():
+    text = "Hallo leute wir sind heute auf einem Bauernhof. Alle Tiere sind in Ordnung nur eins ist doof. Das Rapphuhn. Das rappt nun."
+    analyze_wortsalat(text)
+
+def test_print_wortsalat_small():
+    text = "Hallo leute wir sind heute auf einem Bauernhof. Alle Tiere sind in Ordnung nur eins ist doof. Das Rapphuhn. Das rappt nun."
+    print_wortsalat_small(text)
+
+def test_print_wortsalat_big():
+    text = "Hallo leute wir sind heute auf einem Bauernhof. Alle Tiere sind in Ordnung nur eins ist doof. Das Rapphuhn. Das rappt nun."
+    print_wortsalat_big(text)
\ No newline at end of file