Skip to content

Commit

Permalink
test analyze_wortsalat
Browse files Browse the repository at this point in the history
  • Loading branch information
petra-viola committed Dec 21, 2023
1 parent 2a4da0f commit 63a1069
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 36 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "wortsalat"
version = "0.0.1"
version = "0.0.2"
description = "An NLP python library for analyzing the german language."
keywords = ["nlp", "german nlp"]
readme = "README.md"
Expand Down
48 changes: 23 additions & 25 deletions src/wortsalat/analyze_wortsalat.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from wortsalat.preprocess import tokenize_words, split_sentences
from wortsalat.identify_tags import identify_tags
from wortsalat.identify_words import identify_words
from wortsalat.count import count_total_words, count_total_sentences, count_average_word_length, count_average_words_per_sentence, count_words_with_tag, count_identified_words
from wortsalat.identify_tags import identify_tags, count_words_with_tag
from wortsalat.identify_words import identify_words, count_identified_words
from wortsalat.count import count_total_words, count_total_sentences, count_average_word_length, count_average_words_per_sentence
from wortsalat.wrapper import calculate_flesch_score, calculate_wiener_sachtextformel
from wortsalat.lix import calculate_lix

Expand Down Expand Up @@ -40,29 +40,29 @@ def analyze_wortsalat (text: str) -> dict:
words = tokenize_words(text)
sentences = split_sentences(text)

words_with_tag = identify_tags("ADJA", text, int = 0)
identified_words = identify_words(type, text)
words_with_tag = identify_tags("ADJA", text)
identified_words = identify_words("ich.txt", text)

num_total_words = count_total_words(words)
num_total_sentences = count_total_sentences(words)
length_average_word = count_average_word_length(words)
length_average_sentence = count_average_words_per_sentence(sentences)
num_total_words = count_total_words(text)
num_total_sentences = count_total_sentences(text)
length_average_word = count_average_word_length(text)
length_average_sentence = count_average_words_per_sentence(text)
num_words_with_tag = count_words_with_tag(words_with_tag)
num_identified_words = count_identified_words(identified_words)

flesch_kincaid = calculate_flesch_score(text)
wiener_sachtextformel = calculate_wiener_sachtextformel(text)
lix = calculate_lix(text)

adjektive = identify_tags('ADJ', words, 0)
adverbien = identify_tags('ADV', words, 0)
artikel = identify_tags('ART', words, 0)
modalverben = identify_tags('VM', words, 0)
nomen = identify_tags('NN', words, 0)
praepositionen = identify_words('APPO', 'APPR', 'APPRART', 'APPZR', words, 0)
pronomen = identify_words('PPER', words, 0)
verben = identify_words("VA(FIN)", "VA(IMP)", "VA(INF)", "VM(FIN)", "VM(INF)", "VM(PP)," "VV(FIN)", "VV(IMP)", "VV(INF)", "VV(IZU)", "VV(PP)," words, 0)
emojis = identify_words(emojis, words)
adjektive = identify_tags('ADJ', text)
adverbien = identify_tags('ADV', text)
artikel = identify_tags('ART', text)
modalverben = identify_tags('VM', text)
nomen = identify_tags('NN', text)
praepositionen = identify_tags('APPO', text)
pronomen = identify_tags('PPER', text)
verben = identify_tags("VA", text)
emojis = identify_words("emojis.txt", text)

num_adjektive = len(adjektive)
num_adverbien = len(adverbien)
Expand All @@ -84,9 +84,9 @@ def analyze_wortsalat (text: str) -> dict:
ratio_verben = len(verben)/ num_total_words
ratio_emojis = len(emojis)/ num_total_words

ich = identify_words(ich, words)
wir = identify_words(wir, words)
ich_wir_verhältnis = ich/ wir
ich = identify_words("ich.txt", text)
wir = identify_words("wir.txt", text)
ich_wir_verhältnis = len(ich) / len(wir)

analysis_small = {
"total number of words": num_total_words,
Expand Down Expand Up @@ -144,8 +144,7 @@ def print_wortsalat_small(text: str) -> dict:
Returns:
- dict: A dictionary containing all metrics.
"""
text = input()
analysis_small = analyze_wortsalat(text)
analysis_small, analysis_big = analyze_wortsalat(text)
for key, value in analysis_small.items():
print(key, ":", value)

Expand All @@ -159,7 +158,6 @@ def print_wortsalat_big(text: str) -> dict:
Returns:
- dict: A dictionary containing all metrics.
"""
text = input()
analysis_big = analyze_wortsalat(text)
analysis_big, analysis_small = analyze_wortsalat(text)
for key, value in analysis_big.items():
print(key, ":", value)
1 change: 1 addition & 0 deletions src/wortsalat/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def count_average_words_per_sentence(text: str) -> float:
- float: The average number of words per sentence in the input text.
"""
sentences = split_sentences(text)
print(sentences)
total_words = sum(len(sentence.split()) for sentence in sentences)
length_average_sentence = total_words / len(sentences)
return length_average_sentence
7 changes: 5 additions & 2 deletions src/wortsalat/identify_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

tagger = ht.HanoverTagger('morphmodel_ger.pgz')

def identify_tags(tag: str, text: str, taglevel: int = 1) -> Dict[str, List[str]]:
def identify_tags(tag: str, text: str) -> Dict[str, List[str]]:
"""
This function tags the words using the HanTa library, and then identifies the words that match the specified POS tag.
Expand All @@ -30,11 +30,14 @@ def identify_tags(tag: str, text: str, taglevel: int = 1) -> Dict[str, List[str]
- Dict[str, List[str]]: A dictionary where each key is a POS tag and each value is a list of words that were assigned that tag.
"""
words = tokenize_words(text)
tagged_words = tagger.tag_sent(words, taglevel=taglevel)
tagged_words = tagger.tag_sent(words, taglevel=2)

words_with_tag = list()

print(tagged_words)

for word in tagged_words:
print(word)
if word[2] == tag:
words_with_tag.append(word)

Expand Down
4 changes: 2 additions & 2 deletions src/wortsalat/identify_words.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from importlib import resources as impresources
from . import data

def identify_words(type: str ,text: str) -> list[str]:
def identify_words(data_type: str ,text: str) -> list[str]:
"""
Identify words in a given text that match a specific word list.
Expand All @@ -12,7 +12,7 @@ def identify_words(type: str ,text: str) -> list[str]:
Returns:
- list: The words in the input text that match the specified word list.
"""
inp_file = (impresources.files(data) / type)
inp_file = (impresources.files(data) / str(data_type))
with inp_file.open("rt") as f:
dictionary = f.readlines()
dictionary = [line.strip("\n") for line in dictionary]
Expand Down
2 changes: 1 addition & 1 deletion src/wortsalat/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,6 @@ def split_sentences(text: str) -> tuple[list[str]]:
tuple: A tuple containing:
- list: A list of sentences extracted from the input text.
"""
sentences = sent_tokenize(text)
sentences = sent_tokenize(text, language="german")

return sentences
18 changes: 13 additions & 5 deletions tests/test_analyze_wortsalat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
#from wortsalat import analyze_wortsalat
#import pytest
#
#def test_analyze_wortsalat():
# print(analyze_wortsalat("kannst du damit aufhören?"))
from wortsalat.analyze_wortsalat import analyze_wortsalat, print_wortsalat_small, print_wortsalat_big

def test_analyze_words():
    """Check that analyze_wortsalat runs on a short German text and returns a pair of dicts.

    The print helpers in wortsalat.analyze_wortsalat unpack the return value
    into two names and iterate over ``.items()``, so the result must be a
    two-element sequence of dictionaries. Which element is the "small" and
    which the "big" analysis is deliberately not asserted here.
    """
    text = "Hallo leute wir sind heute auf einem Bauernhof. Alle Tiere sind in Ordnung nur eins ist doof. Das Rapphuhn. Das rappt nun."
    result = analyze_wortsalat(text)

    # Unpacking fails loudly if the function stops returning exactly two values.
    first, second = result
    assert isinstance(first, dict)
    assert isinstance(second, dict)

def test_print_wortsalat_small():
    """Smoke test: print_wortsalat_small must run on a short German sample without raising."""
    # Adjacent literals are concatenated at compile time into the same single
    # string the test has always used.
    sample = (
        "Hallo leute wir sind heute auf einem Bauernhof. "
        "Alle Tiere sind in Ordnung nur eins ist doof. "
        "Das Rapphuhn. Das rappt nun."
    )
    print_wortsalat_small(sample)

def test_print_wortsalat_big():
    """Smoke test: print_wortsalat_big must run on a short German sample without raising."""
    # Four short sentences, joined with single spaces into the one input string.
    sentences = [
        "Hallo leute wir sind heute auf einem Bauernhof.",
        "Alle Tiere sind in Ordnung nur eins ist doof.",
        "Das Rapphuhn.",
        "Das rappt nun.",
    ]
    print_wortsalat_big(" ".join(sentences))

0 comments on commit 63a1069

Please sign in to comment.