From 6527ce2464c741f7ed67e10c965becf8553ccd84 Mon Sep 17 00:00:00 2001
From: anthology-assist <126604033+anthology-assist@users.noreply.github.com>
Date: Thu, 7 Nov 2024 14:32:28 -0600
Subject: [PATCH] Ingestion: JLCL (#3994)

---
 data/xml/2023.jlcl.xml     | 144 +++++++++++++++++++++++++++++++++++++
 data/yaml/venues/jlcl.yaml |   3 +
 2 files changed, 147 insertions(+)
 create mode 100644 data/xml/2023.jlcl.xml
 create mode 100644 data/yaml/venues/jlcl.yaml
diff --git a/data/xml/2023.jlcl.xml b/data/xml/2023.jlcl.xml
new file mode 100644
index 0000000000..69c09b0c3e
--- /dev/null
+++ b/data/xml/2023.jlcl.xml
@@ -0,0 +1,144 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<collection id="2023.jlcl">
+  <volume id="1" ingest-date="2024-11-01" type="proceedings">
+    <meta>
+      <booktitle>Journal for Language Technology and Computational Linguistics, Vol. 36 No. 1</booktitle>
+      <editor><first>Roman</first><last>Schneider</last></editor>
+      <editor><first>Gertrud</first><last>Faaß</last></editor>
+      <publisher>German Society for Computational Linguistics and Language Technology</publisher>
+      <address>unknown</address>
+      <month>May</month>
+      <year>2023</year>
+      <venue>jlcl</venue>
+    </meta>
+    <frontmatter>
+      <url hash="ae6a48e0">2023.jlcl-1.0</url>
+      <bibkey>jlcl-2023-1</bibkey>
+    </frontmatter>
+    <paper id="1">
+      <title>Computerlinguistische Herausforderungen, empirische Erforschung &amp; multidisziplinäres Potenzial deutschsprachiger Songtexte</title>
+      <author><first>Roman</first><last>Schneider</last></author>
+      <author><first>Gertrud</first><last>Faaß</last></author>
+      <pages>iii-v</pages>
+      <url hash="d35cfcbc">2023.jlcl-1.1</url>
+      <doi>10.21248/jlcl.36.2023.234</doi>
+      <bibkey>schneider-faass-2023-computerlinguistische</bibkey>
+    </paper>
+    <paper id="2">
+      <title><fixed-case>E</fixed-case>nglish and <fixed-case>G</fixed-case>erman pop song lyrics: Towards a contrastive textology</title>
+      <author><first>Valentin</first><last>Werner</last></author>
+      <pages>1–20</pages>
+      <abstract>The present contribution offers a contrastive corpus-based analysis of English and German pop lyrics. It conceptualizes lyrics as a specific text type/register and tries to identify cross-linguistic commonalities and differences. As empirical base, it uses corpora that represent the lyrics of commercially highly successful pop songs in Anglophone and German contexts. Given the similar sociocultural functions and production circumstances of English and German lyrics, the study starts from the assumption that a large-scale linguistic overlap can be traced. While indeed cross-linguistic convergence is found especially for lexical patterns in terms of topic choice, the analysis also reveals a common property of conveying a conversational feel through lexicogrammatical means. However, given the differing typological make-up of the languages contrasted, fine-grained differences emerge as regards the ways conversationality/informality is established in pop lyrics as a performed text type.</abstract>
+      <url hash="c9c32389">2023.jlcl-1.2</url>
+      <doi>10.21248/jlcl.36.2023.235</doi>
+      <bibkey>werner-2023-english</bibkey>
+    </paper>
+    <paper id="3">
+      <title>Keyness in song lyrics: Challenges of highly clumpy data</title>
+      <author><first>Jan</first><last>Langenhorst</last></author>
+      <author><first>Yannick</first><last>Frommherz</last></author>
+      <author><first>Simon</first><last>Meier-Vieracker</last></author>
+      <pages>21–38</pages>
+      <abstract>Computer-assisted stylistic analyses regularly employ the calculation of keywords. We show that the inclusion of a separate dispersion measure in addition to a frequency measure into keyword analysis (or more generally: keyness analysis), as proposed by Gries (2021), is a necessary extension of said analyses. Using texts from the German Songkorpus, we demonstrate that traditional keyword calculations using only frequency measures lead to spurious results. Determining keywords by both measuring a word’s frequency and its dispersion in comparison to a reference corpus gives a more realistic view. This is especially relevant for our corpus, since song lyrics turn out to be extraordinarily clumpy data: Words that are very frequent in one artist’s subcorpus typically only occur in a few or even just a single one of their songs due to widespread word repetition within songs, e.g., in choruses. Song lyrics in our dataset are shown to not feature words that can be considered key at all. Our contribution is twofold: (1) We demonstrate the utility of Gries’ (2021) approach and (2) interpret the (lack of) results in terms of a genre-specific property which is that song lyrics are lexically autonomous works of art.</abstract>
+      <url hash="6c7e6259">2023.jlcl-1.3</url>
+      <doi>10.21248/jlcl.36.2023.236</doi>
+      <bibkey>langenhorst-etal-2023-keyness</bibkey>
+    </paper>
+    <paper id="4">
+      <title>Ist alte Schule oldschool? Zum “Nutzen” von Anglizismen in Deutschraptexten</title>
+      <author><first>Marco</first><last>Gierke</last></author>
+      <pages>39–72</pages>
+      <abstract>Der vorliegende Beitrag vergleicht die Verwendung der anglizistischen Nomination old school und der nativen Entsprechung Alte Schule im Hip-Hop-Subkorpus des Songkorpus (Schneider 2020). Dieser Vergleich erfolgt auf zwei Ebenen: Zum einen wird die diskurs-spezifische Verwendung anhand eines adaptierten Analyse-Frameworks für Hip-Hop-Texte von Androutsopoulos und Scholz (2002) untersucht, zum anderen wird der syntaktische und morphologische Gebrauch in den Deutschraptexten analysiert. Dabei zeigt sich, dass es jeweils spezifische Verwendungstendenzen auf diskursiver Ebene gibt, die wesentlichsten Unterschiede aber in der syntaktischen und morphologischen Verwendung auftreten, allen voran in der höheren Produktivität der anglizistischen Nomination. Es wird dafür argumentiert, dass sich dies unter anderem auf sprachstrukturelle bzw. wortformale Spezifika des Englischen zurückführen lässt, wie den nicht vorhandenen Flexionssuffixen der Adjektive. Damit werden die in der Anglizismenforschung etablierten Überlegungen zu Verwendungsgründen um eine simple, aber gegebenenfalls folgenreiche Beobachtung ergänzt, die sich vor allem bei den sprachökonomischen Ansätzen einordnen lässt. Schließlich wird darüber auf diskursiver Ebene wiederum auch ein Bezug zu terminologischen Vorteilen hergeleitet: Trotz flexibler Verwendung wird das schriftliche Abbild bei Wortbildungen geschont (Oldschoolstyle, Oldschool-Aufnahmen, Oldschooler), was für die Wiedererkennbarkeit des Diskurselements – neben der zusätzlichen Auszeichnung durch die Eigenschaft ‚fremdsprachig‘ – zuträglich sein könnte.</abstract>
+      <url hash="fcfc604f">2023.jlcl-1.4</url>
+      <doi>10.21248/jlcl.36.2023.237</doi>
+      <bibkey>gierke-2023-ist</bibkey>
+    </paper>
+    <paper id="5">
+      <title>“Beinahe-ums-Leben-kommen-in-Regenpfützen” und “Chauvi-Macho-Macker-Stuss” – kreative Wortbildungen in Songtexten</title>
+      <author><first>Katrin</first><last>Hein</last></author>
+      <pages>73–92</pages>
+      <abstract>Im Zentrum dieses Beitrags steht die Analyse kreativer Wortbildungsprodukte in Songtexten. Der Fokus liegt somit bewusst auf solchen Wortbildungen, die nicht den Weg ins Lexikon finden, sondern gerade aufgrund ihres okkasionellen Charakters einen erhöhten Grad an Expressivität aufweisen, der dann gezielt für die spezifische kreative Qualität von Songtexten genutzt wird. Solche okkasionellen komplexen Wörter, die sich in theoretischer Hinsicht innerhalb der Domäne der ‚Extravagant Morphology‘ verorten lassen, werden über das Kriterium der Wortlänge aus dem Songkorpus herausgefiltert und im Anschluss hinsichtlich ihrer formalen sowie semantisch-pragmatischen Besonderheiten analysiert. Im Vordergrund steht dabei die Frage, wodurch die Kreativität der insgesamt 183 Bildungen des Untersuchungskorpus getriggert wird. Die Analyse zeigt, dass expressive Effekte in Songtexten offenbar sowohl durch die Verwendung markierter Wortbildungsmuster als auch durch den Rückgriff auf ‚auffällige‘ Lexik erzeugt werden. Zum einen ist der Anteil markierter Wortbildungsmuster wie der Phrasenkomposition und anderer phrasaler Wortbildungen gegenüber klassischen Textsorten wie Zeitungstexten deutlich erhöht. Zum anderen wird durch die Verwendung einer umgangssprachlichen, vulgären, brutalen oder poetischen Lexik, aber auch mit unmarkierten Wortbildungsmustern wie der prototypischen Determinativkomposition, Aufmerksamkeit erregt. Insgesamt erweist sich das Songkorpus dabei als wahre Fundgrube für kreative Wortbildungsprodukte.</abstract>
+      <url hash="19bfe231">2023.jlcl-1.5</url>
+      <doi>10.21248/jlcl.36.2023.238</doi>
+      <bibkey>hein-2023-beinahe</bibkey>
+    </paper>
+    <paper id="6">
+      <title>Phraseme im Songkorpus: Etabliertes in Anti-Establishment-Texten</title>
+      <author><first>Elke</first><last>Donalies</last></author>
+      <pages>93–112</pages>
+      <abstract>Das Songkorpus erlaubt Einblicke in bestimmte gesellschaftliche Diskurse, die in anderen Sprachkorpora weniger zur Geltung kommen. Das zeigt sich auch bei der Analyse von Phrasemen im Songkorpus. Phraseme sind etablierte Wortkombinationen; sie konservieren kollektives Wissen, kollektive Kultur. Element of Crime, Fettes Brot, Udo Lindenberg, Stefan Stoppok, Konstantin Wecker, Marius Müller-Westernhagen, die Autoren meines kleinen Teilkorpus, sind Anti-Establishment und alles andere als konservativ. Zwar verwenden sie häufig Phraseme verschiedenster Struktur und Art, karikieren sie aber auch häufig, spielen lässig mit ihnen, hinterfragen ihre Bedeutung, verändern ihre Bedeutung. Ihre spezielle Haltung bedingt spezielle Phraseme und spezielle Phrasemvarianten.</abstract>
+      <url hash="9cb9368c">2023.jlcl-1.6</url>
+      <doi>10.21248/jlcl.36.2023.239</doi>
+      <bibkey>donalies-2023-phraseme</bibkey>
+    </paper>
+    <paper id="7">
+      <title>Empirische Verortung konzeptioneller Nähe/Mündlichkeit inner- und außerhalb schriftsprachlicher Korpora</title>
+      <author><first>Sarah</first><last>Broll</last></author>
+      <author><first>Roman</first><last>Schneider</last></author>
+      <pages>113–150</pages>
+      <abstract>Linguistische Studien arbeiten häufig mit einer Differenzierung zwischen gesprochener und geschriebener Sprache bzw. zwischen Kommunikation der Nähe und Distanz. Die Annahme eines Kontinuums zwischen diesen Polen bietet sich für eine Verortung unterschiedlichster Äußerungsformen an, inklusive unkonventioneller Textsorten wie etwa Popsongs. Wir konzipieren, implementieren und evaluieren ein automatisiertes Verfahren, das mithilfe unkorrelierter Entscheidungsbäume entsprechende Vorhersagen auf Textebene durchführt. Für die Identifizierung der Pole definieren wir einen Merkmalskatalog aus Sprachphänomenen, die als Markierer für Nähe/Mündlichkeit bzw. Distanz/Schriftlichkeit diskutiert werden, und wenden diesen auf prototypische Nähe-/Mündlichkeitstexte sowie prototypische Distanz-/Schrifttexte an. Basierend auf der sehr guten Klassifikationsgüte verorten wir anschließend eine Reihe weiterer Textsorten mithilfe der trainierten Klassifikatoren. Dabei erscheinen Popsongs als „mittige Textsorte“, die linguistisch motivierte Merkmale unterschiedlicher Kontinuumsstufen vereint. Weiterhin weisen wir nach, dass unsere Modelle mündlich kommunizierte, aber vorab oder nachträglich verschriftlichte Äußerungen wie Reden oder Interviews vollkommen anders verorten als prototypische Gesprächsdaten und decken Klassifikationsunterschiede für Social-Media-Varianten auf. Ziel ist dabei nicht eine systematisch-verbindliche Einordung im Kontinuum, sondern eine empirische Annäherung an die Frage, welche maschinell vergleichsweise einfach bestimmbaren Merkmale („shallow features“) nachweisbar Einfluss auf die Verortung haben.</abstract>
+      <url hash="5eb00729">2023.jlcl-1.7</url>
+      <doi>10.21248/jlcl.36.2023.240</doi>
+      <bibkey>broll-schneider-2023-empirische</bibkey>
+    </paper>
+    <paper id="8">
+      <title>Segmentierungs- und Annotationsverfahren für die Texte Udo Lindenbergs: Apostrophe und andere Herausforderungen</title>
+      <author><first>Gertrud</first><last>Faaß</last></author>
+      <author><first>Helmut</first><last>Schmid</last></author>
+      <pages>151–170</pages>
+      <abstract>In der Computerlinguistik ist eine kaskadische Prozessierung von Texten üblich. Dabei werden diese zuerst segmentiert (tokenisiert), d.h. Tokens und ggf. Satzgrenzen werden erkannt. Dabei entsteht meist eine Liste bzw. eine einspaltige Tabelle, die sukzessive durch weitere Prozessierungschritte um zusätzliche Spalten – also positionale Annotationen wie z.B. Wortarten und Lemmata für die Tokens in der ersten Spalte – ergänzt wird. Bei der Tokenisierung werden alle Spatien (Leerzeichen) gelöscht. Schon immer problematisch waren dabei Interpunktionszeichen, da diese äußerst ambig sein können, aber auch mehrteilige Namen, die Leerzeichen enthalten und eigentlich zusammengehören. Dieser Beitrag fokussiert auf den Apostroph, der in vielfältiger Weise in den Texten Udo Lindenbergs eingesetzt wird sowie auf mehrteilige Namen, die wir als Tokens erhalten möchten. Wir nutzen dafür das komplette Lindenberg-Archiv des songkorpus.de-Repositoriums, kategorisieren die auftretenden Phänomene, erstellen einen Goldstandard und entwickeln ein teils regel-, teils auf maschinellem Lernen basierendes Segmentierungswerkzeug, das insbesondere die auftretenden Apostrophe, aber auch - lexikonbasiert - mehrteilige Namen nach unseren Vorstellungen erkennt und tokenisiert. Im Anschluss trainieren wir den RNN-Tagger (Schmid, 2019) und zeigen auf, dass ein spezifisch für diese Texte angepasstes Training zu Genauigkeiten ≥ 96% führt. Dabei entsteht nicht nur ein Goldstandard des annotierten Korpus, das dem Songkorpus-Repositorium zur Verfügung gestellt wird, sondern auch eine angepasste Version des RNN-Taggers (verfügbar auf github), die für ähnliche Texte verwendet werden kann.</abstract>
+      <url hash="bfc66921">2023.jlcl-1.8</url>
+      <doi>10.21248/jlcl.36.2023.241</doi>
+      <bibkey>faass-schmid-2023-segmentierungs</bibkey>
+    </paper>
+    <paper id="9">
+      <title>Automatic Authorship Classification for <fixed-case>G</fixed-case>erman Lyrics Using Naïve <fixed-case>B</fixed-case>ayes</title>
+      <author><first>Akshay</first><last>Mendhakar</last></author>
+      <author><first>Mesian</first><last>Tilmatine</last></author>
+      <pages>171–182</pages>
+      <abstract>Text classification is a prevalent and essential machine-learning task. Machine learning classifiers have developed immensely since their inception. The naïve Bayes classifier is one of the most prominent supervised machine learning classifiers. In this experiment, we highlight the performance of Naïve Bayes for classifying of authors/artists on the German lyrics corpus (“Songkorpus”) and compare the classification results with other classifier algorithms. The corpus of investigation consists of six artists with 970 songs in total. Bayes model evaluation measures revealed a precision of 0.91, recall of 0.94, and F1-measure of 0.9. Furthermore, the classification performance with other classifier algorithms did not reveal any statistically significant difference in performance. The results of the study add to the high volume of reports on the classification accuracy of Naive Bayes for the task of lyrical classification.</abstract>
+      <url hash="87ae8b7a">2023.jlcl-1.9</url>
+      <doi>10.21248/jlcl.36.2023.242</doi>
+      <bibkey>mendhakar-tilmatine-2023-automatic</bibkey>
+    </paper>
+  </volume>
+  <volume id="2" ingest-date="2024-11-01" type="proceedings">
+    <meta>
+      <booktitle>Journal for Language Technology and Computational Linguistics, Vol. 36 No. 2</booktitle>
+      <editor><first>Christian</first><last>Wartena</last></editor>
+      <publisher>German Society for Computational Linguistics and Language Technology</publisher>
+      <address>unknown</address>
+      <month>May</month>
+      <year>2023</year>
+      <venue>jlcl</venue>
+    </meta>
+    <frontmatter>
+      <url hash="3b42657c">2023.jlcl-2.0</url>
+      <bibkey>jlcl-2023-2</bibkey>
+    </frontmatter>
+    <paper id="1">
+      <title>Kencorpus: A Kenyan Language Corpus of <fixed-case>S</fixed-case>wahili, Dholuo and Luhya for Natural Language Processing Tasks</title>
+      <author><first>Barack</first><last>Wanjawa</last></author>
+      <author><first>Lilian</first><last>Wanzare</last></author>
+      <author><first>Florence</first><last>Indede</last></author>
+      <author><first>Owen</first><last>McOnyango</last></author>
+      <author><first>Edward</first><last>Ombui</last></author>
+      <author><first>Lawrence</first><last>Muchemi</last></author>
+      <pages>1–27</pages>
+      <abstract>Indigenous African languages are categorized as under-served in Natural Language Processing. They therefore experience poor digital inclusivity and information access. The processing challenge with such languages has been how to use machine learning and deep learning models without the requisite data. The Kencorpus project intends to bridge this gap by collecting and storing text and speech data that is good enough for data-driven solutions in applications such as machine translation, question answering and transcription in multilingual communities. The Kencorpus dataset is a text and speech corpus for three languages predominantly spoken in Kenya: Swahili, Dholuo and Luhya (three dialects of Lumarachi, Lulogooli and Lubukusu). Data collection was done by researchers who were deployed to the various data collection sources such as communities, schools, media, and publishers. The Kencorpus’ dataset has a collection of 5,594 items, being 4,442 texts of 5.6 million words and 1,152 speech files worth 177 hours. Based on this data, other datasets were also developed such as Part of Speech tagging sets for Dholuo and the Luhya dialects of 50,000 and 93,000 words tagged respectively. We developed 7,537 Question-Answer pairs from 1,445 Swahili texts and also created a text translation set of 13,400 sentences from Dholuo and Luhya into Swahili. The datasets are useful for downstream machine learning tasks such as model training and translation. Additionally, we developed two proof of concept systems: for Kiswahili speech-to-text and a machine learning system for Question Answering task. These proofs provided results of a performance of 18.87% word error rate for the former, and 80% Exact Match (EM) for the latter system. These initial results give great promise to the usability of Kencorpus to the machine learning community. Kencorpus is one of few public domain corpora for these three low resource languages and forms a basis of learning and sharing experiences for similar works especially for low resource languages. Challenges in developing the corpus included deficiencies in the data sources, data cleaning challenges, relatively short project timelines and the Coronavirus disease (COVID-19) pandemic that restricted movement and hence the ability to get the data in a timely manner.</abstract>
+      <url hash="edb39e7a">2023.jlcl-2.1</url>
+      <doi>10.21248/jlcl.36.2023.243</doi>
+      <bibkey>wanjawa-etal-2023-kencorpus</bibkey>
+    </paper>
+    <paper id="2">
+      <title>The Proof is in the Pudding: Using Automated Theorem Proving to Generate Cooking Recipes</title>
+      <author><first>Louis</first><last>Mahon</last></author>
+      <author><first>Carl</first><last>Vogel</last></author>
+      <pages>29–85</pages>
+      <abstract>This paper presents FASTFOOD, a rule-based natural language generation (NLG) program for cooking recipes. We consider the representation of cooking recipes as discourse representation, because the meaning of each sentence needs to consider the context of the others. Our discourse representation system is based on states of affairs and transitions between states of affairs, and does not use discourse referents. Recipes are generated by using an automated theorem-proving procedure to select the ingredients and instructions, with ingredients corresponding to axioms and instructions to implications. FASTFOOD also contains a temporal optimization module which can rearrange the recipe to make it more time efficient for the user, e.g. the recipe specifies to chop the vegetables while the rice is boiling. The system is described in detail, including the decision to forgo discourse referents and how plausible representations of nouns and verbs emerge purely as a by-product of the practical requirements of efficiently representing recipe content. A comparison is then made with existing recipe generation systems, NLG systems more generally, and automated theorem provers.</abstract>
+      <url hash="21767a9e">2023.jlcl-2.2</url>
+      <doi>10.21248/jlcl.36.2023.233</doi>
+      <bibkey>mahon-vogel-2023-proof</bibkey>
+    </paper>
+  </volume>
+</collection>
diff --git a/data/yaml/venues/jlcl.yaml b/data/yaml/venues/jlcl.yaml
new file mode 100644
index 0000000000..efae5d2b14
--- /dev/null
+++ b/data/yaml/venues/jlcl.yaml
@@ -0,0 +1,3 @@
+acronym: JLCL
+name: Journal for Language Technology and Computational Linguistics
+type: journal