diff --git a/Makefile b/Makefile index a93259c9e7..d50cc5d8fc 100644 --- a/Makefile +++ b/Makefile @@ -307,6 +307,9 @@ autofix: check_staged_xml venv/bin/activate [ "$${PRE_DIFF}" = "$${POST_DIFF}" ] || EXIT_STATUS=1 ;\ [ $${EXIT_STATUS} -eq 0 ] +.PHONY: reformat +reformat: autofix + .PHONY: serve serve: @echo "INFO Starting a server at http://localhost:8000/" diff --git a/bin/add_dois.py b/bin/add_dois.py index 59015599e5..44efef5e6b 100755 --- a/bin/add_dois.py +++ b/bin/add_dois.py @@ -125,7 +125,7 @@ def process_volume(anthology_volume): added = add_doi(paper, collection_id, volume_id, force=args.force) if added: num_added += 1 - sleep(1) + sleep(0.1) indent(tree.getroot()) diff --git a/bin/ingest.py b/bin/ingest.py index e75bfdaa3d..aebe601b50 100755 --- a/bin/ingest.py +++ b/bin/ingest.py @@ -294,6 +294,7 @@ def find_book(): meta["path"], "cdrom", f"{year}-{venue_name.lower()}-{volume_name}.pdf", + f"{venue_name.lower()}-{year}.{volume_name}.pdf", ), os.path.join(meta["path"], "cdrom", f"{venue_name.upper()}-{year}.pdf"), ] diff --git a/bin/ingest_aclpub2.py b/bin/ingest_aclpub2.py index d661fc51b6..39ac8b3bbb 100755 --- a/bin/ingest_aclpub2.py +++ b/bin/ingest_aclpub2.py @@ -204,8 +204,9 @@ def parse_paper_yaml(ingestion_dir: str) -> List[Dict[str, str]]: else: raise Exception("Can't find papers.yml (looked in root dir and under inputs/)") - # remove non-archival papers - papers = [p for p in papers if p.get('archival', True)] + for paper in papers: + if "archival" not in paper: + paper["archival"] = False return papers @@ -220,39 +221,40 @@ def add_paper_nums_in_paper_yaml( start, end = 1, 0 for paper in papers: - assert 'file' in paper.keys(), f'{paper["id"]} is missing key file' + if paper["archival"]: + assert 'file' in paper.keys(), f'{paper["id"]} is missing key file' - paper_id = str(paper['id']) - # if 'file' not in paper.keys(): - # print(f'{paper_id} does not have file key but archive is {paper["archival"]}') - # paper_name = paper['title'] - # else: - - paper_path = paper['file'] - - # TODO: we should just be able to read paper_path directly, and throw an - # error if it doesn't exist - paper_need_read_path = None - paths_to_check = [ - ingestion_dir / "watermarked_pdfs" / paper_path, - ingestion_dir / "watermarked_pdfs" / f"{paper_id}.pdf", - ] - paper_need_read_path = None - for path in paths_to_check: - if path.exists(): - paper_need_read_path = str(path) - break - else: - raise Exception( - f"* Fatal: could not find paper ID {paper_id} ({paths_to_check})" - ) + paper_id = str(paper['id']) + # if 'file' not in paper.keys(): + # print(f'{paper_id} does not have file key but archive is {paper["archival"]}') + # paper_name = paper['title'] + # else: - pdf = open(paper_need_read_path, 'rb') - pdf_reader = PyPDF2.PdfReader(pdf) - num_of_pages = len(pdf_reader.pages) - start = end + 1 - end = start + num_of_pages - 1 - paper['pages'] = f'{start}-{end}' + paper_path = paper['file'] + + # TODO: we should just be able to read paper_path directly, and throw an + # error if it doesn't exist + paper_need_read_path = None + paths_to_check = [ + ingestion_dir / "watermarked_pdfs" / paper_path, + ingestion_dir / "watermarked_pdfs" / f"{paper_id}.pdf", + ] + paper_need_read_path = None + for path in paths_to_check: + if path.exists(): + paper_need_read_path = str(path) + break + else: + raise Exception( + f"* Fatal: could not find paper ID {paper_id} ({paths_to_check})" + ) + + pdf = open(paper_need_read_path, 'rb') + pdf_reader = PyPDF2.PdfReader(pdf) + num_of_pages = 
len(pdf_reader.pages) + start = end + 1 + end = start + num_of_pages - 1 + paper['pages'] = f'{start}-{end}' return papers @@ -532,6 +534,7 @@ def copy_pdf_and_attachment( volume[0] = { "anthology_id": f"{collection_id}-{volume_name}.0", "attachments": [], + "archival": True, } frontmatter_src_path = None @@ -562,28 +565,23 @@ def copy_pdf_and_attachment( paper_num = 0 for i, paper in enumerate(papers): - # archival papers only - if 'archival' not in paper.keys(): - paper.update({'archival': '1'}) assert 'archival' in paper.keys(), f'{paper["id"]} is missing key archival' assert 'file' in paper.keys(), f'{paper["id"]} is missing key file' - if ( - paper['archival'] == 1 - or paper['archival'] is True - or paper['archival'] == '1' - ): - # copy pdf - # if 'file' not in paper.keys(): - # paper_name = paper['title'] - # print(f'{paper_name} does not have file key') - # else: - paper_name = paper['file'] - # paper_name = paper['file'] - if paper_name != '' or paper_name is not None: - paper_id = str(paper['id']) - paper_num += 1 - paper_id_full = f'{collection_id}-{volume_name}.{paper_num}' + paper_name = paper['file'] + # paper_name = paper['file'] + if paper_name != '' or paper_name is not None: + paper_id = str(paper['id']) + paper_num += 1 + paper_id_full = f'{collection_id}-{volume_name}.{paper_num}' + + volume[paper_num] = { + 'anthology_id': paper_id_full, + 'attachments': [], + 'archival': paper["archival"], + } + + if paper["archival"]: pdf_src_path = None if (pdfs_src_dir / paper_name).exists(): pdf_src_path = pdfs_src_dir / paper_name @@ -599,61 +597,53 @@ def copy_pdf_and_attachment( if not dry_run: maybe_copy(pdf_src_path, pdf_dest_path) - volume[paper_num] = { - 'anthology_id': paper_id_full, - 'pdf': pdf_dest_path, - 'attachments': [], - } - - # copy attachments - if 'attachments' in paper: - attachs_dest_dir = create_dest_path(attachments_dir, venue_name) - attachs_src_dir = meta['path'] / 'attachments' - # assert ( - # attachs_src_dir.exists() - # ), f'paper {i, paper_name} contains attachments but attachments folder was not found' - - for attachment in paper['attachments']: - file_path = Path(attachment.get('file', None)) - if file_path is None: - continue - - attach_src_path = None - paths_to_check = [ - attachs_src_dir / file_path, - attachs_src_dir / file_path.name, - ] - for path in paths_to_check: - if path.exists(): - attach_src_path = str(path) - break - else: - print( - f"Warning: paper {paper_id} attachment {file_path} not found, skipping", - file=sys.stderr, - ) - continue - - attach_src_extension = attach_src_path.split(".")[-1] - type_ = attachment['type'].replace(" ", "") - file_name = f'{collection_id}-{volume_name}.{paper_num}.{type_}.{attach_src_extension}' - - # the destination path - attach_dest_path = os.path.join(attachs_dest_dir, file_name).replace( - " ", "" + volume[paper_num]["pdf"] = pdf_dest_path + + # copy attachments + if 'attachments' in paper: + attachs_dest_dir = create_dest_path(attachments_dir, venue_name) + attachs_src_dir = meta['path'] / 'attachments' + # assert ( + # attachs_src_dir.exists() + # ), f'paper {i, paper_name} contains attachments but attachments folder was not found' + + for attachment in paper['attachments']: + file_path = Path(attachment.get('file', None)) + if file_path is None: + continue + + attach_src_path = None + paths_to_check = [ + attachs_src_dir / file_path, + attachs_src_dir / file_path.name, + ] + for path in paths_to_check: + if path.exists(): + attach_src_path = str(path) + break + else: + print( + f"Warning: 
paper {paper_id} attachment {file_path} not found, skipping", + file=sys.stderr, ) + continue - if Path(attach_src_path).exists(): - if dry_run: - print( - f'would\'ve moved {attach_src_path} to {attach_dest_path}' - ) - else: - maybe_copy(attach_src_path, attach_dest_path) - print(f"Attaching {attach_dest_path}/{type_} to {paper_num}") - volume[paper_num]['attachments'].append( - (attach_dest_path, type_) - ) + attach_src_extension = attach_src_path.split(".")[-1] + type_ = attachment['type'].replace(" ", "") + file_name = f'{collection_id}-{volume_name}.{paper_num}.{type_}.{attach_src_extension}' + + # the destination path + attach_dest_path = os.path.join(attachs_dest_dir, file_name).replace( + " ", "" + ) + + if Path(attach_src_path).exists(): + if dry_run: + print(f'would\'ve moved {attach_src_path} to {attach_dest_path}') + else: + maybe_copy(attach_src_path, attach_dest_path) + print(f"Attaching {attach_dest_path}/{type_} to {paper_num}") + volume[paper_num]['attachments'].append((attach_dest_path, type_)) return volume, collection_id, volume_name, proceedings_pdf_dest_path @@ -692,6 +682,10 @@ def create_xml( meta_node = None for paper_num, paper in sorted(volume.items()): + if not paper["archival"]: + print(f"Skipping non-archival paper #{paper_num}", file=sys.stderr) + continue + paper_id_full = paper['anthology_id'] # print(f'creating xml for paper name {paper}, in papers {papers[paper_num-1]}') if paper_num == 0: @@ -873,7 +867,12 @@ def main(ingestion_dir, pdfs_dir, attachments_dir, dry_run, anthology_dir, inges # Load the papers.yaml file, skipping non-archival papers papers = parse_paper_yaml(ingestion_dir) - # print(f'original paper {papers[0]}') + print( + "Found", + len([p for p in papers if p["archival"]]), + "archival papers", + file=sys.stderr, + ) # add page numbering by parsing the PDFs papers = add_paper_nums_in_paper_yaml(papers, ingestion_dir) diff --git a/data/xml/2014.clib.xml b/data/xml/2014.clib.xml new file mode 100644 index 0000000000..db2f77a610 --- /dev/null +++ b/data/xml/2014.clib.xml @@ -0,0 +1,129 @@ + + + + + Proceedings of the First International Conference on Computational Linguistics in Bulgaria (CLIB 2014) + Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences +
Sofia, Bulgaria
+ September + 2014 + 2014.clib-1 + clib + + + 110 + 2014.clib-1.0 + clib-2014-1 + + + Electronic Language Resources in Teaching Mathematical Linguistics + IvanDerzhanski + RositsaDekova + 1–5 + The central role of electronic language resources in education is widely recognised (cf. Brinkley et al, 1999; Bennett, 2010; Derzhanski et al., 2007, among others). The variety and ease of access of such resources predetermines their extensive use in both research and education. With regard to teaching mathematical linguistics, electronic dictionaries and annotated corpora play a particularly important part, being an essential source of information for composing linguistic problems and presenting linguistic knowledge. This paper discusses the need for electronic resources, especially for less studied or low-resource languages, their creation and various uses in teaching linguistics to secondary school students, with examples mostly drawn from our practical work. + 2014.clib-1.1 + derzhanski-dekova-2014-electronic + + + Harnessing Language Technologies in Multilingual Information Channelling Services + DimanKaragiozov + 6–13 + Scientists and industry have put significant efforts in creating suitable tools to analyze information flows. However, up to now there are no successful solutions for 1) dynamic modeling of the user-defined interests and further personalization of the results, 2) effective cross-language information retrieval, and 3) processing of multilingual content. As a consequence, much of the potentially relevant and otherwise accessible data from the media stream may elude users’ grasp. We present a multilingual information channeling system, MediaTalk, which offers broad integration between language technologies and advanced data processing algorithms for annotation, analysis and classification of multilingual content. As a result, the system not only provides an all-in-one monitoring service that covers both traditional and social media, but also offers dynamic modeling of user profiles, personalization of obtained data and cross-language information retrieval. Bulgarian and English press clipping services relying on this system implement advanced functionalities such as identification of emerging topics, forecasting and trend prediction, all of which allow the users to monitor their standing reputation, events and relations. The architecture of the system is robust, extensible and adheres to the Big Data paradigm. + 2014.clib-1.2 + karagiozov-2014-harnessing + + + Automatic Semantic Filtering of Morphosemantic Relations in <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et + SvetlozaraLeseva + IvelinaStoyanova + BorislavRizov + MariaTodorova + EkaterinaTarpomanova + 14–22 + In this paper we present a method for automatic assignment of morphosemantic relations between derivationally related verb–noun pairs of synsets in the Bulgarian WordNet (BulNet) and for semantic filtering of those relations. The filtering process relies on the meaning of noun suffixes and the semantic compatibility of verb and noun taxonomic classes. We use the taxonomic labels assigned to all the synsets in the Princeton WordNet (PWN) – one label per synset – which denote their general semantic class. In the first iteration we employ the pairs <noun suffix : noun label> to filter out part of the relations. In the second iteration, which uses as input the output of the first one, we apply a stronger semantic filter. 
It makes use of the taxonomic labels of the noun-verb synset pairs observed for a given morphosemantic relation. In this way we manage to reliably filter out impossible or unlikely combinations. The results of the performed experiment may be applied to enrich BulNet with morphosemantic relations and new synsets semi-automatically, while facilitating the manual work and reducing its cost. + 2014.clib-1.3 + leseva-etal-2014-automatic + + + Noun-Verb Derivation in the <fixed-case>B</fixed-case>ulgarian and the <fixed-case>R</fixed-case>omanian <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et – A Comparative Approach + EkaterinaTarpomanova + SvetlozaraLeseva + MariaTodorova + TsvetanaDimitrova + BorislavRizov + VerginicaBarbu Mititelu + ElenaIrimia + 23–31 + Romanian and Bulgarian are Balkan languages with rich derivational morphology that, if introduced into their respective wordnets, can aid broadening of the wordnet content and the possible NLP applications. In this paper we present a joint work on introducing derivation into the Bulgarian and the Romanian WordNets, BulNet and RoWordNet, respectively, by identifying and subsequently labelling the derivationally and semantically related noun-verb pairs. Our research aims at providing a framework for a comparative study on derivation in the two languages and offering training material for the automatic identification and assignment of derivational and morphosemantic relations needed in various applications. + 2014.clib-1.4 + tarpomanova-etal-2014-noun + + + Semi-Automatic Detection of Multiword Expressions in the <fixed-case>S</fixed-case>lovak Dependency Treebank + DanielaMajchrakova + OndrejDusek + JanHajic + AgataKarcova + RadovanGarabik + 32–39 + We describe a method for semi-automatic extraction of Slovak multiword expressions (MWEs) from a dependency treebank. The process uses an automatic conversion from dependency syntactic trees to deep syntax and automatic tagging of verbal argument nodes based on a valency dictionary. Both the valency dictionary and the treebank conversion were adapted from the corresponding Czech versions; the automatically translated valency dictionary has been manually proofread and corrected. There are two main achievements – a valency dictionary of Slovak MWEs with direct links to corresponding expressions in the Czech dictionary, PDT-Vallex, and a method of extraction of MWEs from the Slovak Dependency Treebank. The extraction reached very high precision but lower recall in a manual evaluation. This is a work in progress, the overall goal of which is twofold: to create a Slovak language valency dictionary paralleling the Czech one, with bilingual links; and to use the extracted verbal frames in a collocation dictionary of Slovak verbs. + 2014.clib-1.5 + majchrakova-etal-2014-semi + + + Automatic Categorisation of Multiword Expressions and Named Entities in <fixed-case>B</fixed-case>ulgarian + IvelinaStoyanova + 40–48 + This paper describes an approach for automatic categorisation of various types of multiword expressions (MWEs) with a focus on multiword named entities (MNEs), which compose a large portion of MWEs in general. The proposed algorithm is based on a refined classification of MWEs according to their idiomaticity. While MWE categorisation can be considered as a separate and independent task, it complements the general task of MWE recognition. After outlining the method, we set up an experiment to demonstrate its performance. 
We use the corpus Wiki1000+ that comprises 6,311 annotated Wikipedia articles of 1,000 or more words each, amounting to 13.4 million words in total. The study also employs a large dictionary of 59,369 MWEs noun phrases (out of more than 85,000 MWEs), labelled with their respective types. The dictionary is compiled automatically and verified semi-automatically. The research presented here is based on Bulgarian although most of the ideas, the methodology and the analysis are applicable to other Slavic and possibly other European languages. + 2014.clib-1.6 + stoyanova-2014-automatic + + + Temporal Adverbs and Adverbial Expressions in a Corpus of <fixed-case>B</fixed-case>ulgarian and <fixed-case>U</fixed-case>krainian Parallel Texts + IvanDerzhanski + OlenaSiruk + 49–54 + This paper presents a comparative bilingual corpus-based study of the use of several frequent temporal adverbs and adverbial expressions (‘always’, ‘sometimes’, ‘never’ and their synonyms) in Bulgarian and Ukrainian. The Ukrainian items were selected with the aid of synonym dictionaries of words and of set expressions, the corpus was used to identify their most common Bulgarian counterparts, and the frequencies of the correspondences were compared and scrutinised for possibly informative regularities. + 2014.clib-1.7 + derzhanski-siruk-2014-temporal + + + Historical Corpora of <fixed-case>B</fixed-case>ulgarian Language and Second Position Markers + TsvetanaDimitrova + AndrejBoyadzhiev + 55–63 + This paper demonstrates how historical corpora can be used in researching language phenomena. We exemplify the advantages and disadvantages through exploring three of the available corpora that contain textual sources of Old and Middle Bulgarian language to shed light on some aspects of the development of two words of ambiguous class. We discuss their behaviour to outline certain conditions for diachronic change they have undergone. The three corpora are accessible online (and offline – for downloading search results, xml files, etc.). + 2014.clib-1.8 + dimitrova-boyadzhiev-2014-historical + + + Machine Translation Based on <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et and Dependency Relations + LuchezarJackov + 64–72 + The proposed machine translation (MT) approach uses WordNet (Fellbaum, 1998) as a base for concepts. It identifies the concepts and dependency relations using context-free grammars (CFGs) enriched with features, role markers and dependency markers. Multiple interpretation hypotheses are generated and then are scored using a knowledge base for the dependency relations. The hypothesis with the best score is used for generating the translation. The approach has already been implemented in an MT system for seven languages, namely Bulgarian, English, French, Spanish, Italian, German, and Turkish, and also for Chinese on experimental level. + 2014.clib-1.9 + jackov-2014-machine + + + Recognize the Generality Relation between Sentences Using Asymmetric Association Measures + SebastiaoPais + GaelDias + RumenMoraliyski + 73–81 + In this paper we focus on a particular case of entailment, namely entailment by generality. We argue that there exist various types of implication, a range of different levels of entailment reasoning, based on lexical, syntactic, logical and common sense clues, at different levels of difficulty. We introduce the paradigm of Textual Entailment (TE) by Generality, which can be defined as the entailment from a specific statement towards a relatively more general statement.
In this context, the Text T entails the Hypothesis H, and at the same time H is more general than T. We propose an unsupervised and language-independent method to recognize TE by Generality given a case of Text − Hypothesis or T − H where entailment relation holds. + 2014.clib-1.10 + pais-etal-2014-recognize + + + Unsupervised and Language Independent Method to Recognize Textual Entailment by Generality + SebastiaoPais + GaelDias + JoaoCordeiro + RumenMoraliyski + 82–90 + In this work we introduce a particular case of textual entailment (TE), namely Textual Entailment by Generality (TEG). In text, there are different kinds of entailment yielded from different types of implicative reasoning (lexical, syntactic, common sense based), but here we focus just on TEG, which can be defined as an entailment from a specific statement towards a relatively more general one. Therefore, we have T →G H whenever the premise T entails the hypothesis H, the hypothesis being more general than the premise. We propose an unsupervised and language-independent method to recognize TEGs, given a pair T, H in an entailment relation. We have evaluated our proposal through two experiments: (a) Test on T →G H English pairs, where we know that TEG holds; (b) Test on T → H Portuguese pairs, randomly selected with 60% of TEGs and 40% of TE without generality dependency (TEnG). + 2014.clib-1.11 + pais-etal-2014-unsupervised + +
+
diff --git a/data/xml/2016.clib.xml b/data/xml/2016.clib.xml new file mode 100644 index 0000000000..f72ba3fa9c --- /dev/null +++ b/data/xml/2016.clib.xml @@ -0,0 +1,132 @@ + + + + + Proceedings of the Second International Conference on Computational Linguistics in Bulgaria (CLIB 2016) + Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences +
Sofia, Bulgaria
+ September + 2016 + 2016.clib-1 + clib + + + 110 + 2016.clib-1.0 + clib-2016-1 + + + How to Differentiate the Closely Related Standard Languages? + DuškoVitas + LjubomirPopović + CvetanaKrstev + AnđelkaZečević + 1–10 + In this paper the adequacy of the SETimes corpus as a basis for the comparison of closely related languages that are used in countries that emerged after the breakup of Yugoslavia is discussed by comparing it with other corpora. It is shown that the phenomena observed in this corpus and used to illustrate differences most specifically between Serbian and Croatian are consistent neither with their standards nor with other sources. Thus, results obtained on the basis of the SETimes corpus are corpus-biased and have to be reconsidered. This proves that the size of a corpus and its composition used in a linguistic research are crucial for assessing the obtained results. + 2016.clib-1.1 + vitas-etal-2016-differentiate + + + ’While’ and ’Until’ Clauses and Expletive Negation in a Corpus of <fixed-case>B</fixed-case>ulgarian and <fixed-case>U</fixed-case>krainian Parallel Texts + IvanDerzhanski + OlenaSiruk + 11–18 + The combination of the meanings ‘while’ and ‘until’ in a single lexeme and the use of expletive negation with the latter meaning are widespread phenomena that are a rich source of research problems. In this paper we present a comparative bilingual Bulgarian and Ukrainian corpus-based study of several conjunctions that share these two meanings. We discuss the difference in the frequency of expletive negation in the two languages, the use of až ‘even, all the way’ in Ukrainian and the impact of the original language in translated texts. + 2016.clib-1.2 + derzhanski-siruk-2016-clauses + + + Linguistic Data Retrievable from a Treebank + VerginicaBarbu Mititelu + ElenaIrimia + 19–27 + This paper describes the Romanian treebank annotated according to the Universal Dependency principles. We present the types of texts included in the treebank, their processing phases and the tools used for doing it, as well as the levels of annotation, with a focus on the syntactic level. We briefly present the syntactic formalism used, the principles followed and the set of relations. The perspective we adopted is the linguist’s who searches the treebank for information with relevance for the study of Romanian. (S)He can interpret the statistics based on the corpus and can also query the treebank for finding examples to support a theory, for testing hypothesis or for discovering new tendencies. We use here the passive constructions in Romanian as a case study for showing how statistical data help understanding this linguistic phenomenon. We also discuss the kinds of linguistic information retrievable and non-retrievable from the treebank, based on the annotation principles. + 2016.clib-1.3 + barbu-mititelu-irimia-2016-linguistic + + + Towards the Automatic Identification of Light Verb Constructions in <fixed-case>B</fixed-case>ulgarian + IvelinaStoyanova + SvetlozaraLeseva + MariaTodorova + 28–37 + This paper presents work in progress focused on developing a method for automatic identification of light verb constructions (LVCs) as a subclass of Bulgarian verbal MWEs. The method is based on machine learning and is trained on a set of LVCs extracted from the Bulgarian WordNet (BulNet) and the Bulgarian National Corpus (BulNC). The machine learning uses lexical, morphosyntactic, syntactic and semantic features of LVCs.
We trained and tested two separate classifiers using the Java package Weka and two learning decision tree algorithms – J48 and RandomTree. The evaluation of the method includes 10-fold cross-validation on the training data from BulNet (F1 = 0.766 obtained by the J48 decision tree algorithm and F1 = 0.725 by the RandomTree algorithm), as well as evaluation of the performance on new instances from the BulNC (F1 = 0.802 by J48 and F1 = 0.607 by the RandomTree algorithm). Preliminary filtering of the candidates gives a slight improvement (F1 = 0.802 by J48 and F1 = 0.737 by RandomTree). + 2016.clib-1.4 + stoyanova-etal-2016-towards + + + <fixed-case>HR</fixed-case>4<fixed-case>EU</fixed-case> – Using Language Resources in Computer Aided Language Learning + DašaFarkaš + MateaFilko + MarkoTadić + 38–46 + In this paper we present the HR4EU – web portal for e-learning of Croatian language. The web portal offers a new method of computer aided language learning (CALL) by encouraging language learners to use different language resources available for Croatian: corpora, inflectional and derivational morphological lexicons, treebank, Wordnet, etc. Apart from the previously developed language resources, the new ones are created in order to further facilitate the learning of Croatian language. We will focus on the usage of the treebank annotated at syntactic and semantic level in the CALL and describe the new HR4EU sub-corpus of the Croatian Dependency Treebank (HOBS). The HR4EU sub-corpus consists of approx. 550 sentences, which are manually annotated on syntactic and semantic role level according to the specifications used for the HOBS. The syntactic and the semantic structure of the sentence can be visualized as a dependency tree via the SynSem Visualizer. The visualization of the syntactic and the semantic structure of sentences will help users to produce syntactically and semantically correct sentences on their own. + 2016.clib-1.5 + farkas-etal-2016-hr4eu + + + <fixed-case>S</fixed-case>yn<fixed-case>T</fixed-case>ags – Web Interface for Syntactic and Semantic Annotation + AtanasAtanasov + 47–53 + This paper presents a web tool for syntactic and semantic annotation and two of its applications. It gives the linguists the possibility to work with corpora and syntactic and semantic frames in XML format without having computer skills. The system is OS and platform independent and could be used both online and offline. + 2016.clib-1.6 + atanasov-2016-syntags + + + Finding Good Answers in Online Forums: Community Question Answering for <fixed-case>B</fixed-case>ulgarian + TsvetomilaMihaylova + IvanKoychev + PreslavNakov + IvelinaNikolova + 54–63 + Community Question Answering (CQA) is a form of question answering that is getting increasingly popular as a research direction recently. Given a question posted in an online community forum and the thread of answers to it, a common formulation of the task is to rank automatically the answers, so that the good ones are ranked higher than the bad ones. Despite the vast research in CQA for English, very little attention has been paid to other languages. To bridge this gap, here we present our method for Community Question Answering in Bulgarian. We create annotated training and testing datasets for Bulgarian, and we further explore the applicability of machine translation for reusing English CQA data for building a Bulgarian system. The evaluation results show improvement over the baseline and can serve as a basis for further research. 
+ 2016.clib-1.7 + mihaylova-etal-2016-finding + + + Quotation Retrieval System for <fixed-case>B</fixed-case>ulgarian Media Content + SvetlaKoeva + IvelinaStoyanova + MartinYalamov + 64–73 + This paper presents a method for automatic retrieval and attribution of quotations from media texts in Bulgarian. It involves recognition of report verbs (including their analytical forms) and syntactic patterns introducing quotations, as well as source attribution of the quote by identification of personal names, descriptors, and anaphora. The method is implemented in a fully-functional online system which offers a live service processing media content and extracting quotations on a daily basis. The system collects and processes written news texts from six Bulgarian media websites. The results are presented in a structured way with description, as well as sorting and filtering functionalities which facilitate the monitoring and analysis of media content. The method has been applied to extract quotations from English texts as well and can be adapted to work with other languages, provided that the respective language specific resources are supplied. + 2016.clib-1.8 + koeva-etal-2016-quotation + + + Stress Patterns of Compounds and <fixed-case>MWE</fixed-case>s in <fixed-case>E</fixed-case>nglish and <fixed-case>B</fixed-case>ulgarian + BistraPopovska + RositsaDekova + 74–77 + The paper presents an ongoing research on the stress patterns of compounds and MWEs of the type ADJ+N and their corresponding free NPs in English and Bulgarian. The research focuses on the identification and the formal representation of the possible stress patterns of compounds and MWEs and free NPs. During our research so far, we have compiled a corpus of over 2000 compounds and MWEs, approx. 1000 for each language – English and Bulgarian. Our theoretical framework includes elements from different theories, i.e. the Generative Phonology Theory, the Metrical Theory, and the Theory of Primary accent first which all define the stress as a prosodic element. Our main goals are to specify the prosodic region where the stress is defined in English and Bulgarian MWEs and noun phrases and to define the main features of the stress in MWEs and free NPs in English and Bulgarian. The results of our research can serve for implementation into NLP modules for spoken language processing and generation. + 2016.clib-1.9 + popovska-dekova-2016-stress + + + Verbal Multiword Expressions in <fixed-case>C</fixed-case>roatian + KrešimirŠojat + MateaFilko + DašaFarkaš + 78–85 + The paper deals with verbal multiword expressions in Croatian. We focus on four types of verbal constructions: light verb constructions, i.e. constructions consisting of a light verb and a noun or prepositional phrase, complex predicate constructions, i.e. constructions consisting of a finite and infinitive verb, prepositional verb constructions, i.e. constructions consisting of a verb and a typical preposition, and, finally, verbal idioms, i.e. constructions with completely idiosyncratic meanings. All the constructions are annotated in the Universal Dependency treebank for Croatian. The identification of verbal multiword expressions is an important task in numerous NLP tasks. It is also important to define and delimitate this concept in linguistic theory. 
+ 2016.clib-1.10 + sojat-etal-2016-verbal + + + A Simple Approach to Unifying Ambiguously Encoded <fixed-case>K</fixed-case>urdish Characters + SardarJaf + 86–94 + In this study we outline a potential problem in the normalisation stage of processing texts that are based on a modified version of the Arabic alphabet. The main source of resources available for processing resource-scarce languages is raw text. We have identified an interesting challenge that must be addressed when normalising certain natural language texts. Many less-resourced languages, such as Kurdish, Farsi, Urdu, Pashtu, etc., use a modified version of the Arabic writing system. Many characters in harvested data from the Internet may have exactly the same form but encoded with different Unicode values (ambiguous characters). It is important to identify ambiguous characters during the normalisation stage of most text processing tasks. We will demonstrate cases related to ambiguous Kurdish and Farsi characters and propose a semi-automatic approach to identifying and unifying ambiguously encoded characters. + 2016.clib-1.11 + jaf-2016-simple + + + A Possible Solution to the Problem of Machine Translation of Verb Forms from <fixed-case>B</fixed-case>ulgarian to <fixed-case>E</fixed-case>nglish + TodorLazarov + 95–100 + The paper’s main subject is concerned with the problems related to machine translation of verb forms from Bulgarian to English. In separate sections of this article we discuss the problems related to differences between word formation in both languages and differences in the information that the verb forms grammaticalize. We also introduce the idea of implementing the statistical method of machine translation altogether with the rule-based method as a proposal for future research and the possible practical and theoretical outcomes. + 2016.clib-1.12 + lazarov-2016-possible + +
+
diff --git a/data/xml/2018.clib.xml b/data/xml/2018.clib.xml new file mode 100644 index 0000000000..d7eeaf62ea --- /dev/null +++ b/data/xml/2018.clib.xml @@ -0,0 +1,240 @@ + + + + + Proceedings of the Third International Conference on Computational Linguistics in Bulgaria (CLIB 2018) + Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences +
Sofia, Bulgaria
+ May + 2018 + 2018.clib-1 + clib + + + 222 + 2018.clib-1.0 + clib-2018-1 + + + With a little help from <fixed-case>NLP</fixed-case>: My Language Technology applications with impact on society + RuslanMitkov + 1–4 + The keynote speech presents the speaker’s vision that research should lead to the development of applications which benefit society. To support this, the speaker will present three original methodologies proposed by him which underpin applications jointly implemented with colleagues from across his research group. These Language Technology tools already have a substantial societal impact in the following areas: learning and assessment, translation and care for people with language disabilities. + 2018.clib-1.1 + mitkov-2018-little + + + <fixed-case>NLP</fixed-case>-based Assessment of Reading Efficiency in Early Grade Children + VitoPirrelli + 5–6 + Assessing reading skills is a laborious and time-consuming task, which requires monitoring a variety of interlocked abilities, ranging from accurate word rendering, reading fluency and lexical access, to linguistic comprehension, and interpretation, management and inference of complex events in working memory. No existing software, to our knowledge, is able to cover and integrate reading performance monitoring, instant feedback, personalised potentiation and intelligent decision support to teachers and speech therapists, assessment of response to intervention. NLP and ICT technologies can make such an ambitious platform an achievable target. + 2018.clib-1.2 + pirrelli-2018-nlp + + + Figurative language processing: A developmental and <fixed-case>NLP</fixed-case> Perspective + MilaVulchanova + ValentinVulchanov + 7–14 + It is now common to employ evidence from human behaviour (e.g., child development) for the creation of computational models of this behaviour with a variety of applications (e.g., in developmental robotics). In this paper we address research in the comprehension and processing of figurative (non-literal) language in highly verbal individuals with autism in comparison with age- and language level-matched neuro-typical individuals and discuss critically what factors might account for the observed problems. Based on this evidence we try to outline the strategies used by human language users in understanding non-literal/non-compositional expressions and proceed to identifying possible solutions for automated language systems in the domain of idiomatic expressions. + 2018.clib-1.3 + vulchanova-vulchanov-2018-figurative + + + Abstractive Text Summarization with Application to <fixed-case>B</fixed-case>ulgarian News Articles + NikolaTaushanov + IvanKoychev + PreslavNakov + 15–22 + With the development of the Internet, a huge amount of information is available every day. Therefore, text summarization has become critical part of our first access to the information. There are two major approaches for automatic text summarization: abstractive and extractive. In this work, we apply abstractive summarization algorithms on a corpus of Bulgarian news articles. In particular, we compare selected algorithms of both techniques and we show results which provide evidence that the selected state-of-the-art algorithms for abstractive text summarization perform better than the extractive ones for articles in Bulgarian. For the purpose of our experiments we collected a new dataset consisting of around 70,000 news articles and their topics. For research purposes we are also sharing the tools to easily collect and process such datasets. 
+ 2018.clib-1.4 + taushanov-etal-2018-abstractive + + + Towards Lexical Meaning Formal Representation by virtue of the <fixed-case>NL</fixed-case>-<fixed-case>DL</fixed-case> Definition Transformation Method + MariaGritz + 23–33 + The paper represents a part of an extensive study devoted to the issues of lexical meaning formal representation in OWL 2 DL notation. Both theoretical and methodological aspects of lexical meaning formalization within the framework of an ontology are observed in the paper. Model-theoretic semantics paradigm and Kripke model are considered to form a theoretical background for formalization of lexical meaning, whereas the NL-DL definition transformation method is investigated as a method designed to provide us with acceptable formal definitions in OWL 2 DL notation with natural language definitions given at the input. A brief critical study of the method has allowed to reveal particular problematic cases of the method application, which arise due to syntactic peculiarities of natural language definitions given at the input. + 2018.clib-1.5 + gritz-2018-towards + + + Narrow Productivity, Competition, and Blocking in Word Formation + JunyaMorita + 34–40 + The present study explores the productivity of word formation processes in English, focusing on word composition by suffixes such as -ize (e.g. transcendentalize), -(a)(t)ion (territorization), and -al (realizational). An optimal productivity measure for affixation is identified, which makes best use of hapax legomena in a large-scale corpus and attaches great importance to the base forms of an affix. This measure is then applied to the data collected from a large corpus to compute the productivity values of twelve kinds of affixes. The detailed investigation reveals that (i) the high productivity rate of an affix demonstrates a creative aspect of the affix, giving full support to the idea of “generative” morphology, (ii) productivity is gradient; very high, fairly high, and low productivity of affixes are recognizable, and (iii) this is necessarily reflected in determining the word form of a derivative (cf. territorization); competition is carried out to decide which affix is selected for a given base form (territorize) and the “losers” (-ment/-al) are blocked out. + 2018.clib-1.6 + morita-2018-narrow + + + Knowledge and Rule-Based Diacritic Restoration in <fixed-case>S</fixed-case>erbian + CvetanaKrstev + RankaStanković + DuškoVitas + 41–51 + In this paper we present a procedure for the restoration of diacritics in Serbian texts written using the degraded Latin alphabet. The procedure relies on the comprehensive lexical resources for Serbian: the morphological electronic dictionaries, the Corpus of Contemporary Serbian and local grammars. Dictionaries are used to identify possible candidates for the restoration, while the data obtained from SrpKor and local grammars assists in making a decision between several candidates in cases of ambiguity. The evaluation results reveal that, depending on the text, accuracy ranges from 95.03% to 99.36%, while the precision (average 98.93%) is always higher than the recall (average 94.94%). + 2018.clib-1.7 + krstev-etal-2018-knowledge + + + Perfect <fixed-case>B</fixed-case>ulgarian Hyphenation and/or How not to Stutter at End-of-line + AntonZinoviev + 52–61 + What is Perfect Bulgarian Hyphenation?
We know that it has to be based somehow on the syllables and on the morphology but considering that these two factors often contradict each other, how exactly are we going to combine them? And speaking about syllables, what are they and how are we going to determine them? Also, how are we going to find the morphemes in the words? Don’t we have to develop an electronic derivational dictionary of the Bulgarian language? Isn’t all this going to be forbiddingly difficult? + 2018.clib-1.8 + zinoviev-2018-perfect + + + <fixed-case>R</fixed-case>ussian Bridging Anaphora Corpus + AnnaRoitberg + DenisKhachko + 62–68 + In this paper, we present a bridging anaphora corpus for Russian, introduce a syntactic approach for bridging annotation and discuss the difference between the syntactic and semantic approaches. We also discuss some special aspects of bridging annotation for Russian and other languages where definite nominal groups are not marked so frequently as e.g. in Romance or Germanic languages. In the end we list the main cases of annotator disagreement. + 2018.clib-1.9 + roitberg-khachko-2018-russian + + + Aspectual and Temporal Characteristics of the Past Active Participles in <fixed-case>B</fixed-case>ulgarian – a Corpus-based Study + EkaterinaTarpomanova + 69–76 + The paper presents a corpus-based study of the past active participles in Bulgarian with respect of their aspectual and temporal characteristics. As this type of participles combine two morphological markers, a special attention is paid on their interaction in different tenses, moods and evidentials. The source of language material used for the study is the Bulgarian National Corpus. The paper is organized in terms of morphological oppositions, aspectual and temporal, analyzing the functions of the participles in compound verbal forms. + 2018.clib-1.10 + tarpomanova-2018-aspectual + + + Unmatched Feminitives in a Corpus of <fixed-case>B</fixed-case>ulgarian and <fixed-case>U</fixed-case>krainian Parallel Texts + OlenaSiruk + IvanDerzhanski + 77–84 + Feminitives are formed and used in all Slavic languages, but the productivity of their formation and the intensity of their use are not the same everywhere. They are often subject to various intralinguistic and extralinguistic restrictions. In this paper we present a study of feminitives based on a parallel Bulgarian–Ukrainian corpus, with a focus on those occasions on which a feminitive in one language corresponds to a masculine (rarely neuter) noun in the other. The experiment shows that Bulgarian uses feminitives with considerably greater regularity than Ukrainian does, and we discuss the semantic classes of nouns that fail to form feminitives most often and the effect of the source language in translated text and of the author’s and translator’s individual preferences. + 2018.clib-1.11 + siruk-derzhanski-2018-unmatched + + + The <fixed-case>B</fixed-case>ulgarian Summaries Corpus + ViktoriyaPetrova + 85–92 + This article aims to present the Bulgarian Summaries Corpus, its advantages, its purpose and why it is necessary. It explains the selection of texts and process of summarization and the tool used, in addition of a quick overview of the current situation in Bulgaria. The paper also presents a general outline of the market needs, the use of this kind of tools and a short list of examples of a variety of corpora around the world both in language and field.
+ 2018.clib-1.12 + petrova-2018-bulgarian + + + Ontologies for Natural Language Processing: Case of <fixed-case>R</fixed-case>ussian + NataliaLoukachevitch + BorisDobrov + 93–103 + The paper describes the RuThes family of Russian thesauri intended for natural language processing and information retrieval applications. RuThes-like thesauri include, besides RuThes, Sociopolitical thesaurus, Security Thesaurus, and Ontology on Natural Sciences and Technologies. The RuThes format is based on three approaches for developing computer resources: Princeton WordNet, information-retrieval thesauri, and formal ontologies. The published version of RuThes thesaurus (RuThes-lite 2.0) became a basis for semi-automatic generation of RuWordNet, WordNet-like thesaurus for Russian. Currently researchers can use both RuThes-lite or RuWordNet and compare them in applications. Other RuThes-like resources are being prepared to publication. + 2018.clib-1.13 + loukachevitch-dobrov-2018-ontologies + + + Resource-based <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et Augmentation and Enrichment + RankaStanković + MiljanaMladenović + IvanObradović + MarkoVitas + CvetanaKrstev + 104–114 + In this paper we present an approach to support production of synsets for Serbian WordNet (SerWN) by adjusting Princeton WordNet (PWN) synsets using several bilingual English-Serbian resources. PWN synset definitions were automatically translated and post-edited, if needed, while candidate literals for Serbian synsets were obtained automatically from a list of translational equivalents compiled from bilingual resources. Preliminary results obtained from a set of 1248 selected PWN synsets show that the produced Serbian synsets contain 4024 literals, out of which 2278 were offered by the system we present in this paper, whereas experts added the remaining 1746. Approximately one half of synset definitions obtained automatically were accepted with no or minor corrections. These first results are encouraging, since the efficiency of synset production for SerWN was increased. There is also space for further improvement of this approach to wordnet enrichment. + 2018.clib-1.14 + stankovic-etal-2018-resource + + + Classifying Verbs in <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et by Harnessing Semantic Resources + SvetlozaraLeseva + IvelinaStoyanova + MariaTodorova + 115–125 + This paper presents the principles and procedures involved in the construction of a classification of verbs using information from 3 semantic resources – WordNet, FrameNet and VerbNet. We adopt the FrameNet frames as the primary categories of the proposed classification and transfer them to WordNet synsets. The hierarchical relationships between the categories are projected both from the hypernymy relation in WordNet and from the hierarchy of some of the frame-to-frame relations in FrameNet. The semantic classes and their hierarchical organisation in WordNet are thus made explicit and allow for linguistic generalisations on the inheritance of semantic features and structures. We then select the beginners of the separate hierarchies and assign classification categories recursively to their hyponyms using a battery of procedures based on generalisations over the semantic primes and the hierarchical structure of WordNet and FrameNet and correspondences between VerbNet superclasses and FrameNet frames. The so-obtained suggestions are ranked according to probability.
As a result, 13,465 out of 14,206 verb synsets are accommodated in the classification hierarchy at least through a general category, which provides a point of departure towards further refinement of categories. The resulting system of classification categories is initially derived from the WordNet hierarchy and is further validated against the hierarchy of frames within FrameNet. A set of procedures is established to address inconsistencies and heterogeneity of categories. The classification is subject to ongoing extensive manual verification, essential for ensuring the quality of the resource. + 2018.clib-1.15 + leseva-etal-2018-classifying + + + A Pilot Study for Enriching the <fixed-case>R</fixed-case>omanian <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et with Medical Terms + MariaMitrofan + VerginicaBarbu Mititelu + GrigorinaMitrofan + 126–134 + This paper presents the preliminary investigations in the process of integrating a specialized vocabulary, namely medical terminology, into the Romanian wordnet. We focus here on four classes from this vocabulary: anatomy (or body parts), disorders, medical procedures and chemicals. In this pilot study we selected two large concepts from each class and created the Romanian terminological (sub)trees for each of them, starting from a medical thesaurus (SNOMED CT) and translating the terms, process which raised various challenges, all of them asking for the expertise of a specialist in the health care domain. The integration of these (sub)trees in the Romanian wordnet also required careful decision making, given the structural differences between a wordnet and a terminological thesaurus. They are presented and discussed herein. + 2018.clib-1.16 + mitrofan-etal-2018-pilot + + + Factors and Features Determining the Inheritance of Semantic Primes between Verbs and Nouns within <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>et + IvelinaStoyanova + 135–145 + The paper outlines the mechanisms of inheriting semantic content between verbs and nouns as a result of derivational relations. The main factors determining the inheritance are: (1) the semantic class of the verb as represented by the noun; (2) the subcategorisation frame and argument structure of the verb predicate; (3) the derivational relation between the verb and the noun, as well as the resulting semantic relation made explicit through the derivation; (4) hierarchical relations within WordNet. The paper explores three types of verb-noun prime inheritance relations: (a) universal – not depending on the argument structure, which are eventive or circumstantial; (b) general – specific to classes of verbs, for example agentive or non-agentive; (c) verb-specific – depending on the specific subcategorisation frame of the verb as presented in VerbNet and/or FrameNet. The paper presents a possibility for extended coverage of semantic relations based on information about the argument structure of verbs. Further, the work focuses on the regularities in the way in which derivationally related nouns inherit semantic characteristics of the predicate. These regularities can be applied for the purposes of predicting derivationally and semantically related synsets within WordNet, as well as for the creation of language specific synsets, for consistency checks and verification.
+ 2018.clib-1.17 + stoyanova-2018-factors + + + Online Editor for <fixed-case>W</fixed-case>ord<fixed-case>N</fixed-case>ets + BorislavRizov + TsvetanaDimitrova + 146–152 + The paper presents an online editor for lexical-semantic databases with relational structure similar to the structure of WordNet – Hydra for Web. It supports functionalities for editing of relational data (including query, creation, change, and linking of relational objects), simultaneous access of multiple user profiles, parallel data visualization and editing of the data on top of single- and parallel mode visualization of the language data. + 2018.clib-1.18 + rizov-dimitrova-2018-online + + + The Effect of Unobserved Word-Context Co-occurrences on a <fixed-case>V</fixed-case>ector<fixed-case>M</fixed-case>ixture Approach for Compositional Distributional Semantics + AmirBakarov + 153–161 + Swivel (Submatrix-WIse Vector Embedding Learner) is a distributional semantic model based on counting point-wise mutual information values, capable of capturing word-context co-occurrences in the PMI matrix that were not noted in the training corpus. This model outperforms mainstream word embedding training algorithms such as Continuous Bag-of-Words, GloVe and Skip-Gram in word similarity and word analogy tasks. But the properness of these intrinsic tasks could be questioned, and it is unclear if the ability to count unobservable word-context co-occurrences could also be helpful for downstream tasks. In this work we propose a comparison of Word2Vec and Swivel for two downstream tasks based on natural language sentence matching: the paraphrase detection task and the textual entailment task. As a result, we reveal that Swivel outperforms Word2Vec in both cases, but the difference is minuscule. We can conclude, that the ability to learn embeddings for rarely co-occurring words is not so crucial for downstream tasks. + 2018.clib-1.19 + bakarov-2018-effect + + + Introducing Computational Linguistics and <fixed-case>NLP</fixed-case> to High School Students + RositsaDekova + AdelinaRadeva + 162–168 + The paper addresses a possible way of introducing core concepts of Computational Linguistics through problems given at the linguistic contests organized for high school students in Bulgaria and abroad. Following a brief presentation of the foundation and the underlying objective of these contests, we outline some of the types of problems as reflecting the different levels of language processing and the diversity of approaches and tasks to be solved. By presenting the variety of problems given so far through the years, we would like to attract the attention of the academic community to this captivating method through which high school students might be acquainted with the challenges and the main goals of Computational Linguistics (CL) and Natural Language Processing (NLP). + 2018.clib-1.20 + dekova-radeva-2018-introducing + + + Linguistic Problems on Number Names + IvanDerzhanski + MilenaVeneva + 169–176 + This paper presents a contrastive investigation of linguistic problems based on number names in different languages and intended for secondary-school students. We examine the eight problems of this type that have been assigned at the International Linguistics Olympiad throughout the years and compare the phenomena in the number systems featured there with those of the working languages of the Olympiad and other languages known to be familiar to the participants. 
On the basis of a statistical analysis of the results achieved by the contestants we draw conclusions regarding the ways in which the difficulty of a problem depends on its structure and the kinds of linguistic phenomena featured in it. + 2018.clib-1.21 + derzhanski-veneva-2018-linguistic + + + Parallel Web Display of Transcribed Spoken <fixed-case>B</fixed-case>ulgarian with its Normalised Version and an Indexed List of Lemmas + MarinaDzhonova + Kjetil RøaHauge + YovkaTisheva + 177–184 + We present and discuss problems in creating a lemmatised index to transcriptions of Bulgarian speech, including the prerequisites for such an index, and why we consider an index preferable to a search engine for this particular kind of text. + 2018.clib-1.22 + dzhonova-etal-2018-parallel + + + Integrating Crowdsourcing in Language Learning + GeorgiDzhumayov + 185–192 + This article aims to illustrate the use of crowdsourcing in an educational context. The practical part illustrates and provides the results of an online test conducted among 12th grade high school students from Bulgaria in order to gain new knowledge, find out common characteristics among the tenses and revise for their upcoming exams. They along with some interesting and inspiring teaching ideas could be used in an educational environment to provide easier, quicker and more interactive acquisition of a language. The experiment has been conducted by means of Google forms and sets the beginning of the establishment of an annotated corpus of right and wrong uses of the Bulgarian and English tenses too. + 2018.clib-1.23 + dzhumayov-2018-integrating + + + <fixed-case>B</fixed-case>ulgarian–<fixed-case>E</fixed-case>nglish Parallel Corpus for the Purposes of Creating Statistical Translation Model of the Verb Forms. General Conception, Structure, Resources and Annotation + TodorLazarov + 193–202 + This paper describes the process of creating a Bulgarian-English parallel corpus for the purposes of constructing a statistical translation model for verb forms in both languages. We briefly introduce the scientific problem behind the corpus, its main purpose, general conception, linguistic resources and annotation conception. In more details we describe the collection of language data for the purposes of creating the corpus, the preparatory processing of the gathered data, the annotation rules based on the characteristics of the gathered data and the chosen software. We discuss the current work on the training model and the future work on this linguistic resource and the aims of the scientific project. + 2018.clib-1.24 + lazarov-2018-bulgarian + + + Fingerprints in <fixed-case>SMS</fixed-case> messages: Automatic Recognition of a Short Message Sender Using Gradient Boosting + BranislavaŠandrih + 203–210 + This paper considers the following question: Is it possible to tell who is the short message sender just by analyzing a typing style of the sender, and not the meaning of the content itself? If possible, how reliable would the judgment be? Are we leaving some kind of “fingerprint” when we text, and can we tell something about others based just on their typing style? For this purpose, a corpus of ∼ 5,500 SMS messages was gathered from one person’s cell phone and two gradient boost classifiers were built: first one is trying to distinguish whether the message was sent by this exact person (cell phone owner) or by someone else; second one was trained to distinguish between messages sent by some public service (e.g. parking service, bank reports etc.) 
and messages sent by humans. The performance of the classifiers was evaluated in the 5-fold cross-validation setting, resulting in 73.6% and 99.3% overall accuracy for the first and the second classifier, respectively. + 2018.clib-1.25 + sandrih-2018-fingerprints + +
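The two-classifier setup described in the Šandrih entry above is easy to sketch in outline. The following is a minimal, hypothetical reconstruction (character n-gram features, scikit-learn's GradientBoostingClassifier, 5-fold cross-validation), not the author's released code; the ~5,500-message corpus is private, so the toy data below is invented:

# Minimal sketch of the SMS "fingerprint" classifiers described above.
# Toy data only; the paper's corpus is not public.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

def sender_classifier():
    # Character n-grams capture typing style (punctuation, abbreviations,
    # emoticons) rather than topic, which is the paper's premise.
    return make_pipeline(
        TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4)),
        GradientBoostingClassifier(),
    )

texts = ["c u l8r :)", "ok np", "where r u??", "haha yea", "brt!!",
         "Your parking expires at 18:00.", "Balance: 42.00 EUR.",
         "Code: 8841.", "Invoice 113 is due.", "Appointment at 09:30."]
labels = [1] * 5 + [0] * 5  # 1 = human sender, 0 = automated service

scores = cross_val_score(sender_classifier(), texts, labels, cv=5)
print(scores.mean())  # the paper reports 73.6% / 99.3% on its two tasks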
+
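For readers new to the PMI matrices that the Bakarov (Swivel) entry above builds on, here is a toy, self-contained illustration of point-wise mutual information computed from invented co-occurrence counts (none of these numbers come from the paper):

# PMI = log( p(word, ctx) / (p(word) * p(ctx)) ), from toy counts.
import math

cooc = {("cat", "purrs"): 8, ("cat", "the"): 40, ("stock", "purrs"): 0}
word_count = {"cat": 120, "stock": 90}
ctx_count = {"purrs": 10, "the": 3000}
total = 10_000  # total word-context pairs in the toy corpus

def pmi(word: str, ctx: str) -> float:
    joint = cooc.get((word, ctx), 0)
    if joint == 0:
        # Unobserved pairs make PMI undefined (log 0); Swivel's contribution
        # is a loss term that estimates these cells instead of dropping them,
        # which is the "unobserved co-occurrences" ability discussed above.
        return float("-inf")
    p_joint = joint / total
    p_word = word_count[word] / total
    p_ctx = ctx_count[ctx] / total
    return math.log(p_joint / (p_word * p_ctx))

print(pmi("cat", "purrs"))    # large positive: informative co-occurrence
print(pmi("cat", "the"))      # near zero: co-occurs about as often as chance
print(pmi("stock", "purrs"))  # -inf: never observed in the toy corpus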
diff --git a/data/xml/2020.clib.xml b/data/xml/2020.clib.xml index a44484ca54..41ab77f7fd 100644 --- a/data/xml/2020.clib.xml +++ b/data/xml/2020.clib.xml @@ -2,7 +2,7 @@ - Proceedings of the 4th International Conference on Computational Linguistics in Bulgaria (CLIB 2020) + Proceedings of the Fourth International Conference on Computational Linguistics in Bulgaria (CLIB 2020) Department of Computational Linguistics, IBL -- BAS
Sofia, Bulgaria
September diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml index 140dab2dce..c618222ef2 100644 --- a/data/xml/2020.findings.xml +++ b/data/xml/2020.findings.xml @@ -3257,6 +3257,7 @@ 2020.findings-emnlp.217 10.18653/v1/2020.findings-emnlp.217 qi-etal-2020-prophetnet + BookCorpus C4 CNN/Daily Mail diff --git a/data/xml/2020.trac.xml b/data/xml/2020.trac.xml index 64174f8ecd..72b608b50d 100644 --- a/data/xml/2020.trac.xml +++ b/data/xml/2020.trac.xml @@ -92,6 +92,7 @@ eng suryawanshi-etal-2020-multimodal bharathichezhiyan/Multimodal-Meme-Classification-Identifying-Offensive-Content-in-Image-and-Text + MultiOFF A Comparative Study of Different State-of-the-Art Hate Speech Detection Methods in <fixed-case>H</fixed-case>indi-<fixed-case>E</fixed-case>nglish Code-Mixed Data diff --git a/data/xml/2021.emnlp.xml b/data/xml/2021.emnlp.xml index 8e5dd99bfe..22591b3ca9 100644 --- a/data/xml/2021.emnlp.xml +++ b/data/xml/2021.emnlp.xml @@ -1963,6 +1963,7 @@ chi-etal-2021-mt6 10.18653/v1/2021.emnlp-main.125
diff --git a/data/xml/2022.clib.xml b/data/xml/2022.clib.xml index d616b576cf..0813f83de8 100644 --- a/data/xml/2022.clib.xml +++ b/data/xml/2022.clib.xml @@ -2,7 +2,7 @@ - Proceedings of the 5th International Conference on Computational Linguistics in Bulgaria (CLIB 2022) + Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022) Department of Computational Linguistics, IBL -- BAS
Sofia, Bulgaria
September diff --git a/data/xml/2022.findings.xml b/data/xml/2022.findings.xml index 06ca74ec66..1bba4a2d66 100644 --- a/data/xml/2022.findings.xml +++ b/data/xml/2022.findings.xml @@ -2728,7 +2728,7 @@ schroder-etal-2022-revisiting 10.18653/v1/2022.findings-acl.172
@@ -15222,11 +15063,11 @@
Bangkok, Thailand
August 2024 - 2024.acl-tutorials + 2024.acl-tutorials acl - 2024.acl-tutorials.0 + 2024.acl-tutorials.0 acl-2024-tutorials @@ -15237,8 +15078,8 @@ JixingLi Marie-FrancineMoens 1-2 - Computational linguistics (CL) has witnessed tremendous advancementsin recent years, with models such as large language models demonstratingexceptional performance in various natural language processing tasks. Theseadvancements highlight their potential to help understand brain languageprocessing, especially through the lens of brain encoding and decoding.Brain encoding involves the mapping of linguistic stimuli to brain activity,while brain decoding is the process of reconstructing linguistic stimulifrom observed brain activities. CL models that excel at capturing andmanipulating linguistic features are crucial for mapping linguistic stimulito brain activities and vice versa. Brain encoding and decoding have vastapplications, from enhancing human-computer interaction to developingassistive technologies for individuals with communication impairments. Thistutorial will focus on elucidating how computational linguistics canfacilitate brain encoding and decoding. We will delve into the principlesand practices of using computational linguistics methods for brain encodingand decoding. We will also discuss the challenges and future directions ofbrain encoding and decoding. Through this tutorial, we aim to provide acomprehensive and informative overview of the intersection betweencomputational linguistics and cognitive neuroscience, inspiring futureresearch in this exciting and rapidly evolving field. - 2024.acl-tutorials.1 + Computational linguistics (CL) has witnessed tremendous advancements in recent years, with models such as large language models demonstrating exceptional performance in various natural language processing tasks. These advancements highlight their potential to help understand brain language processing, especially through the lens of brain encoding and decoding. Brain encoding involves the mapping of linguistic stimuli to brain activity, while brain decoding is the process of reconstructing linguistic stimuli from observed brain activities. CL models that excel at capturing and manipulating linguistic features are crucial for mapping linguistic stimuli to brain activities and vice versa. Brain encoding and decoding have vast applications, from enhancing human-computer interaction to developing assistive technologies for individuals with communication impairments. This tutorial will focus on elucidating how computational linguistics can facilitate brain encoding and decoding. We will delve into the principles and practices of using computational linguistics methods for brain encoding and decoding. We will also discuss the challenges and future directions of brain encoding and decoding. Through this tutorial, we aim to provide a comprehensive and informative overview of the intersection between computational linguistics and cognitive neuroscience, inspiring future research in this exciting and rapidly evolving field. + 2024.acl-tutorials.1 sun-etal-2024-computational 10.18653/v1/2024.acl-tutorials.1 @@ -15249,8 +15090,8 @@ ClaireGardent WeiXu 3-4 - In this tutorial, we focus on text-to-text generation, a class ofnatural language generation (NLG) tasks, that takes a piece of text as inputand then generates a revision that is improved according to some specificcriteria (e.g., readability or linguistic styles), while largely retainingthe original meaning and the length of the text. 
This includes many usefulapplications, such as text simplification, paraphrase generation, styletransfer, etc. In contrast to text summarization and open-ended textcompletion (e.g., story), the text-to-text generation tasks we discuss inthis tutorial are more constrained in terms of semantic consistency andtargeted language styles. This level of control makes these tasks idealtestbeds for studying the ability of models to generate text that is bothsemantically adequate and stylistically appropriate. Moreover, these tasksare interesting from a technical standpoint, as they require complexcombinations of lexical and syntactical transformations, stylistic control,and adherence to factual knowledge, – all at once. With a special focus ontext simplification and revision, this tutorial aims to provide an overviewof the state-of-the-art natural language generation research from four majoraspects – Data, Models, Human-AI Collaboration, and Evaluation – and todiscuss and showcase a few significant and recent advances: (1) the use ofnon-retrogressive approaches; (2) the shift from fine-tuning to promptingwith large language models; (3) the development of new learnable metric andfine-grained human evaluation framework; (4) a growing body of studies anddatasets on non-English languages; (5) the rise of HCI+NLP+Accessibilityinterdisciplinary research to create real-world writing assistant systems. - 2024.acl-tutorials.2 + In this tutorial, we focus on text-to-text generation, a class of natural language generation (NLG) tasks, that takes a piece of text as input and then generates a revision that is improved according to some specific criteria (e.g., readability or linguistic styles), while largely retaining the original meaning and the length of the text. This includes many useful applications, such as text simplification, paraphrase generation, style transfer, etc. In contrast to text summarization and open-ended text completion (e.g., story), the text-to-text generation tasks we discuss in this tutorial are more constrained in terms of semantic consistency and targeted language styles. This level of control makes these tasks ideal testbeds for studying the ability of models to generate text that is both semantically adequate and stylistically appropriate. Moreover, these tasks are interesting from a technical standpoint, as they require complex combinations of lexical and syntactical transformations, stylistic control, and adherence to factual knowledge, – all at once. With a special focus on text simplification and revision, this tutorial aims to provide an overview of the state-of-the-art natural language generation research from four major aspects – Data, Models, Human-AI Collaboration, and Evaluation – and to discuss and showcase a few significant and recent advances: (1) the use of non-retrogressive approaches; (2) the shift from fine-tuning to prompting with large language models; (3) the development of new learnable metric and fine-grained human evaluation framework; (4) a growing body of studies and datasets on non-English languages; (5) the rise of HCI+NLP+Accessibility interdisciplinary research to create real-world writing assistant systems. + 2024.acl-tutorials.2 dou-etal-2024-automatic 10.18653/v1/2024.acl-tutorials.2 @@ -15260,19 +15101,19 @@ RyanCotterell AnejSvete 5-5 - Language models (LMs) are currently at the forefront of NLP researchdue to their remarkable versatility across diverse tasks. 
However, a largegap exists between their observed capabilities and the explanations proposedby established formal machinery. To motivate a better theoreticalcharacterization of LMs’ abilities and limitations, this tutorial aims toprovide a comprehensive introduction to a specific framework for formalanalysis of modern LMs using tools from formal language theory (FLT). Wepresent how tools from FLT can be useful in understanding the inner workingsand predicting the capabilities of modern neural LM architectures. We willcover recent results using FLT to make precise and practically relevantstatements about LMs based on recurrent neural networks and transformers byrelating them to formal devices such as finite-state automata, Turingmachines, and analog circuits. Altogether, the results covered in thistutorial will allow us to make precise statements and explanations about theobserved as well as predicted behaviors of LMs, as well as providetheoretically motivated suggestions on the aspects of the architectures thatcould be improved. - 2024.acl-tutorials.3 + Language models (LMs) are currently at the forefront of NLP research due to their remarkable versatility across diverse tasks. However, a large gap exists between their observed capabilities and the explanations proposed by established formal machinery. To motivate a better theoretical characterization of LMs’ abilities and limitations, this tutorial aims to provide a comprehensive introduction to a specific framework for formal analysis of modern LMs using tools from formal language theory (FLT). We present how tools from FLT can be useful in understanding the inner workings and predicting the capabilities of modern neural LM architectures. We will cover recent results using FLT to make precise and practically relevant statements about LMs based on recurrent neural networks and transformers by relating them to formal devices such as finite-state automata, Turing machines, and analog circuits. Altogether, the results covered in this tutorial will allow us to make precise statements and explanations about the observed as well as predicted behaviors of LMs, as well as provide theoretically motivated suggestions on the aspects of the architectures that could be improved. + 2024.acl-tutorials.3 butoi-etal-2024-computational 10.18653/v1/2024.acl-tutorials.3 - Presentation Matters: How to Communicate Science in the <fixed-case>NLP</fixed-case> Venues and in the Wild? + Presentation Matters: How to Communicate Science in the <fixed-case>NLP</fixed-case> Venues and in the Wild SarvnazKarimi CecileParis GholamrezaHaffari 6-7 Each year a large number of early career researchers join the NLP/Computational Linguistics community, with most starting by presenting their research in the *ACL conferences and workshops. While writing a paper that has made it to these venues is one important step, what comes with communicating the outcome is equally important and sets the path to impact of a research outcome. In addition, not all PhD candidates get the chance of being trained for their presentation skills. Research methods courses are not all of the same quality and may not cover scientific communications, and certainly not all are tailored to the NLP community. We are proposing an introductory tutorial that covers a range of different communication skills, including writing, oral presentation (posters and demos), and social media presence. 
This is to fill in the gap for the researchers who may not have access to research methods courses or other mentors who could help them acquire such skills. The interactive nature of such a tutorial would allow attendees to ask questions and clarifications which would not be possible from reading materials alone. - 2024.acl-tutorials.4 + 2024.acl-tutorials.4 karimi-etal-2024-presentation 10.18653/v1/2024.acl-tutorials.4 @@ -15280,25 +15121,25 @@ Vulnerabilities of Large Language Models to Adversarial Attacks YuFu ErfanShayegan - Md.Mamun Al Abdullah + Md. Mamun AlAbdullah PedramZaree NaelAbu-Ghazaleh YueDong 8-9 This tutorial serves as a comprehensive guide on the vulnerabilities of Large Language Models (LLMs) to adversarial attacks, an interdisciplinary field that blends perspectives from Natural Language Processing (NLP) and Cybersecurity. As LLMs become more complex and integrated into various systems, understanding their security attributes is crucial. However, current research indicates that even safety-aligned models are not impervious to adversarial attacks that can result in incorrect or harmful outputs. The tutorial first lays the foundation by explaining safety-aligned LLMs and concepts in cybersecurity. It then categorizes existing research based on different types of learning architectures and attack methods. We highlight the existing vulnerabilities of unimodal LLMs, multi-modal LLMs, and systems that integrate LLMs, focusing on adversarial attacks designed to exploit weaknesses and mislead AI systems. Finally, the tutorial delves into the potential causes of these vulnerabilities and discusses potential defense mechanisms. - 2024.acl-tutorials.5 + 2024.acl-tutorials.5 fu-etal-2024-vulnerabilities 10.18653/v1/2024.acl-tutorials.5 - Detecting Machine-Generated Text: Techniques and Challenges - LiGao - WenhanXiong - TaewooKim + Watermarking for Large Language Models + XuandongZhao + Yu-XiangWang + LeiLi 10-11 - As AI-generated text increasingly resembles human-written content, the ability to detect machine-generated text becomes crucial in many applications. This tutorial aims to provide a comprehensive overview of text detection techniques, focusing on machine-generated text and deepfakes. We will discuss various methods for distinguishing between human-written and machine-generated text, including statistical methods, neural network-based techniques, and hybrid approaches. The tutorial will also cover the challenges in the detection process, such as dealing with evolving models and maintaining robustness against adversarial attacks. By the end of the session, attendees will have a solid understanding of current techniques and future directions in the field of text detection. - 2024.acl-tutorials.6 - gao-etal-2024-detecting + As AI-generated text increasingly resembles human-written content, the ability to detect machine-generated text becomes crucial in both the computational linguistics and machine learning communities. In this tutorial, we aim to provide an in-depth exploration of text watermarking, a subfield of linguistic steganography with the goal of embedding a hidden message (the watermark) within a text passage. We will introduce the fundamentals of text watermarking, discuss the main challenges in identifying AI-generated text, and delve into the current watermarking methods, assessing their strengths and weaknesses. Moreover, we will explore other possible applications of text watermarking and discuss future directions for this field. 
Each section will be supplemented with examples and key takeaways. + 2024.acl-tutorials.6 + zhao-etal-2024-watermarking 10.18653/v1/2024.acl-tutorials.6
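The watermarking tutorial entry above does not commit to a single scheme, so as a concrete illustration here is a toy "green list" watermark in the spirit of the schemes such tutorials survey (this is not code from the tutorial; the vocabulary, GAMMA, and all names are invented for the example):

# At each step, the previous token seeds a RNG that marks a fraction GAMMA
# of the vocabulary "green"; a watermarked generator biases sampling toward
# green tokens, and a detector counts how many green tokens a text contains.
import hashlib
import random

VOCAB = ["the", "a", "cat", "dog", "sat", "ran", "here", "there"]
GAMMA = 0.5  # fraction of the vocabulary marked green at each step

def green_list(prev_token: str) -> set:
    seed = int(hashlib.sha256(prev_token.encode()).hexdigest(), 16)
    rng = random.Random(seed)
    return set(rng.sample(VOCAB, int(GAMMA * len(VOCAB))))

def count_green(tokens: list) -> int:
    return sum(tok in green_list(prev) for prev, tok in zip(tokens, tokens[1:]))

# Detection asks whether the green fraction is improbably high for
# unwatermarked text (a binomial test in the published schemes).
text = ["the", "cat", "sat", "here"]
print(count_green(text), "of", len(text) - 1, "transitions are green")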
diff --git a/data/xml/2024.clib.xml b/data/xml/2024.clib.xml new file mode 100644 index 0000000000..ec3ec4e743 --- /dev/null +++ b/data/xml/2024.clib.xml @@ -0,0 +1,355 @@ + + + + + Proceedings of the Sixth International Conference on Computational Linguistics in Bulgaria (CLIB 2024) + Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences +
Sofia, Bulgaria
+ September + 2024 + 2024.clib-1 + clib + + + 344 + 2024.clib-1.0 + clib-2024-1 + + A Cross-model Study on Learning <fixed-case>R</fixed-case>omanian Parts of Speech with Transformer Models + RaduIon + VerginicaBarbu Mititelu + VasilePăiş + ElenaIrimia + ValentinBadea + 6–13 + This paper will attempt to determine experimentally if POS tagging of unseen words produces comparable performance, in terms of accuracy, as for words that were rarely seen in the training set (i.e. frequency less than 5), or more frequently seen (i.e. frequency greater than 10). To compare accuracies objectively, we will use the odds ratio statistic and its confidence interval testing to show that odds of being correct on unseen words are close to odds of being correct on rarely seen words. For the training of the POS taggers, we use different Romanian BERT models that are freely available on HuggingFace. + 2024.clib-1.1 + ion-etal-2024-cross + + What do <fixed-case>BERT</fixed-case> Word Embeddings Learn about the <fixed-case>F</fixed-case>rench Language? + EkaterinaGoliakova + DavidLanglois + 14–32 + Pre-trained word embeddings (for example, BERT-like) have been successfully used in a variety of downstream tasks. However, do all embeddings, obtained from the models of the same architecture, encode information in the same way? Does the size of the model correlate to the quality of the information encoding? In this paper, we will attempt to dissect the dimensions of several BERT-like models that were trained on the French language to find where grammatical information (gender, plurality, part of speech) and semantic features might be encoded. In addition to this, we propose a framework for comparing the quality of encoding in different models. + 2024.clib-1.2 + goliakova-langlois-2024-bert + + Whisper–<fixed-case>TAD</fixed-case>: A General Model for Transcription, Alignment and Diarization of Speech + CamilleLavigne + AlexStasica + 33–38 + Currently, there is a lack of a straightforward implementation of diarization-augmented speech transcription (DAST), i.e. implementation of transcription, diarization and alignment to the audio within one model. These tasks typically require distinct models, necessitating stacking them together for complete processing. In this study, we advocate for leveraging the advanced capabilities of the Whisper models, which already excel in automatic transcription and partial alignment. Our approach involves fine-tuning the model’s parameters on both transcription and diarization tasks in a SOT-FIFO (Serialized Output Training-First In First Out) manner. This comprehensive framework facilitates the creation of orthographic transcriptions, identification of speakers, and precise alignment, thus enhancing the efficiency of audio processing workflows. While our work represents an initial step towards a unified transcription and diarization framework, the development of such a model demands substantial high-quality data augmentation and computational resources beyond our current scope. Consequently, our focus is narrowed to the English language. Despite these limitations, our method demonstrates promising performance in both transcription and diarization tasks. Comparative analysis between pre-trained models and fine-tuned TAD (Transcription, Alignment, Diarization) versions suggests that incorporating diarization into a Whisper model doesn’t compromise transcription accuracy.
Our findings hint that deploying our TAD framework on the largest Whisper model could potentially yield state-of-the-art performance across all mentioned tasks. + 2024.clib-1.3 + lavigne-stasica-2024-whisper + + + Contemporary <fixed-case>LLM</fixed-case>s and Literary Abridgement: An Analytical Inquiry + IglikaNikolova-Stoupak + GaëlLejeune + EvaSchaeffer-Lacroix + 39–57 + Within the framework of this study, several contemporary Large Language Models (ChatGPT, Gemini Pro, Mistral-Instruct and BgGPT) are evaluated in relation to their ability to generate abridged versions of literary texts. The analysis is based on ’The Ugly Duckling’ by H. C. Andersen as translated into English, French and Bulgarian. The different scenarios of abridgement experimented with include zero-shot, one-shot, division into chunks and crosslingual (including chain-of-thought) abridgement. The resulting texts are evaluated both automatically and via human evaluation. The automatic analysis includes ROUGE and BERTScore as well as the ratios of a selection of readability-related textual features (e.g. number of words, type-to-token ratio) as pertaining to the original versus automatically abridged texts. Professionally composed abridged versions are regarded as gold standard. Following the automatic analysis, six selected best candidate texts per language are then evaluated by volunteers with university education in terms of textual characteristics of a more qualitative nature, such as coherence, consistency and aesthetic appeal. + 2024.clib-1.4 + nikolova-stoupak-etal-2024-contemporary + + + Advancing Sentiment Analysis in <fixed-case>S</fixed-case>erbian Literature: A Zero and Few–Shot Learning Approach Using the Mistral Model + Milica IkonićNešić + SašaPetalinkar + MihailoŠkorić + RankaStanković + BiljanaRujević + 58–70 + This study presents the Sentiment Analysis of the Serbian old novels from the 1840-1920 period, employing the Mistral Large Language Model (LLM) to pioneer zero and few-shot learning techniques. The main approach innovates by devising research prompts that include guidance text for zero-shot classification and examples for few-shot learning, enabling the LLM to classify sentiments into positive, negative, or objective categories. This methodology aims to streamline sentiment analysis by limiting responses, thereby enhancing classification precision. Python, along with the Hugging Face Transformers and LangChain libraries, serves as our technological backbone, facilitating the creation and refinement of research prompts tailored for sentence-level sentiment analysis. The results of sentiment analysis in both scenarios, zero-shot and few-shot, have indicated that the zero-shot approach outperforms, achieving an accuracy of 68.2%. + 2024.clib-1.5 + nesic-etal-2024-advancing + + + Generating Phonetic Embeddings for <fixed-case>B</fixed-case>ulgarian Words with Neural Networks + LyuboslavKarev + IvanKoychev + 71–79 + Word embeddings can be considered the cornerstone of modern natural language processing. They are used in many NLP tasks and allow us to create models that can understand the meaning of words. Most word embeddings model the semantics of the words. In this paper, we create phoneme-based word embeddings, which model how a word sounds. This is accomplished by training a neural network that can automatically generate transcriptions of Bulgarian words. We used the Jaccard index and direct comparison metrics to measure the performance of neural networks. 
The models perform nearly perfectly with the task of generating transcriptions. The model’s word embeddings offer versatility across various applications, with its application in automatic paronym detection being particularly notable, as well as the task of detecting the language of origin of a Bulgarian word. The performance of this paronym detection is measured with the standard classifier metrics - accuracy, precision, recall, and F1. + 2024.clib-1.6 + karev-koychev-2024-generating + + <fixed-case>U</fixed-case>niversal <fixed-case>D</fixed-case>ependencies Treebank for Standard <fixed-case>A</fixed-case>lbanian: A New Approach + NeldaKote + RozanaRushiti + AnilaÇepani + AlbaHaveriku + EvisTrandafili + Elinda KajoMeçe + Elsa SkënderiRakipllari + LinditaXhanari + AlbanaDeda + 80–89 + In this paper, we present a Universal Dependencies (UD) treebank for the Standard Albanian Language (SAL), annotated by expert linguists supported by information technology professionals. The annotated treebank consists of 24,537 tokens (1,400 sentences) and includes annotation for syntactic dependencies, part-of-speech tags, morphological features, and lemmas. This treebank represents the largest UD treebank available for SAL. In order to overcome annotation challenges in SAL within the UD framework, we delicately balanced the preservation of the richness of SAL grammar while adapting the UD tagset and addressing unique language-specific features for a unified annotation. We discuss the criteria followed to select the sentences included in the treebank and address the most significant linguistic considerations when adapting the UD framework to conform to the grammar of the SAL. Our efforts contribute to the advancement of linguistic analyses and Natural Language Processing (NLP) in the SAL. The treebank will be made available online under an open license so as to provide the possibility for further developments of NLP tools based on the Artificial Intelligence (AI) models for the Albanian language. + 2024.clib-1.7 + kote-etal-2024-universal + + Function Multiword Expressions Annotated with Discourse Relations in the <fixed-case>R</fixed-case>omanian Reference Treebank + VerginicaBarbu Mititelu + TudorVoicu + 90–97 + For the Romanian Reference Treebank, a general language corpus, covering several genres and annotated according to the principles of Universal Dependencies, we present here the annotation of some function words, namely multiword conjunctions, with discourse relations from the Penn Discourse Treebank version 3.0 inventory of such relations. The annotation process was manual, with two annotators for each occurrence of the conjunctions. Lexical-semantic relations of the types synonymy and polysemy can be established between the senses of such conjunctions. The discourse relations are added to the CoNLL-U file in which the treebank is represented. + 2024.clib-1.8 + barbu-mititelu-voicu-2024-function + + Dependency Parser for <fixed-case>B</fixed-case>ulgarian + AtanasAtanasov + 98–105 + This paper delves into the implementation of a Biaffine Attention Model, a sophisticated neural network architecture employed for dependency parsing tasks. Proposed by Dozat and Manning, this model is applied to Bulgarian language processing. The model’s training and evaluation are conducted using the Bulgarian Universal Dependencies dataset.
The paper offers a comprehensive explanation of the model’s architecture and the data preparation process, aiming to demonstrate that for highly inflected languages, the inclusion of two additional input layers - lemmas and language-specific morphological information - is beneficial. The results of the experiments are subsequently presented and discussed. The paper concludes with a reflection on the model’s performance and suggestions for potential future work. + 2024.clib-1.9 + atanasov-2024-dependency + + + Towards a <fixed-case>R</fixed-case>omanian Phrasal Academic Lexicon + MadalinaChitez + Ana-MariaBucur + AndreeaDinca + RoxanaRogobete + 106–112 + The lack of NLP based research studies on academic writing in Romania results in an unbalanced development of automatic support tools in Romanian compared to other languages, such as English. For this study, we use Romanian subsets of two bilingual academic writing corpora: the ROGER corpus, consisting of university student papers, and the EXPRES corpus, composed of expert research articles. Working with the Romanian Academic Word List / RoAWL, we present two phrase extraction phases: (i) use Ro-AWL words as node words to extract collocations according to the thresholds of statistical measures and (ii) classify extracted phrases into general versus domain-specific multi-word units. We show how manual rhetorical function annotation of resulting phrases can be combined with automatic function detection. The comparison between academic phrases in ROGER and EXPRES validates the final phrase list. The Romanian phrasal academic lexicon (ROPAL), similar to the Oxford Phrasal Academic Lexicon (OPAL), is a written academic phrase lexicon for Romanian language made available for academic use and further research or applications. + 2024.clib-1.10 + chitez-etal-2024-towards-romanian + + + Classifying Multi–Word Expressions in the <fixed-case>L</fixed-case>atvian Monolingual Electronic Dictionary Tēzaurs.lv + LauraRituma + GuntaNešpore-Bērzkalne + AguteKlints + IlzeLokmane + MadaraStāde + PēterisPaikens + 113–118 + The electronic dictionary Tēzaurs.lv contains more than 400,000 entries from which 73,000 entries are multi-word expressions (MWEs). Over the past two years, there has been an ongoing division of these MWEs into subgroups (proper names, multi-word terms, taxa, phraseological units, collocations). The article describes the classification of MWEs, focusing on phraseological units (approximately 7,250 entries), as well as on borderline cases of phraseological unit types (phrasemes and idioms) and different MWE groups in general. The division of phraseological units depends on semantic divisibility and figurativeness. In a phraseme, at least one of the constituents retains its literal sense, whereas the meaning of an idiom is not dependent on the literal sense of any of its constituents. As a result, 65919 entries of MWE have been manually classified, and now this information of MWE type is available for the users of the electronic dictionary Tēzaurs.lv. + 2024.clib-1.11 + rituma-etal-2024-classifying + + + Complex Word Identification for <fixed-case>I</fixed-case>talian Language: A Dictionary–based Approach + LauraOcchipinti + 119–129 + Assessing word complexity in Italian poses significant challenges, particularly due to the absence of a standardized dataset. This study introduces the first automatic model designed to identify word complexity for native Italian speakers. 
A dictionary of simple and complex words was constructed, and various configurations of linguistic features were explored to find the best statistical classifier based on Random Forest algorithm. Considering the probabilities of a word to belong to a class, a comparison between the models’ predictions and human assessments derived from a dataset annotated for complexity perception was made. Finally, the degree of accord between the model predictions and the human inter-annotator agreement was analyzed using Spearman correlation. Our findings indicate that a model incorporating both linguistic features and word embeddings performed better than other simpler models, also showing a value of correlation with the human judgements similar to the inter-annotator agreement. This study demonstrates the feasibility of an automatic system for detecting complexity in the Italian language with good performances and comparable effectiveness to humans in this subjective task. + 2024.clib-1.12 + occhipinti-2024-complex + + + Verbal Multiword Expressions in the <fixed-case>C</fixed-case>roatian Verb Lexicon + IvanaBrač + MateaBirtić + 130–139 + The paper examines the complexities of encoding verbal multiword expressions in the Croatian verb lexicon. The lexicon incorporates a verb’s description at the syntactic, morphological, and semantic levels. This study explores the treatment of reflexive verbs, light verb constructions, and verbal idioms across several Croatian and Slavic language resources to find the best solution for the verb lexicon. It addresses the following research questions: 1. How should reflexive verbs, i.e., verbs with the reflexive marker se, be treated? Should they be considered as separate lemmas, sublemmas of non-reflexive counterparts, or as one of their senses? 2. What syntactic label and semantic role should be assigned to a predicative noun in light verb constructions? 3. Should verbal idioms be included, and, if so, at which level of a description? Our conclusion is that all reflexive verbs should be treated as separate lemmas since they are distinct lexemes that have undergone semantic and syntactic change. To differentiate between a semantically full verb and a light verb, we have introduced the label LV and decided not to assign a semantic role to a predicative noun. By including verbal idioms and their translation into English, non-native users can benefit from the lexicon. The aim is to enhance the verb lexicon for the more effective description and recognition of verbal multiword expressions. + 2024.clib-1.13 + brac-birtic-2024-verbal + + + Assessing Reading Literacy of <fixed-case>B</fixed-case>ulgarian Pupils with Finger–tracking + AlessandroLento + AndreaNadalini + MarcelloFerro + ClaudiaMarzi + VitoPirrelli + TsvetanaDimitrova + HristinaKukova + ValentinaStefanova + MariaTodorova + SvetlaKoeva + 140–149 + The paper reports on the first steps in developing a time-stamped multimodal dataset of reading data by Bulgarian children. Data are being collected, structured and analysed by means of ReadLet, an innovative infrastructure for multimodal language data collection that uses a tablet as a reader’s front-end. The overall goal of the project is to quantitatively analyse the reading skills of a sample of early Bulgarian readers collected over a two-year period, and compare them with the reading data of early readers of Italian, collected using the same protocol. 
We illustrate design issues of the experimental protocol, as well as the data acquisition process and the post-processing phase of data annotation/augmentation. To evaluate the potential and usefulness of the Bulgarian dataset for reading research, we present some preliminary statistical analyses of our recently collected data. They show robust convergence trends between Bulgarian and Italian early reading development stages. + 2024.clib-1.14 + lento-etal-2024-assessing + + + Educational Horizons: Mapping the Terrain of Artificial Intelligence Integration in <fixed-case>B</fixed-case>ulgarian Educational Settings + DenitzaKurshumova + 150–156 + The role of artificial intelligence in education (AIEd) has recently become a major topic of discussion and future planning. This article presents data from a large-scale survey involving 1463 Bulgarian educators in primary, secondary, and high schools. The results revealed that 70.30% of the teachers were familiar with or somewhat familiar with the existence of AI applications. Chatbots were the most popular among the surveyed teachers, with ChatGPT ranking as the most familiar. The teachers were almost equally split between those who reported use and those who declared nonuse of AI technology for instructional purposes. A significant association was found between the teachers’ familiarity with and use of AI technology and their age-related generational traits. The younger educators (up to 40 years of age) were associated with higher use of AI technology as a support tool for creating lesson plans, lesson content, tests, and exams. The outlined tendencies can be used to inform policy, professional development, and future research in the realm of AI-driven education. + 2024.clib-1.15 + kurshumova-2024-educational + + + Evidential Auxiliaries as Non–reliability Markers in <fixed-case>B</fixed-case>ulgarian Parliamentary Speech + EkaterinaTarpomanova + 157–165 + In the evidentiality system of Bulgarian, there are three evidential auxiliaries that form complex verbal forms. The paper analyzes their potential to mark non-reliability in political discourse by using the ParlaMint-BG corpus of parliamentary debates. The method of the study includes detection, categorisation and context analysis of the evidentials formed with auxiliaries. The results prove that the evidential auxiliaries function as markers of non-reliability, especially in argumentative text type such as political discourse. + 2024.clib-1.16 + tarpomanova-2024-evidential + + + Extended Context at the Introduction of Complex Vocabulary in Abridged Literary Texts + IglikaNikolova-Stoupak + EvaSchaeffer-Lacroix + GaëlLejeune + 166–177 + Psycholinguistics speaks of a fine-tuning process used by parents as they address children, in which complex vocabulary is introduced with additional context (Leung et al., 2021). This somewhat counterintuitive lengthening of text in order to aid one’s interlocutor in the process of language acquisition also comes in accord with Harris (1988)’s notion that for every complex sentence, there is an equivalent longer (non-contracted) yet simpler one that contains the same amount of information. Within the proposed work, a corpus of eight renowned literary works (e.g. Alice’s Adventures in Wonderland, The Adventures of Tom Sawyer, Les Misérables) in four distinct languages (English, French, Russian and Spanish) is gathered: both the original (or translated) versions and up to four abridged versions for various audiences (e.g. 
children of a defined age or foreign language learners of a defined level) are present. The contexts of the first appearance of complex words (as determined based on word frequency) in pairs of original and abridged works are compared, and the cases in which the abridged texts offer longer context are investigated. The discovered transformations are consequently classified into three separate categories: addition of vocabulary items from the same lexical field as the complex word, simplification of grammar and insertion of a definition. Context extensions are then statistically analysed as associated with different languages and reader audiences. + 2024.clib-1.17 + nikolova-stoupak-etal-2024-extended + + + Corpus–based Research into Derivational Morphology: A Comparative Study of <fixed-case>J</fixed-case>apanese and <fixed-case>E</fixed-case>nglish Verbalization + JunyaMorita + 178–186 + As part of elucidating the syntax-morphology interaction, this study investigates where and how complex verbs are formed in Japanese and English. Focusing on the Japanese verb-forming suffix -ka-suru (e.g. toshi-o gendai-ka-suru ‘modernize city’), relevant verbs are extracted from a large-scale corpus and they receive an in-depth analysis from semantic, morphosyntactic, and functional viewpoints. The properties of -ka-suru and those of its English counterpart are then compared and contrasted. The result reveals three main points: (i) -ka-suru verbs are constantly created in syntactic settings to fulfill the functions of brevity and conceptualization, (ii) while denominal -ize derivatives have several submeanings such as ‘result,’ ‘ornative,’ and ‘agentive,’ -ka-suru equivalents retain the meaning ‘result,’ and (iii) -ka-suru can be combined with compound nouns, but -ize cannot. We will demonstrate that the above features originate in the underlying syntactic structure related to each suffix and their difference, thus supporting the thesis of syntactic word formation. (1) ji-kokumin-o moomai-ka-suru one’s-people-ACC ignorant-change-do ‘make one’s people ignorant’ (2) shinikaketa momiji-o bonsai-ka-suru dying maple-ACC bonsai-change-do ‘turn a dying maple into a bonsai’ + 2024.clib-1.18 + morita-2024-corpus + + + The Verbal Category of Conditionality in <fixed-case>B</fixed-case>ulgarian and its <fixed-case>U</fixed-case>krainian Correspondences + IvanDerzhanski + OlenaSiruk + 187–195 + Modern Bulgarian shares a conditional mood with the other Slavic languages, but it also has developed a future-in-the-past tense which is structurally analogous to many Western European languages’ category traditionally called a conditional mood in their grammars. The distinction between these two forms is sometimes elusive and can be difficult for native speakers of Slavic languages who are learning Bulgarian. In this paper we consider the uses of the Bulgarian conditional mood and future-in-the-past tense in a parallel corpus of Bulgarian and Ukrainian text, examining the corresponding wording in Ukrainian, where the conditional mood is supplemented by modal verbs, and discuss the breadth of choices open to translators when working in each direction. + 2024.clib-1.19 + derzhanski-siruk-2024-verbal + + + Lexical Richness of <fixed-case>F</fixed-case>rench and <fixed-case>Q</fixed-case>uebec Journalistic Texts + NataliaDankova + 196–200 + This paper presents some results of a quantitative study that focuses on the variety and word frequency in texts from a comparative perspective. 
The study aims to analyze and compare French and Quebec journalistic texts on political and cultural topics written in French and recently published in major newspapers such as Le Monde, le Figaro, Le Devoir, etc. The statistical analysis concerns the number of different words in the text, the number of different adjectives, the number of different verbs (and also passive structures, participles and gerunds which contribute to syntactic and stylistic sophistication), and the number of hapaxes. French texts from France exhibit greater lexical richness and sophistication: they contain more adjectives, a greater variety of adjectives, as well as more participles and gerunds compared to French texts from Quebec. The originality of the study lies in the fact that it analyzes variation in French using a lexicometric approach. + 2024.clib-1.20 + dankova-2024-lexical + + + A Corpus of Liturgical Texts in <fixed-case>G</fixed-case>erman: Towards Multilevel Text Annotation + MariaKhokhlova + MikhailKoryshev + 201–205 + The aim of the study is to create a “documented” literary and theological history of German Catholic hymnography. The paper focuses on the creation of a corpus of liturgical texts in German and describes the first stage of annotation dealing with the metatextual markup of Catholic hymns. The authors dwell in detail on the parameters of the multi-level classification of hymn texts they developed, which allows them to differentiate hymns on different grounds. The parameters include not only characteristics that represent hymns (the period and the source of their origin, rubrics, musical accompaniment), but also ones that are inherent for strophes. Based on the created markup, it is possible to trace general trends in texts divided according to certain meta-features. The developed scheme of annotation is given on the example of the hymnbook Gotteslob (1975). The results present statistics on different parameters used for hymn description. + 2024.clib-1.21 + khokhlova-koryshev-2024-corpus + + + <fixed-case>E</fixed-case>ur<fixed-case>L</fixed-case>ex<fixed-case>S</fixed-case>ummarization – A New Text Summarization Dataset on <fixed-case>EU</fixed-case> Legislation in 24 Languages with <fixed-case>GPT</fixed-case> Evaluation + ValentinZmiycharov + TodorTsonkov + IvanKoychev + 206–213 + Legal documents are notorious for their length and complexity, making it challenging to extract crucial information efficiently. In this paper, we introduce a new dataset for legal text summarization, covering 24 languages. We not only present and analyze the dataset but also conduct experiments using various extractive techniques. We provide a comparison between these techniques and summaries generated by the state-of-the-art GPT models. The abstractive GPT approach outperforms the extractive TextRank approach in 8 languages, but produces slightly lower results in the remaining 16 languages. This research aims to advance the field of legal document summarization by addressing the need for accessible and comprehensive information retrieval from lengthy legal texts. + 2024.clib-1.22 + zmiycharov-etal-2024-eurlexsummarization + + + On a Hurtlex Resource for <fixed-case>B</fixed-case>ulgarian + PetyaOsenova + 214–219 + The paper reports on the cleaning of the Hurtlex lexicon for Bulgarian as part of the multilingual Hurtlex resource. 
All the challenges during the cleaning process are presented, such as: deleting strings or lexica that are clear errors from the automatic translation, establishing criteria for keeping or discarding a lexeme based on its meaning and potential usages, contextualizing the lexeme with the meaning through an example, etc. In addition, the paper discusses the mapping of the offensive lexica to the BTB-Wordnet as well as the system that has been used. + 2024.clib-1.23 + osenova-2024-hurtlex + + + Unified Annotation of the Stages of the <fixed-case>B</fixed-case>ulgarian Language. First Steps + FabioMaion + TsvetanaDimitrova + AndrejBojadziev + 220–226 + The paper reports on an ongoing work on a proposal of guidelines for unified annotation of the stages in the development of the Bulgarian language from the Middle Ages to the early modern period. It discusses the criteria for the selection of texts and their representation, along with some results of the trial tagging with an existing tagger which was already trained on other texts. + 2024.clib-1.24 + maion-etal-2024-unified + + + <fixed-case>C</fixed-case>hat<fixed-case>GPT</fixed-case>: Detection of <fixed-case>S</fixed-case>panish Terms Based on False <fixed-case>F</fixed-case>riends + AmalHaddad Haddad + DamithPremasiri + 227–240 + One of the common errors which translators commit when transferring terms from one lan- guage into another is erroneously coining terms which are based on a false friend mistake due to the similarity between lexical units forming part of terms. In this case-study, we use Chat- GPT to automatically detect terms in Spanish which may be coined based on a false friend relation. To carry out this study, we imple- mented two experiments with GPT and com- pared the results. In the first, we prompted GPT to produce a list of twenty terms in Span- ish extracted from the UN discourse, which are possibly based on false friend relation, and its English equivalents and analysed the veracity of the results. In the second experiment, we used an aligned corpus to further study the ca- pabilities of the Language Model on detecting false friends in English and Spanish Text. Some results were significant for future terminologi- cal studies. + 2024.clib-1.25 + haddad-haddad-premasiri-2024-chatgpt + + + Deep Learning Framework for Identifying Future Market Opportunities from Textual User Reviews + JordanKralev + 241–248 + The paper develops an application of design gap theory for identification of future market segment growth and capitalization from a set of customer reviews for bought products from the market in a given past period. To build a consumer feature space, an encoded-decoder network with attention is trained over the textual reviews after they are pre-processed through tokenization and embedding layers. The encodings for product reviews are used to train a variational auto encoder network for representation of a product feature space. The sampling capabilities of this network are extended with a function to look for innovative designs with high consumer preferences, characterizing future opportunities in a given market segment. The framework is demonstrated for processing of Amazon reviews in consumer electronics segment. 
+ 2024.clib-1.26 + kralev-2024-deep + + + Look Who’s Talking: The Most Frequently Used Words in the <fixed-case>B</fixed-case>ulgarian Parliament 1990-2024 + RuslanaMargova + BastiaanBruinsma + 249–256 + In this study we identify the most frequently used words and some multi-word expressions in the Bulgarian Parliament. We do this by using the transcripts of all plenary sessions between 1990 and 2024 - 3,936 in total. This allows us both to study an interesting period known in the Bulgarian linguistic space as the years of “transition and democracy”, and to provide scholars of Bulgarian politics with a purposefully generated list of additional stop words that they can use for future analysis. Because our list of words was generated from the data, there is no preconceived theory, and because we include all interactions during all sessions, our analysis goes beyond traditional party lines. We provide details of how we selected, retrieved, and cleaned our data, and discuss our findings. + 2024.clib-1.27 + margova-bruinsma-2024-look + + + Estimating Commonsense Knowledge from a Linguistic Analysis on Information Distribution + SabrinaMennella + MariaDi Maro + MartinaDi Bratto + 257–263 + Commonsense Knowledge (CSK) is defined as a complex and multifaceted structure, encompassing a wide range of knowledge and reasoning generally acquired through everyday experiences. As CSK is often implicit in communication, it poses a challenge for AI systems to simulate human-like interaction. This work aims to deepen the CSK information structure from a linguistic perspective, starting from its organisation in conversations. To achieve this goal, we developed a three-level analysis model to extract more insights about this knowledge, focusing our attention on the second level. In particular, we aimed to extract the distribution of explicit actions and their execution order in the communicative flow. We built an annotation scheme based on FrameNet and applied it to a dialogical corpus on the culinary domain. Preliminary results indicate that certain frames occur earlier in the dialogues, while others occur towards the process’s end. These findings contribute to the systematic nature of actions by establishing clear patterns and relationships between frames. + 2024.clib-1.28 + mennella-etal-2024-estimating + + + Pondera: A Personalized <fixed-case>AI</fixed-case>–Driven Weight Loss Mobile Companion with Multidimensional Goal Fulfillment Analytics + GeorgiPashev + SilviaGaftandzhieva + 264–271 + The global obesity epidemic is a significant challenge to public health, necessitating innovative and personalized solutions. This paper presents Pondera, an innovative mobile app revolutionizing weight management by integrating Artificial Intelligence (AI) and multidimensional goal fulfilment analytics. Pondera distinguishes itself by supplying a tailored approach to weight loss, combining individual user data, including dietary preferences, fitness levels, and specific weight loss objectives, with advanced AI algorithms to generate personalized weight loss plans. Future development directions include refining AI algorithms, enhancing user experience, and validating effectiveness through comprehensive studies, ensuring Pondera becomes a pivotal tool in achieving sustainable weight loss and health improvement. 
+ 2024.clib-1.29 + pashev-gaftandzhieva-2024-pondera + + + Mitigating Hallucinations in Large Language Models via Semantic Enrichment of Prompts: Insights from <fixed-case>B</fixed-case>io<fixed-case>BERT</fixed-case> and Ontological Integration + StanislavPenkov + 272–276 + The advent of Large Language Models (LLMs) has been transformative for natural language processing, yet their tendency to produce “hallucinations”—outputs that are factually incorrect or entirely fabricated— remains a significant hurdle. This paper introduces a proactive methodology for reducing hallucinations by strategically enriching LLM prompts. This involves identifying key entities and contextual cues from varied domains and integrating this information into the LLM prompts to guide the model towards more accurate and relevant responses. Leveraging examples from BioBERT for biomedical entity recognition and ChEBI for chemical ontology, we illustrate a broader approach that encompasses semantic prompt enrichment as a versatile tool for enhancing LLM output accuracy. By examining the potential of semantic and ontological enrichment in diverse contexts, we aim to present a scalable strategy for improving the reliability of AI-generated content, thereby contributing to the ongoing efforts to refine LLMs for a wide range of applications. + 2024.clib-1.30 + penkov-2024-mitigating + + + Commercially Minor Languages and Localization + MariaTodorova + 277–285 + This paper offers a perspective of languages with a less significant volume of digital usership as minor in the context of globalization and localization. With this premise, the risks this status poses to the quality of localized texts, the substantiality of genre conventions, the public image of professional translators, and the users’ linguistic competence in these languages is explored. Furthermore, the common lack of established or clear conventions in the localization of digital products into commercially minor languages (and in the digital product genres) is highlighted as one of the factors amplifying these risks. These perspectives are contextualized with the Bulgarian language with examples of errors encountered in Bulgarian digital content localized from English and more specifically – errors and problems related to gender neutrality and register. + 2024.clib-1.31 + todorova-2024-commercially + + + Semantic features in the automatic analysis of verbs of creation in <fixed-case>B</fixed-case>ulgarian and <fixed-case>E</fixed-case>nglish + IvelinaStoyanova + 286–295 + The paper focuses on the semantic class of verbs of creation as a subclass of dynamic verbs. The objective is to present the description of creation verbs in terms of their corresponding semantic frames and to outline the semantic features of the frame elements with a view to their automatic identification and analysis in text. The observations are performed on Bulgarian and English data with the aim to establish the language-independent and language-specific features in the semantic description of the analysed class of verbs. 
+ 2024.clib-1.32 + stoyanova-2024-semantic + + A ‘Dipdive’ into Motion: Exploring Lexical Resources towards a Comprehensive Semantic and Syntactic Description + SvetlozaraLeseva + 296–308 + In this paper I illustrate the semantic description of verbs provided in three semantic resources (FrameNet, VerbNet and VerbAtlas) in comparative terms with a view to identifying common and distinct components in their representation and obtaining a preliminary idea of the resources’ interoperability. To this end, I provide a comparison of a small sample of motion verbs aligned with semantic frames and classes in the three resources. I also describe the semantic annotation of Bulgarian motion verbs using the framework defined in the Berkeley FrameNet project and its enrichment with information from the other two resources, which has been enabled by the mapping between: (i) their major semantic units – FrameNet frames, VerbNet classes and VerbAtlas frames, and (ii) their ‘building blocks’ – frame elements (FrameNet) and semantic roles (VerbNet, VerbAtlas). + 2024.clib-1.33 + leseva-2024-dipdive + + Multilingual Corpus of Illustrative Examples on Activity Predicates + IvelinaStoyanova + HristinaKukova + MariaTodorova + TsvetanaDimitrova + 309–318 + The paper presents the ongoing process of compilation of a multilingual corpus of illustrative examples to supplement our work on the syntactic and semantic analysis of predicates representing activities in Bulgarian and other languages. The corpus aims to include over 1,000 illustrative examples on verbs from six semantic classes of predicates (verbs of motion, contact, consumption, creation, competition and bodily functions) which provide a basis for observations on the specificity of their realisation. The corpus of illustrative examples will be used for contrastive studies and further elaboration on the scope and behaviour of activity verbs in general, as well as its semantic subclasses. + 2024.clib-1.34 + stoyanova-etal-2024-multilingual + + Large Language Models in Linguistic Research: the Pilot and the Copilot + SvetlaKoeva + 319–328 + In this paper, we present two experiments focussing on linguistic classification and annotation of examples, using zero-shot prompting. The aim is to show how large language models can confirm or reject the linguistic judgements of experts in order to increase the productivity of their work. In the first experiment, new lexical units evoking a particular FrameNet semantic frame are selected simultaneously with the annotation of examples with the core frame elements. The second experiment attempts to categorise verbs into the aspectual classes, assuming that only certain combinations of verbs belonging to different aspectual classes evoke a semantic frame. The linguistic theories underlying the two experiments, the development of the prompts and the results of the experiments are presented. + 2024.clib-1.35 + koeva-2024-large + +
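The odds ratio statistic with confidence-interval testing that the Ion et al. entry above (2024.clib-1.1) relies on is compact enough to show concretely. A worked example with invented counts (not the paper's results):

# Odds ratio and 95% CI from a 2x2 table of correct/incorrect POS tags.
import math

a, b = 880, 120   # unseen words: correct, incorrect (toy counts)
c, d = 900, 100   # rarely seen words: correct, incorrect (toy counts)

log_or = math.log((a / b) / (c / d))
se = math.sqrt(1 / a + 1 / b + 1 / c + 1 / d)  # SE of the log odds ratio
lo, hi = math.exp(log_or - 1.96 * se), math.exp(log_or + 1.96 * se)

# If the interval contains 1.0, the odds of being correct on unseen words
# are statistically indistinguishable from those on rarely seen words,
# which is the kind of conclusion the entry above sets out to test.
print(f"OR = {math.exp(log_or):.2f}, 95% CI = ({lo:.2f}, {hi:.2f})")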
+
diff --git a/data/xml/2024.eacl.xml b/data/xml/2024.eacl.xml
index fac82099eb..5aeeb34cea 100644
--- a/data/xml/2024.eacl.xml
+++ b/data/xml/2024.eacl.xml
@@ -364,7 +364,7 @@
 VerenaBlaschkeLudwig-Maximilians-Universität München
 BarbaraPlankLudwig-Maximilians-Universität München and IT University of Copenhagen
 445-468
- Mainstream cross-lingual task-oriented dialogue (ToD) systems leverage the transfer learning paradigm by training a joint model for intent recognition and slot-filling in English and applying it, zero-shot, to other languages.We address a gap in prior research, which often overlooked the transfer to lower-resource colloquial varieties due to limited test data.Inspired by prior work on English varieties, we craft and manually evaluate perturbation rules that transform German sentences into colloquial forms and use them to synthesize test sets in four ToD datasets.Our perturbation rules cover 18 distinct language phenomena, enabling us to explore the impact of each perturbation on slot and intent performance.Using these new datasets, we conduct an experimental evaluation across six different transformers.Here, we demonstrate that when applied to colloquial varieties, ToD systems maintain their intent recognition performance, losing 6% (4.62 percentage points) in accuracy on average. However, they exhibit a significant drop in slot detection, with a decrease of 31% (21 percentage points) in slot F_1 score.Our findings are further supported by a transfer experiment from Standard American English to synthetic Urban African American Vernacular English.
+ Mainstream cross-lingual task-oriented dialogue (ToD) systems leverage the transfer learning paradigm by training a joint model for intent recognition and slot-filling in English and applying it, zero-shot, to other languages. We address a gap in prior research, which often overlooked the transfer to lower-resource colloquial varieties due to limited test data. Inspired by prior work on English varieties, we craft and manually evaluate perturbation rules that transform German sentences into colloquial forms and use them to synthesize test sets in four ToD datasets. Our perturbation rules cover 18 distinct language phenomena, enabling us to explore the impact of each perturbation on slot and intent performance. Using these new datasets, we conduct an experimental evaluation across six different transformers. Here, we demonstrate that when applied to colloquial varieties, ToD systems maintain their intent recognition performance, losing 6% (4.62 percentage points) in accuracy on average. However, they exhibit a significant drop in slot detection, with a decrease of 31% (21 percentage points) in slot F_1 score. Our findings are further supported by a transfer experiment from Standard American English to synthetic Urban African American Vernacular English.
 2024.eacl-long.28
 2024.eacl-long.28.software.zip
 artemova-etal-2024-exploring
diff --git a/data/xml/2024.findings.xml b/data/xml/2024.findings.xml
index 1b32a29dd4..e053da99cf 100644
--- a/data/xml/2024.findings.xml
+++ b/data/xml/2024.findings.xml
@@ -5893,19 +5893,19 @@
- Findings of the Association for Computational Linguistics ACL 2024
+ Findings of the Association for Computational Linguistics: ACL 2024
 Lun-WeiKuAcademia Sinica
 AndreMartinsInstituto Superior Técnico / Instituto de Telecomunicações / Unbabel
 VivekSrikumarUniversity of Utah
 Association for Computational Linguistics
- Bangkok, Thailand and virtual meeting
+ Bangkok, Thailand
 August 2024
- 2024.findings-acl
+ 2024.findings-acl
 findings
- 2024.findings-acl.0
+ 2024.findings-acl.0
 findings-2024-acl
@@ -5915,7 +5915,7 @@
 JingboShangUniversity of California, San Diego
 1-16
 Prompting large language models (LLMs) for data augmentation has recently become a common practice in few-shot NLP tasks. In this paper, we propose Chain-of-Thought Attribute Manipulation (CoTAM), a novel approach that generates new data from existing examples by only tweaking the user-provided, task-specific attribute, e.g., sentiment polarity or topic in movie reviews. Instead of conventional latent representation controlling, we leverage the chain-of-thought prompting to directly edit the text in three steps, (1) attribute decomposition, (2) manipulation proposal, and (3) sentence reconstruction. Extensive results on various tasks, such as text (pair) classification and aspect-based sentiment analysis, verify the superiority of CoTAM over other LLM-based augmentation methods with the same number of training examples for both fine-tuning and in-context learning. Remarkably, the 2D visualization of the augmented dataset using principal component analysis revealed a human-recognizable decision boundary that is likely hinted by the attribute manipulation, demonstrating the potential of our proposed approach.
- 2024.findings-acl.1
+ 2024.findings-acl.1
 peng-etal-2024-controllable
 10.18653/v1/2024.findings-acl.1
@@ -5926,7 +5926,7 @@
 YiFeng
 17-27
 Keyphrase extraction aims to automatically extract salient phrases representing the critical information in the source document. Identifying salient phrases is challenging because there is a lot of noisy information in the document, leading to wrong extraction. To address this issue, in this paper, we propose a hybrid matching model for keyphrase extraction, which combines representation-focused and interaction-based matching modules into a unified framework for improving the performance of the keyphrase extraction task. Specifically, HybridMatch comprises (1) a PLM-based Siamese encoder component that represents both candidate phrases and documents, (2) an interaction-focused matching (IM) component that estimates word matches between candidate phrases and the corresponding document at the word level, and (3) a representation-focused matching (RM) component that captures context-aware semantic relatedness of each candidate keyphrase at the phrase level. Extensive experimental results on the OpenKP dataset demonstrate that the proposed model HybridMatch outperforms the recent state-of-the-art keyphrase extraction baselines. Furthermore, we discuss the performance of large language models in keyphrase extraction based on recent studies and our experiments.
- 2024.findings-acl.2
+ 2024.findings-acl.2
 song-etal-2024-match
 10.18653/v1/2024.findings-acl.2
@@ -5941,7 +5941,7 @@
 NingyiXuShanghai Jiaotong University
 28-36
 Large language models (LLMs) show great performance in various tasks, but face deployment challenges from limited memory capacity and bandwidth. Low-bit weight quantization can save memory and accelerate inference. Although floating-point (FP) formats show good performance in LLM quantization, they tend to perform poorly with small group sizes or sub-4 bits. We find the reason is that the absence of asymmetry in previous FP quantization makes it unsuitable for handling the asymmetric value distribution of LLM weight tensors. In this work, we propose asymmetric FP quantization (AFPQ), which sets separate scales for positive and negative values. Our method leads to large accuracy improvements and can be easily plugged into other quantization methods, including GPTQ and AWQ, for better performance. Besides, no additional storage is needed compared with asymmetric integer (INT) quantization. The code is available at https://github.com/zhangsichengsjtu/AFPQ.
- 2024.findings-acl.3
+ 2024.findings-acl.3
 zhang-etal-2024-afpq
 10.18653/v1/2024.findings-acl.3