diff --git a/Makefile b/Makefile
index a93259c9e7..d50cc5d8fc 100644
--- a/Makefile
+++ b/Makefile
@@ -307,6 +307,9 @@ autofix: check_staged_xml venv/bin/activate
[ "$${PRE_DIFF}" = "$${POST_DIFF}" ] || EXIT_STATUS=1 ;\
[ $${EXIT_STATUS} -eq 0 ]
+.PHONY: reformat
+reformat: autofix
+
.PHONY: serve
serve:
@echo "INFO Starting a server at http://localhost:8000/"
diff --git a/bin/add_dois.py b/bin/add_dois.py
index 59015599e5..44efef5e6b 100755
--- a/bin/add_dois.py
+++ b/bin/add_dois.py
@@ -125,7 +125,7 @@ def process_volume(anthology_volume):
added = add_doi(paper, collection_id, volume_id, force=args.force)
if added:
num_added += 1
- sleep(1)
+ sleep(0.1)
indent(tree.getroot())
diff --git a/bin/ingest.py b/bin/ingest.py
index e75bfdaa3d..aebe601b50 100755
--- a/bin/ingest.py
+++ b/bin/ingest.py
@@ -294,6 +294,7 @@ def find_book():
meta["path"],
"cdrom",
f"{year}-{venue_name.lower()}-{volume_name}.pdf",
+ f"{venue_name.lower()}-{year}.{volume_name}.pdf",
),
os.path.join(meta["path"], "cdrom", f"{venue_name.upper()}-{year}.pdf"),
]
diff --git a/bin/ingest_aclpub2.py b/bin/ingest_aclpub2.py
index d661fc51b6..39ac8b3bbb 100755
--- a/bin/ingest_aclpub2.py
+++ b/bin/ingest_aclpub2.py
@@ -204,8 +204,10 @@ def parse_paper_yaml(ingestion_dir: str) -> List[Dict[str, str]]:
else:
raise Exception("Can't find papers.yml (looked in root dir and under inputs/)")
- # remove non-archival papers
- papers = [p for p in papers if p.get('archival', True)]
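+ # keep non-archival papers in the list (they are skipped at XML-creation time),
+ # defaulting unmarked papers to archival as before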
+ for paper in papers:
+ if "archival" not in paper:
+ paper["archival"] = False
return papers
@@ -220,39 +221,40 @@ def add_paper_nums_in_paper_yaml(
start, end = 1, 0
for paper in papers:
- assert 'file' in paper.keys(), f'{paper["id"]} is missing key file'
+ if paper["archival"]:
+ assert 'file' in paper.keys(), f'{paper["id"]} is missing key file'
- paper_id = str(paper['id'])
- # if 'file' not in paper.keys():
- # print(f'{paper_id} does not have file key but archive is {paper["archival"]}')
- # paper_name = paper['title']
- # else:
-
- paper_path = paper['file']
-
- # TODO: we should just be able to read paper_path directly, and throw an
- # error if it doesn't exist
- paper_need_read_path = None
- paths_to_check = [
- ingestion_dir / "watermarked_pdfs" / paper_path,
- ingestion_dir / "watermarked_pdfs" / f"{paper_id}.pdf",
- ]
- paper_need_read_path = None
- for path in paths_to_check:
- if path.exists():
- paper_need_read_path = str(path)
- break
- else:
- raise Exception(
- f"* Fatal: could not find paper ID {paper_id} ({paths_to_check})"
- )
+ paper_id = str(paper['id'])
+ # if 'file' not in paper.keys():
+ # print(f'{paper_id} does not have file key but archive is {paper["archival"]}')
+ # paper_name = paper['title']
+ # else:
- pdf = open(paper_need_read_path, 'rb')
- pdf_reader = PyPDF2.PdfReader(pdf)
- num_of_pages = len(pdf_reader.pages)
- start = end + 1
- end = start + num_of_pages - 1
- paper['pages'] = f'{start}-{end}'
+ paper_path = paper['file']
+
+ # TODO: we should just be able to read paper_path directly, and throw an
+ # error if it doesn't exist
+ paper_need_read_path = None
+ paths_to_check = [
+ ingestion_dir / "watermarked_pdfs" / paper_path,
+ ingestion_dir / "watermarked_pdfs" / f"{paper_id}.pdf",
+ ]
+ paper_need_read_path = None
+ for path in paths_to_check:
+ if path.exists():
+ paper_need_read_path = str(path)
+ break
+ else:
+ raise Exception(
+ f"* Fatal: could not find paper ID {paper_id} ({paths_to_check})"
+ )
+
+ with open(paper_need_read_path, 'rb') as pdf:  # context manager closes the handle
+ pdf_reader = PyPDF2.PdfReader(pdf)
+ num_of_pages = len(pdf_reader.pages)
+ start = end + 1
+ end = start + num_of_pages - 1
+ paper['pages'] = f'{start}-{end}'
return papers
@@ -532,6 +534,7 @@ def copy_pdf_and_attachment(
volume[0] = {
"anthology_id": f"{collection_id}-{volume_name}.0",
"attachments": [],
+ "archival": True,
}
frontmatter_src_path = None
@@ -562,28 +565,23 @@ def copy_pdf_and_attachment(
paper_num = 0
for i, paper in enumerate(papers):
- # archival papers only
- if 'archival' not in paper.keys():
- paper.update({'archival': '1'})
assert 'archival' in paper.keys(), f'{paper["id"]} is missing key archival'
assert 'file' in paper.keys(), f'{paper["id"]} is missing key file'
- if (
- paper['archival'] == 1
- or paper['archival'] is True
- or paper['archival'] == '1'
- ):
- # copy pdf
- # if 'file' not in paper.keys():
- # paper_name = paper['title']
- # print(f'{paper_name} does not have file key')
- # else:
- paper_name = paper['file']
- # paper_name = paper['file']
- if paper_name != '' or paper_name is not None:
- paper_id = str(paper['id'])
- paper_num += 1
- paper_id_full = f'{collection_id}-{volume_name}.{paper_num}'
+ paper_name = paper['file']
+ # skip entries with an empty or missing file name (the old or-check was always true)
+ if paper_name:
+ paper_id = str(paper['id'])
+ paper_num += 1
+ paper_id_full = f'{collection_id}-{volume_name}.{paper_num}'
+
+ volume[paper_num] = {
+ 'anthology_id': paper_id_full,
+ 'attachments': [],
+ 'archival': paper["archival"],
+ }
+
+ if paper["archival"]:
pdf_src_path = None
if (pdfs_src_dir / paper_name).exists():
pdf_src_path = pdfs_src_dir / paper_name
@@ -599,61 +597,53 @@ def copy_pdf_and_attachment(
if not dry_run:
maybe_copy(pdf_src_path, pdf_dest_path)
- volume[paper_num] = {
- 'anthology_id': paper_id_full,
- 'pdf': pdf_dest_path,
- 'attachments': [],
- }
-
- # copy attachments
- if 'attachments' in paper:
- attachs_dest_dir = create_dest_path(attachments_dir, venue_name)
- attachs_src_dir = meta['path'] / 'attachments'
- # assert (
- # attachs_src_dir.exists()
- # ), f'paper {i, paper_name} contains attachments but attachments folder was not found'
-
- for attachment in paper['attachments']:
- file_path = Path(attachment.get('file', None))
- if file_path is None:
- continue
-
- attach_src_path = None
- paths_to_check = [
- attachs_src_dir / file_path,
- attachs_src_dir / file_path.name,
- ]
- for path in paths_to_check:
- if path.exists():
- attach_src_path = str(path)
- break
- else:
- print(
- f"Warning: paper {paper_id} attachment {file_path} not found, skipping",
- file=sys.stderr,
- )
- continue
-
- attach_src_extension = attach_src_path.split(".")[-1]
- type_ = attachment['type'].replace(" ", "")
- file_name = f'{collection_id}-{volume_name}.{paper_num}.{type_}.{attach_src_extension}'
-
- # the destination path
- attach_dest_path = os.path.join(attachs_dest_dir, file_name).replace(
- " ", ""
+ volume[paper_num]["pdf"] = pdf_dest_path
+
+ # copy attachments
+ if 'attachments' in paper:
+ attachs_dest_dir = create_dest_path(attachments_dir, venue_name)
+ attachs_src_dir = meta['path'] / 'attachments'
+ # assert (
+ # attachs_src_dir.exists()
+ # ), f'paper {i, paper_name} contains attachments but attachments folder was not found'
+
+ for attachment in paper['attachments']:
+ if attachment.get('file') is None:
+ continue
+ file_path = Path(attachment['file'])
+
+ attach_src_path = None
+ paths_to_check = [
+ attachs_src_dir / file_path,
+ attachs_src_dir / file_path.name,
+ ]
+ for path in paths_to_check:
+ if path.exists():
+ attach_src_path = str(path)
+ break
+ else:
+ print(
+ f"Warning: paper {paper_id} attachment {file_path} not found, skipping",
+ file=sys.stderr,
)
+ continue
- if Path(attach_src_path).exists():
- if dry_run:
- print(
- f'would\'ve moved {attach_src_path} to {attach_dest_path}'
- )
- else:
- maybe_copy(attach_src_path, attach_dest_path)
- print(f"Attaching {attach_dest_path}/{type_} to {paper_num}")
- volume[paper_num]['attachments'].append(
- (attach_dest_path, type_)
- )
+ attach_src_extension = attach_src_path.split(".")[-1]
+ type_ = attachment['type'].replace(" ", "")
+ file_name = f'{collection_id}-{volume_name}.{paper_num}.{type_}.{attach_src_extension}'
+
+ # the destination path
+ attach_dest_path = os.path.join(attachs_dest_dir, file_name).replace(
+ " ", ""
+ )
+
+ if Path(attach_src_path).exists():
+ if dry_run:
+ print(f'would\'ve moved {attach_src_path} to {attach_dest_path}')
+ else:
+ maybe_copy(attach_src_path, attach_dest_path)
+ print(f"Attaching {attach_dest_path}/{type_} to {paper_num}")
+ volume[paper_num]['attachments'].append((attach_dest_path, type_))
return volume, collection_id, volume_name, proceedings_pdf_dest_path
@@ -692,6 +682,10 @@ def create_xml(
meta_node = None
for paper_num, paper in sorted(volume.items()):
+ if not paper["archival"]:
+ print(f"Skipping non-archival paper #{paper_num}", file=sys.stderr)
+ continue
+
paper_id_full = paper['anthology_id']
# print(f'creating xml for paper name {paper}, in papers {papers[paper_num-1]}')
if paper_num == 0:
@@ -873,7 +867,12 @@ def main(ingestion_dir, pdfs_dir, attachments_dir, dry_run, anthology_dir, inges
# Load the papers.yaml file, skipping non-archival papers
papers = parse_paper_yaml(ingestion_dir)
- # print(f'original paper {papers[0]}')
+ print(
+ "Found",
+ len([p for p in papers if p["archival"]]),
+ "archival papers",
+ file=sys.stderr,
+ )
# add page numbering by parsing the PDFs
papers = add_paper_nums_in_paper_yaml(papers, ingestion_dir)
diff --git a/data/xml/2014.clib.xml b/data/xml/2014.clib.xml
new file mode 100644
index 0000000000..db2f77a610
--- /dev/null
+++ b/data/xml/2014.clib.xml
@@ -0,0 +1,129 @@
+
+
+
+
+ Proceedings of the First International Conference on Computational Linguistics in Bulgaria (CLIB 2014)
+ Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences
+ Sofia, Bulgaria
+ September
+ 2014
+ 2014.clib-1
+ clib
+
+
+ 110
+ 2014.clib-1.0
+ clib-2014-1
+
+
+ Electronic Language Resources in Teaching Mathematical Linguistics
+ IvanDerzhanski
+ RositsaDekova
+ 1–5
+ The central role of electronic language resources in education is widely recognised (cf. Brinkley et al., 1999; Bennett, 2010; Derzhanski et al., 2007, among others). The variety and ease of access of such resources predetermines their extensive use in both research and education. With regard to teaching mathematical linguistics, electronic dictionaries and annotated corpora play a particularly important part, being an essential source of information for composing linguistic problems and presenting linguistic knowledge. This paper discusses the need for electronic resources, especially for less studied or low-resource languages, their creation and various uses in teaching linguistics to secondary school students, with examples mostly drawn from our practical work.
+ 2014.clib-1.1
+ derzhanski-dekova-2014-electronic
+
+
+ Harnessing Language Technologies in Multilingual Information Channelling Services
+ DimanKaragiozov
+ 6–13
+ Scientists and industry have put significant efforts in creating suitable tools to analyze information flows. However, up to now there are no successful solutions for 1) dynamic modeling of the user-defined interests and further personalization of the results, 2) effective cross-language information retrieval, and 3) processing of multilingual content. As a consequence, much of the potentially relevant and otherwise accessible data from the media stream may elude users’ grasp. We present a multilingual information channeling system, MediaTalk, which offers broad integration between language technologies and advanced data processing algorithms for annotation, analysis and classification of multilingual content. As a result, the system not only provides an all-in-one monitoring service that covers both traditional and social media, but also offers dynamic modeling of user profiles, personalization of obtained data and cross-language information retrieval. Bulgarian and English press clipping services relying on this system implement advanced functionalities such as identification of emerging topics, forecasting and trend prediction, all of which allow the users to monitor their standing reputation, events and relations. The architecture of the system is robust, extensible and adheres to the Big Data paradigm.
+ 2014.clib-1.2
+ karagiozov-2014-harnessing
+
+
+ Automatic Semantic Filtering of Morphosemantic Relations in WordNet
+ SvetlozaraLeseva
+ IvelinaStoyanova
+ BorislavRizov
+ MariaTodorova
+ EkaterinaTarpomanova
+ 14–22
+ In this paper we present a method for automatic assignment of morphosemantic relations between derivationally related verb–noun pairs of synsets in the Bulgarian WordNet (BulNet) and for semantic filtering of those relations. The filtering process relies on the meaning of noun suffixes and the semantic compatibility of verb and noun taxonomic classes. We use the taxonomic labels assigned to all the synsets in the Princeton WordNet (PWN) – one label per synset – which denote their general semantic class. In the first iteration we employ the pairs <noun suffix : noun label> to filter out part of the relations. In the second iteration, which uses as input the output of the first one, we apply a stronger semantic filter. It makes use of the taxonomic labels of the noun-verb synset pairs observed for a given morphosemantic relation. In this way we manage to reliably filter out impossible or unlikely combinations. The results of the performed experiment may be applied to enrich BulNet with morphosemantic relations and new synsets semi-automatically, while facilitating the manual work and reducing its cost.
+ 2014.clib-1.3
+ leseva-etal-2014-automatic
+
+
+ Noun-Verb Derivation in the Bulgarian and the Romanian WordNet – A Comparative Approach
+ EkaterinaTarpomanova
+ SvetlozaraLeseva
+ MariaTodorova
+ TsvetanaDimitrova
+ BorislavRizov
+ VerginicaBarbu Mititelu
+ ElenaIrimia
+ 23–31
+ Romanian and Bulgarian are Balkan languages with rich derivational morphology that, if introduced into their respective wordnets, can aid broadening of the wordnet content and the possible NLP applications. In this paper we present a joint work on introducing derivation into the Bulgarian and the Romanian WordNets, BulNet and RoWordNet, respectively, by identifying and subsequently labelling the derivationally and semantically related noun-verb pairs. Our research aims at providing a framework for a comparative study on derivation in the two languages and offering training material for the automatic identification and assignment of derivational and morphosemantic relations needed in various applications.
+ 2014.clib-1.4
+ tarpomanova-etal-2014-noun
+
+
+ Semi-Automatic Detection of Multiword Expressions in the Slovak Dependency Treebank
+ DanielaMajchrakova
+ OndrejDusek
+ JanHajic
+ AgataKarcova
+ RadovanGarabik
+ 32–39
+ We describe a method for semi-automatic extraction of Slovak multiword expressions (MWEs) from a dependency treebank. The process uses an automatic conversion from dependency syntactic trees to deep syntax and automatic tagging of verbal argument nodes based on a valency dictionary. Both the valency dictionary and the treebank conversion were adapted from the corresponding Czech versions; the automatically translated valency dictionary has been manually proofread and corrected. There are two main achievements – a valency dictionary of Slovak MWEs with direct links to corresponding expressions in the Czech dictionary, PDT-Vallex, and a method of extraction of MWEs from the Slovak Dependency Treebank. The extraction reached very high precision but lower recall in a manual evaluation. This is a work in progress, the overall goal of which is twofold: to create a Slovak language valency dictionary paralleling the Czech one, with bilingual links; and to use the extracted verbal frames in a collocation dictionary of Slovak verbs.
+ 2014.clib-1.5
+ majchrakova-etal-2014-semi
+
+
+ Automatic Categorisation of Multiword Expressions and Named Entities in Bulgarian
+ IvelinaStoyanova
+ 40–48
+ This paper describes an approach for automatic categorisation of various types of multiword expressions (MWEs) with a focus on multiword named entities (MNEs), which compose a large portion of MWEs in general. The proposed algorithm is based on a refined classification of MWEs according to their idiomaticity. While MWE categorisation can be considered as a separate and independent task, it complements the general task of MWE recognition. After outlining the method, we set up an experiment to demonstrate its performance. We use the corpus Wiki1000+ that comprises 6,311 annotated Wikipedia articles of 1,000 or more words each, amounting to 13.4 million words in total. The study also employs a large dictionary of 59,369 MWEs noun phrases (out of more than 85,000 MWEs), labelled with their respective types. The dictionary is compiled automatically and verified semi-automatically. The research presented here is based on Bulgarian although most of the ideas, the methodology and the analysis are applicable to other Slavic and possibly other European languages.
+ 2014.clib-1.6
+ stoyanova-2014-automatic
+
+
+ Temporal Adverbs and Adverbial Expressions in a Corpus of Bulgarian and Ukrainian Parallel Texts
+ IvanDerzhanski
+ OlenaSiruk
+ 49–54
+ This paper presents a comparative bilingual corpus-based study of the use of several frequent temporal adverbs and adverbial expressions (‘always’, ‘sometimes’, ‘never’ and their synonyms) in Bulgarian and Ukrainian. The Ukrainian items were selected with the aid of synonym dictionaries of words and of set expressions, the corpus was used to identify their most common Bulgarian counterparts, and the frequencies of the correspondences were compared and scrutinised for possibly informative regularities.
+ 2014.clib-1.7
+ derzhanski-siruk-2014-temporal
+
+
+ Historical Corpora of Bulgarian Language and Second Position Markers
+ TsvetanaDimitrova
+ AndrejBoyadzhiev
+ 55–63
+ This paper demonstrates how historical corpora can be used in researching language phenomena. We exemplify the advantages and disadvantages through exploring three of the available corpora that contain textual sources of Old and Middle Bulgarian language to shed light on some aspects of the development of two words of ambiguous class. We discuss their behaviour to outline certain conditions for diachronic change they have undergone. The three corpora are accessible online (and offline – for downloading search results, xml files, etc.).
+ 2014.clib-1.8
+ dimitrova-boyadzhiev-2014-historical
+
+
+ Machine Translation Based on WordNet and Dependency Relations
+ LuchezarJackov
+ 64–72
+ The proposed machine translation (MT) approach uses WordNet (Fellbaum, 1998) as a base for concepts. It identifies the concepts and dependency relations using context-free grammars (CFGs) enriched with features, role markers and dependency markers. Multiple interpretation hypotheses are generated and then are scored using a knowledge base for the dependency relations. The hypothesis with the best score is used for generating the translation. The approach has already been implemented in an MT system for seven languages, namely Bulgarian, English, French, Spanish, Italian, German, and Turkish, and also for Chinese on experimental level.
+ 2014.clib-1.9
+ jackov-2014-machine
+
+
+ Recognize the Generality Relation between Sentences Using Asymmetric Association Measures
+ SebastiaoPais
+ GaelDias
+ RumenMoraliyski
+ 73–81
+ In this paper we focus on a particular case of entailment, namely entailment by generality. We argue that there exist various types of implication, a range of different levels of entailment reasoning, based on lexical, syntactic, logical and common sense clues, at different levels of difficulty. We introduce the paradigm of Textual Entailment (TE) by Generality, which can be defined as the entailment from a specific statement towards a relatively more general statement. In this context, the Text T entails the Hypothesis H, and at the same time H is more general than T. We propose an unsupervised and language-independent method to recognize TE by Generality given a case of Text − Hypothesis or T − H where an entailment relation holds.
+ 2014.clib-1.10
+ pais-etal-2014-recognize
+
+
+ Unsupervised and Language Independent Method to Recognize Textual Entailment by Generality
+ SebastiaoPais
+ GaelDias
+ JoaoCordeiro
+ RumenMoraliyski
+ 82–90
+ In this work we introduce a particular case of textual entailment (TE), namely Textual Entailment by Generality (TEG). In text, there are different kinds of entailment yielded from different types of implicative reasoning (lexical, syntactic, common sense based), but here we focus just on TEG, which can be defined as an entailment from a specific statement towards a relatively more general one. Therefore, we have T →G H whenever the premise T entails the hypothesis H, the hypothesis being more general than the premise. We propose an unsupervised and language-independent method to recognize TEGs, given a pair T, H in an entailment relation. We have evaluated our proposal through two experiments: (a) Test on T →G H English pairs, where we know that TEG holds; (b) Test on T → H Portuguese pairs, randomly selected with 60% of TEGs and 40% of TE without generality dependency (TEnG).
+ 2014.clib-1.11
+ pais-etal-2014-unsupervised
+
+
+
diff --git a/data/xml/2016.clib.xml b/data/xml/2016.clib.xml
new file mode 100644
index 0000000000..f72ba3fa9c
--- /dev/null
+++ b/data/xml/2016.clib.xml
@@ -0,0 +1,132 @@
+
+
+
+
+ Proceedings of the Second International Conference on Computational Linguistics in Bulgaria (CLIB 2016)
+ Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences
+ Sofia, Bulgaria
+ September
+ 2016
+ 2016.clib-1
+ clib
+
+
+ 110
+ 2016.clib-1.0
+ clib-2016-1
+
+
+ How to Differentiate the Closely Related Standard Languages?
+ DuškoVitas
+ LjubomirPopović
+ CvetanaKrstev
+ AnđelkaZečević
+ 1–10
+ In this paper the adequacy of the SETimes corpus as a basis for the comparison of closely related languages that are used in countries that emerged after the breakup of Yugoslavia is discussed by comparing it with other corpora. It is shown that the phenomena observed in this corpus and used to illustrate differences most specifically between Serbian and Croatian are consistent neither with their standards nor with other sources. Thus, results obtained on the basis of the SETimes corpus are corpus-biased and have to be reconsidered. This proves that the size of a corpus and its composition used in a linguistic research are crucial for assessing the obtained results.
+ 2016.clib-1.1
+ vitas-etal-2016-differentiate
+
+
+ ‘While’ and ‘Until’ Clauses and Expletive Negation in a Corpus of Bulgarian and Ukrainian Parallel Texts
+ IvanDerzhanski
+ OlenaSiruk
+ 11–18
+ The combination of the meanings ‘while’ and ‘until’ in a single lexeme and the use of expletive negation with the latter meaning are widespread phenomena that are a rich source of research problems. In this paper we present a comparative bilingual Bulgarian and Ukrainian corpus-based study of several conjunctions that share these two meanings. We discuss the difference in the frequency of expletive negation in the two languages, the use of až ‘even, all the way’ in Ukrainian and the impact of the original language in translated texts.
+ 2016.clib-1.2
+ derzhanski-siruk-2016-clauses
+
+
+ Linguistic Data Retrievable from a Treebank
+ VerginicaBarbu Mititelu
+ ElenaIrimia
+ 19–27
+ This paper describes the Romanian treebank annotated according to the Universal Dependency principles. We present the types of texts included in the treebank, their processing phases and the tools used for doing it, as well as the levels of annotation, with a focus on the syntactic level. We briefly present the syntactic formalism used, the principles followed and the set of relations. The perspective we adopted is the linguist’s who searches the treebank for information with relevance for the study of Romanian. (S)He can interpret the statistics based on the corpus and can also query the treebank for finding examples to support a theory, for testing hypotheses or for discovering new tendencies. We use here the passive constructions in Romanian as a case study for showing how statistical data help understanding this linguistic phenomenon. We also discuss the kinds of linguistic information retrievable and non-retrievable from the treebank, based on the annotation principles.
+ 2016.clib-1.3
+ barbu-mititelu-irimia-2016-linguistic
+
+
+ Towards the Automatic Identification of Light Verb Constructions in Bulgarian
+ IvelinaStoyanova
+ SvetlozaraLeseva
+ MariaTodorova
+ 28–37
+ This paper presents work in progress focused on developing a method for automatic identification of light verb constructions (LVCs) as a subclass of Bulgarian verbal MWEs. The method is based on machine learning and is trained on a set of LVCs extracted from the Bulgarian WordNet (BulNet) and the Bulgarian National Corpus (BulNC). The machine learning uses lexical, morphosyntactic, syntactic and semantic features of LVCs. We trained and tested two separate classifiers using the Java package Weka and two learning decision tree algorithms – J48 and RandomTree. The evaluation of the method includes 10-fold cross-validation on the training data from BulNet (F1 = 0.766 obtained by the J48 decision tree algorithm and F1 = 0.725 by the RandomTree algorithm), as well as evaluation of the performance on new instances from the BulNC (F1 = 0.802 by J48 and F1 = 0.607 by the RandomTree algorithm). Preliminary filtering of the candidates gives a slight improvement (F1 = 0.802 by J48 and F1 = 0.737 by RandomTree).
+ 2016.clib-1.4
+ stoyanova-etal-2016-towards
+
+
+ HR4EU – Using Language Resources in Computer Aided Language Learning
+ DašaFarkaš
+ MateaFilko
+ MarkoTadić
+ 38–46
+ In this paper we present the HR4EU – web portal for e-learning of Croatian language. The web portal offers a new method of computer aided language learning (CALL) by encouraging language learners to use different language resources available for Croatian: corpora, inflectional and derivational morphological lexicons, treebank, Wordnet, etc. Apart from the previously developed language resources, the new ones are created in order to further facilitate the learning of Croatian language. We will focus on the usage of the treebank annotated at syntactic and semantic level in the CALL and describe the new HR4EU sub-corpus of the Croatian Dependency Treebank (HOBS). The HR4EU sub-corpus consists of approx. 550 sentences, which are manually annotated on syntactic and semantic role level according to the specifications used for the HOBS. The syntactic and the semantic structure of the sentence can be visualized as a dependency tree via the SynSem Visualizer. The visualization of the syntactic and the semantic structure of sentences will help users to produce syntactically and semantically correct sentences on their own.
+ 2016.clib-1.5
+ farkas-etal-2016-hr4eu
+
+
+ SynTags – Web Interface for Syntactic and Semantic Annotation
+ AtanasAtanasov
+ 47–53
+ This paper presents a web tool for syntactic and semantic annotation and two of its applications. It gives the linguists the possibility to work with corpora and syntactic and semantic frames in XML format without having computer skills. The system is OS and platform independent and could be used both online and offline.
+ 2016.clib-1.6
+ atanasov-2016-syntags
+
+
+ Finding Good Answers in Online Forums: Community Question Answering for Bulgarian
+ TsvetomilaMihaylova
+ IvanKoychev
+ PreslavNakov
+ IvelinaNikolova
+ 54–63
+ Community Question Answering (CQA) is a form of question answering that is getting increasingly popular as a research direction recently. Given a question posted in an online community forum and the thread of answers to it, a common formulation of the task is to rank automatically the answers, so that the good ones are ranked higher than the bad ones. Despite the vast research in CQA for English, very little attention has been paid to other languages. To bridge this gap, here we present our method for Community Question Answering in Bulgarian. We create annotated training and testing datasets for Bulgarian, and we further explore the applicability of machine translation for reusing English CQA data for building a Bulgarian system. The evaluation results show improvement over the baseline and can serve as a basis for further research.
+ 2016.clib-1.7
+ mihaylova-etal-2016-finding
+
+
+ Quotation Retrieval System for Bulgarian Media Content
+ SvetlaKoeva
+ IvelinaStoyanova
+ MartinYalamov
+ 64–73
+ This paper presents a method for automatic retrieval and attribution of quotations from media texts in Bulgarian. It involves recognition of report verbs (including their analytical forms) and syntactic patterns introducing quotations, as well as source attribution of the quote by identification of personal names, descriptors, and anaphora. The method is implemented in a fully-functional online system which offers a live service processing media content and extracting quotations on a daily basis. The system collects and processes written news texts from six Bulgarian media websites. The results are presented in a structured way with description, as well as sorting and filtering functionalities which facilitate the monitoring and analysis of media content. The method has been applied to extract quotations from English texts as well and can be adapted to work with other languages, provided that the respective language specific resources are supplied.
+ 2016.clib-1.8
+ koeva-etal-2016-quotation
+
+
+ Stress Patterns of Compounds and MWEs in English and Bulgarian
+ BistraPopovska
+ RositsaDekova
+ 74–77
+ The paper presents an ongoing research on the stress patterns of compounds and MWEs of the type ADJ+N and their corresponding free NPs in English and Bulgarian. The research focuses on the identification and the formal representation of the possible stress patterns of compounds and MWEs and free NPs. During our research so far, we have compiled a corpus of over 2000 compounds and MWEs, approx. 1000 for each language – English and Bulgarian. Our theoretical framework includes elements from different theories, i.e. the Generative Phonology Theory, the Metrical Theory, and the Theory of Primary accent first which all define the stress as a prosodic element. Our main goals are to specify the prosodic region where the stress is defined in English and Bulgarian MWEs and noun phrases and to define the main features of the stress in MWEs and free NPs in English and Bulgarian. The results of our research can serve for implementation into NLP modules for spoken language processing and generation.
+ 2016.clib-1.9
+ popovska-dekova-2016-stress
+
+
+ Verbal Multiword Expressions in Croatian
+ KrešimirŠojat
+ MateaFilko
+ DašaFarkaš
+ 78–85
+ The paper deals with verbal multiword expressions in Croatian. We focus on four types of verbal constructions: light verb constructions, i.e. constructions consisting of a light verb and a noun or prepositional phrase, complex predicate constructions, i.e. constructions consisting of a finite and infinitive verb, prepositional verb constructions, i.e. constructions consisting of a verb and a typical preposition, and, finally, verbal idioms, i.e. constructions with completely idiosyncratic meanings. All the constructions are annotated in the Universal Dependency treebank for Croatian. The identification of verbal multiword expressions is an important task in numerous NLP tasks. It is also important to define and delimitate this concept in linguistic theory.
+ 2016.clib-1.10
+ sojat-etal-2016-verbal
+
+
+ A Simple Approach to Unifying Ambiguously Encoded Kurdish Characters
+ SardarJaf
+ 86–94
+ In this study we outline a potential problem in the normalisation stage of processing texts that are based on a modified version of the Arabic alphabet. The main source of resources available for processing resource-scarce languages is raw text. We have identified an interesting challenge that must be addressed when normalising certain natural language texts. Many less-resourced languages, such as Kurdish, Farsi, Urdu, Pashtu, etc., use a modified version of the Arabic writing system. Many characters in harvested data from the Internet may have exactly the same form but encoded with different Unicode values (ambiguous characters). It is important to identify ambiguous characters during the normalisation stage of most text processing tasks. We will demonstrate cases related to ambiguous Kurdish and Farsi characters and propose a semi-automatic approach to identifying and unifying ambiguously encoded characters.
+ 2016.clib-1.11
+ jaf-2016-simple
+
+
+ A Possible Solution to the Problem of Machine Translation of Verb Forms from Bulgarian to English
+ TodorLazarov
+ 95–100
+ The paper’s main subject is concerned with the problems related to machine translation of verb forms from Bulgarian to English. In separate sections of this article we discuss the problems related to differences between word formation in both languages and differences in the information that the verb forms grammaticalize. We also introduce the idea of implementing the statistical method of machine translation altogether with the rule-based method as a proposal for future research and the possible practical and theoretical outcomes.
+ 2016.clib-1.12
+ lazarov-2016-possible
+
+
+
diff --git a/data/xml/2018.clib.xml b/data/xml/2018.clib.xml
new file mode 100644
index 0000000000..d7eeaf62ea
--- /dev/null
+++ b/data/xml/2018.clib.xml
@@ -0,0 +1,240 @@
+
+
+
+
+ Proceedings of the Third International Conference on Computational Linguistics in Bulgaria (CLIB 2018)
+ Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences
+ Sofia, Bulgaria
+ May
+ 2018
+ 2018.clib-1
+ clib
+
+
+ 222
+ 2018.clib-1.0
+ clib-2018-1
+
+
+ With a little help from NLP: My Language Technology applications with impact on society
+ RuslanMitkov
+ 1–4
+ The keynote speech presents the speaker’s vision that research should lead to the development of applications which benefit society. To support this, the speaker will present three original methodologies proposed by him which underpin applications jointly implemented with colleagues from across his research group. These Language Technology tools already have a substantial societal impact in the following areas: learning and assessment, translation and care for people with language disabilities.
+ 2018.clib-1.1
+ mitkov-2018-little
+
+
+ NLP-based Assessment of Reading Efficiency in Early Grade Children
+ VitoPirrelli
+ 5–6
+ Assessing reading skills is a laborious and time-consuming task, which requires monitoring a variety of interlocked abilities, ranging from accurate word rendering, reading fluency and lexical access, to linguistic comprehension, and interpretation, management and inference of complex events in working memory. No existing software, to our knowledge, is able to cover and integrate reading performance monitoring, instant feedback, personalised potentiation and intelligent decision support to teachers and speech therapists, assessment of response to intervention. NLP and ICT technologies can make such an ambitious platform an achievable target.
+ 2018.clib-1.2
+ pirrelli-2018-nlp
+
+
+ Figurative language processing: A developmental and NLP Perspective
+ MilaVulchanova
+ ValentinVulchanov
+ 7–14
+ It is now common to employ evidence from human behaviour (e.g., child development) for the creation of computational models of this behaviour with a variety of applications (e.g., in developmental robotics). In this paper we address research in the comprehension and processing of figurative (non-literal) language in highly verbal individuals with autism in comparison with age- and language level-matched neuro-typical individuals and discuss critically what factors might account for the observed problems. Based on this evidence we try to outline the strategies used by human language users in understanding non-literal/non-compositional expressions and proceed to identifying possible solutions for automated language systems in the domain of idiomatic expressions.
+ 2018.clib-1.3
+ vulchanova-vulchanov-2018-figurative
+
+
+ Abstractive Text Summarization with Application to Bulgarian News Articles
+ NikolaTaushanov
+ IvanKoychev
+ PreslavNakov
+ 15–22
+ With the development of the Internet, a huge amount of information is available every day. Therefore, text summarization has become critical part of our first access to the information. There are two major approaches for automatic text summarization: abstractive and extractive. In this work, we apply abstractive summarization algorithms on a corpus of Bulgarian news articles. In particular, we compare selected algorithms of both techniques and we show results which provide evidence that the selected state-of-the-art algorithms for abstractive text summarization perform better than the extractive ones for articles in Bulgarian. For the purpose of our experiments we collected a new dataset consisting of around 70,000 news articles and their topics. For research purposes we are also sharing the tools to easily collect and process such datasets.
+ 2018.clib-1.4
+ taushanov-etal-2018-abstractive
+
+
+ Towards Lexical Meaning Formal Representation by virtue of the NL-DL Definition Transformation Method
+ MariaGritz
+ 23–33
+ The paper represents a part of an extensive study devoted to the issues of lexical meaning formal representation in OWL 2 DL notation. Both theoretical and methodological aspects of lexical meaning formalization within the framework of an ontology are observed in the paper. Model-theoretic semantics paradigm and Kripke model are considered to form a theoretical background for formalization of lexical meaning, whereas the NL-DL definition transformation method is investigated as a method designed to provide us with acceptable formal definitions in OWL 2 DL notation with natural language definitions given at the input. A brief critical study of the method has allowed to reveal particular problematic cases of the method application, which arise due to syntactic peculiarities of natural language definitions given at the input.
+ 2018.clib-1.5
+ gritz-2018-towards
+
+
+ Narrow Productivity, Competition, and Blocking in Word Formation
+ JunyaMorita
+ 34–40
+ The present study explores the productivity of word formation processes in English, focusing on word composition by suffixes such as -ize (e.g. transcendentalize), -(a)(t)ion (territorization), and -al (realizational). An optimal productivity measure for affixation is identified, which makes best use of hapax legomena in a large-scale corpus and attaches great importance to the base forms of an affix. This measure is then applied to the data collected from a large corpus to compute the productivity values of twelve kinds of affixes. The detailed investigation reveals that (i) the high productivity rate of an affix demonstrates a creative aspect of the affix, giving full support to the idea of “generative” morphology, (ii) productivity is gradient; very high, fairly high, and low productivity of affixes are recognizable, and (iii) this is necessarily reflected in determining the word form of a derivative (cf. territorization); competition is carried out to decide which affix is selected for a given base form (territorize) and the “losers” (-ment/-al) are blocked out.
+ 2018.clib-1.6
+ morita-2018-narrow
+
+
+ Knowledge and Rule-Based Diacritic Restoration in Serbian
+ CvetanaKrstev
+ RankaStanković
+ DuškoVitas
+ 41–51
+ In this paper we present a procedure for the restoration of diacritics in Serbian texts written using the degraded Latin alphabet. The procedure relies on the comprehensive lexical resources for Serbian: the morphological electronic dictionaries, the Corpus of Contemporary Serbian and local grammars. Dictionaries are used to identify possible candidates for the restoration, while the data obtained from SrpKor and local grammars assists in making a decision between several candidates in cases of ambiguity. The evaluation results reveal that, depending on the text, accuracy ranges from 95.03% to 99.36%, while the precision (average 98.93%) is always higher than the recall (average 94.94%).
+ 2018.clib-1.7
+ krstev-etal-2018-knowledge
+
+
+ Perfect Bulgarian Hyphenation and/or How not to Stutter at End-of-line
+ AntonZinoviev
+ 52–61
+ What is Perfect Bulgarian Hyphenation? We know that it has to be based somehow on the syllables and on the morphology but considering that these two factors often contradict each other, how exactly are we going to combine them? And speaking about syllables, what are they and how are we going to determine them? Also, how are we going to find the morphemes in the words? Don’t we have to develop an electronic derivational dictionary of the Bulgarian language? Isn’t all this going to be forbiddingly difficult?
+ 2018.clib-1.8
+ zinoviev-2018-perfect
+
+
+ Russian Bridging Anaphora Corpus
+ AnnaRoitberg
+ DenisKhachko
+ 62–68
+ In this paper, we present a bridging anaphora corpus for Russian, introduce a syntactic approach for bridging annotation and discuss the difference between the syntactic and semantic approaches. We also discuss some special aspects of bridging annotation for Russian and other languages where definite nominal groups are not marked so frequently as e.g. in Romance or Germanic languages. In the end we list the main cases of annotator disagreement.
+ 2018.clib-1.9
+ roitberg-khachko-2018-russian
+
+
+ Aspectual and Temporal Characteristics of the Past Active Participles in Bulgarian – a Corpus-based Study
+ EkaterinaTarpomanova
+ 69–76
+ The paper presents a corpus-based study of the past active participles in Bulgarian with respect of their aspectual and temporal characteristics. As this type of participles combine two morphological markers, a special attention is paid on their interaction in different tenses, moods and evidentials. The source of language material used for the study is the Bulgarian National Corpus. The paper is organized in terms of morphological oppositions, aspectual and temporal, analyzing the functions of the participles in compound verbal forms.
+ 2018.clib-1.10
+ tarpomanova-2018-aspectual
+
+
+ Unmatched Feminitives in a Corpus of Bulgarian and Ukrainian Parallel Texts
+ OlenaSiruk
+ IvanDerzhanski
+ 77–84
+ Feminitives are formed and used in all Slavic languages, but the productivity of their formation and the intensity of their use are not the same everywhere. They are often subject to various intralinguistic and extralinguistic restrictions. In this paper we present a study of feminitives based on a parallel Bulgarian–Ukrainian corpus, with a focus on those occasions on which a feminitive in one language corresponds to a masculine (rarely neuter) noun in the other. The experiment shows that Bulgarian uses feminitives with considerably greater regularity than Ukrainian does, and we discuss the semantic classes of nouns that fail to form feminitives most often and the effect of the source language in translated text and of the author’s and translator’s individual preferences.
+ 2018.clib-1.11
+ siruk-derzhanski-2018-unmatched
+
+
+ The Bulgarian Summaries Corpus
+ ViktoriyaPetrova
+ 85–92
+ This article aims to present the Bulgarian Summaries Corpus, its advantages, its purpose and why it is necessary. It explains the selection of texts and process of summarization and the tool used, in addition of a quick overview of the current situation in Bulgaria. The paper also presents a general outline of the market needs, the use of this kind of tools and a short list of examples of a variety of corpora around the world both in language and field.
+ 2018.clib-1.12
+ petrova-2018-bulgarian
+
+
+ Ontologies for Natural Language Processing: Case of Russian
+ NataliaLoukachevitch
+ BorisDobrov
+ 93–103
+ The paper describes the RuThes family of Russian thesauri intended for natural language processing and information retrieval applications. RuThes-like thesauri include, besides RuThes, Sociopolitical thesaurus, Security Thesaurus, and Ontology on Natural Sciences and Technologies. The RuThes format is based on three approaches for developing computer resources: Princeton WordNet, information-retrieval thesauri, and formal ontologies. The published version of RuThes thesaurus (RuThes-lite 2.0) became a basis for semi-automatic generation of RuWordNet, WordNet-like thesaurus for Russian. Currently researchers can use both RuThes-lite or RuWordNet and compare them in applications. Other RuThes-like resources are being prepared to publication.
+ 2018.clib-1.13
+ loukachevitch-dobrov-2018-ontologies
+
+
+ Resource-based WordNet Augmentation and Enrichment
+ RankaStanković
+ MiljanaMladenović
+ IvanObradović
+ MarkoVitas
+ CvetanaKrstev
+ 104–114
+ In this paper we present an approach to support production of synsets for Serbian WordNet (SerWN) by adjusting Princeton WordNet (PWN) synsets using several bilingual English-Serbian resources. PWN synset definitions were automatically translated and post-edited, if needed, while candidate literals for Serbian synsets were obtained automatically from a list of translational equivalents compiled from bilingual resources. Preliminary results obtained from a set of 1248 selected PWN synsets show that the produced Serbian synsets contain 4024 literals, out of which 2278 were offered by the system we present in this paper, whereas experts added the remaining 1746. Approximately one half of synset definitions obtained automatically were accepted with no or minor corrections. These first results are encouraging, since the efficiency of synset production for SerWN was increased. There is also space for further improvement of this approach to wordnet enrichment.
+ 2018.clib-1.14
+ stankovic-etal-2018-resource
+
+
+ Classifying Verbs in WordNet by Harnessing Semantic Resources
+ SvetlozaraLeseva
+ IvelinaStoyanova
+ MariaTodorova
+ 115–125
+ This paper presents the principles and procedures involved in the construction of a classification of verbs using information from 3 semantic resources – WordNet, FrameNet and VerbNet. We adopt the FrameNet frames as the primary categories of the proposed classification and transfer them to WordNet synsets. The hierarchical relationships between the categories are projected both from the hypernymy relation in WordNet and from the hierarchy of some of the frame-to-frame relations in FrameNet. The semantic classes and their hierarchical organisation in WordNet are thus made explicit and allow for linguistic generalisations on the inheritance of semantic features and structures. We then select the beginners of the separate hierarchies and assign classification categories recursively to their hyponyms using a battery of procedures based on generalisations over the semantic primes and the hierarchical structure of WordNet and FrameNet and correspondences between VerbNet superclasses and FrameNet frames. The so-obtained suggestions are ranked according to probability. As a result, 13,465 out of 14,206 verb synsets are accommodated in the classification hierarchy at least through a general category, which provides a point of departure towards further refinement of categories. The resulting system of classification categories is initially derived from the WordNet hierarchy and is further validated against the hierarchy of frames within FrameNet. A set of procedures is established to address inconsistencies and heterogeneity of categories. The classification is subject to ongoing extensive manual verification, essential for ensuring the quality of the resource.
+ 2018.clib-1.15
+ leseva-etal-2018-classifying
+
+
+ A Pilot Study for Enriching the Romanian WordNet with Medical Terms
+ MariaMitrofan
+ VerginicaBarbu Mititelu
+ GrigorinaMitrofan
+ 126–134
+ This paper presents the preliminary investigations in the process of integrating a specialized vocabulary, namely medical terminology, into the Romanian wordnet. We focus here on four classes from this vocabulary: anatomy (or body parts), disorders, medical procedures and chemicals. In this pilot study we selected two large concepts from each class and created the Romanian terminological (sub)trees for each of them, starting from a medical thesaurus (SNOMED CT) and translating the terms, process which raised various challenges, all of them asking for the expertise of a specialist in the health care domain. The integration of these (sub)trees in the Romanian wordnet also required careful decision making, given the structural differences between a wordnet and a terminological thesaurus. They are presented and discussed herein.
+ 2018.clib-1.16
+ mitrofan-etal-2018-pilot
+
+
+ Factors and Features Determining the Inheritance of Semantic Primes between Verbs and Nouns within WordNet
+ IvelinaStoyanova
+ 135–145
+ The paper outlines the mechanisms of inheriting semantic content between verbs and nouns as a result of derivational relations. The main factors determining the inheritance are: (1) the semantic class of the verb as represented by the noun; (2) the subcategorisation frame and argument structure of the verb predicate; (3) the derivational relation between the verb and the noun, as well as the resulting semantic relation made explicit through the derivation; (4) hierarchical relations within WordNet. The paper explores three types of verb-noun prime inheritance relations: (a) universal – not depending on the argument structure, which are eventive or circumstantial; (b) general – specific to classes of verbs, for example agentive or non-agentive; (c) verb-specific – depending on the specific subcategorisation frame of the verb as presented in VerbNet and/or FrameNet. The paper presents a possibility for extended coverage of semantic relations based on information about the argument structure of verbs. Further, the work focuses on the regularities in the way in which derivationally related nouns inherit semantic characteristics of the predicate. These regularities can be applied for the purposes of predicting derivationally and semantically related synsets within WordNet, as well as for the creation of language specific synsets, for consistency checks and verification.
+ 2018.clib-1.17
+ stoyanova-2018-factors
+
+
+ Online Editor for WordNets
+ BorislavRizov
+ TsvetanaDimitrova
+ 146–152
+ The paper presents an online editor for lexical-semantic databases with relational structure similar to the structure of WordNet – Hydra for Web. It supports functionalities for editing of relational data (including query, creation, change, and linking of relational objects), simultaneous access of multiple user profiles, parallel data visualization and editing of the data on top of single- and parallel mode visualization of the language data.
+ 2018.clib-1.18
+ rizov-dimitrova-2018-online
+
+
+ The Effect of Unobserved Word-Context Co-occurrences on a VectorMixture Approach for Compositional Distributional Semantics
+ AmirBakarov
+ 153–161
+ Swivel (Submatrix-WIse Vector Embedding Learner) is a distributional semantic model based on counting point-wise mutual information values, capable of capturing word-context co-occurrences in the PMI matrix that were not noted in the training corpus. This model outperforms mainstream word embedding training algorithms such as Continuous Bag-of-Words, GloVe and Skip-Gram in word similarity and word analogy tasks. But the properness of these intrinsic tasks could be questioned, and it is unclear if the ability to count unobservable word-context co-occurrences could also be helpful for downstream tasks. In this work we propose a comparison of Word2Vec and Swivel for two downstream tasks based on natural language sentence matching: the paraphrase detection task and the textual entailment task. As a result, we reveal that Swivel outperforms Word2Vec in both cases, but the difference is minuscule. We can conclude, that the ability to learn embeddings for rarely co-occurring words is not so crucial for downstream tasks.
+ 2018.clib-1.19
+ bakarov-2018-effect
+
+
+ Introducing Computational Linguistics and NLP to High School Students
+ RositsaDekova
+ AdelinaRadeva
+ 162–168
+ The paper addresses a possible way of introducing core concepts of Computational Linguistics through problems given at the linguistic contests organized for high school students in Bulgaria and abroad. Following a brief presentation of the foundation and the underlying objective of these contests, we outline some of the types of problems as reflecting the different levels of language processing and the diversity of approaches and tasks to be solved. By presenting the variety of problems given so far through the years, we would like to attract the attention of the academic community to this captivating method through which high school students might be acquainted with the challenges and the main goals of Computational Linguistics (CL) and Natural Language Processing (NLP).
+ 2018.clib-1.20
+ dekova-radeva-2018-introducing
+
+
+ Linguistic Problems on Number Names
+ IvanDerzhanski
+ MilenaVeneva
+ 169–176
+ This paper presents a contrastive investigation of linguistic problems based on number names in different languages and intended for secondary-school students. We examine the eight problems of this type that have been assigned at the International Linguistics Olympiad throughout the years and compare the phenomena in the number systems featured there with those of the working languages of the Olympiad and other languages known to be familiar to the participants. On the basis of a statistical analysis of the results achieved by the contestants we draw conclusions regarding the ways in which the difficulty of a problem depends on its structure and the kinds of linguistic phenomena featured in it.
+ 2018.clib-1.21
+ derzhanski-veneva-2018-linguistic
+
+
+ Parallel Web Display of Transcribed Spoken Bulgarian with its Normalised Version and an Indexed List of Lemmas
+ MarinaDzhonova
+ Kjetil RøaHauge
+ YovkaTisheva
+ 177–184
+ We present and discuss problems in creating a lemmatised index to transcriptions of Bulgarian speech, including the prerequisites for such an index, and why we consider an index preferable to a search engine for this particular kind of text.
+ 2018.clib-1.22
+ dzhonova-etal-2018-parallel
+
+
+ Integrating Crowdsourcing in Language Learning
+ GeorgiDzhumayov
+ 185–192
+ This article aims to illustrate the use of crowdsourcing in an educational context. The practical part illustrates and provides the results of an online test conducted among 12th grade high school students from Bulgaria in order to gain new knowledge, find out common characteristics among the tenses and revise for their upcoming exams. They along with some interesting and inspiring teaching ideas could be used in an educational environment to provide easier, quicker and more interactive acquisition of a language. The experiment has been conducted by means of Google forms and sets the beginning of the establishment of an annotated corpus of right and wrong uses of the Bulgarian and English tenses too.
+ 2018.clib-1.23
+ dzhumayov-2018-integrating
+
+
+ Bulgarian–English Parallel Corpus for the Purposes of Creating Statistical Translation Model of the Verb Forms. General Conception, Structure, Resources and Annotation
+ TodorLazarov
+ 193–202
+ This paper describes the process of creating a Bulgarian-English parallel corpus for the purposes of constructing a statistical translation model for verb forms in both languages. We briefly introduce the scientific problem behind the corpus, its main purpose, general conception, linguistic resources and annotation conception. In more details we describe the collection of language data for the purposes of creating the corpus, the preparatory processing of the gathered data, the annotation rules based on the characteristics of the gathered data and the chosen software. We discuss the current work on the training model and the future work on this linguistic resource and the aims of the scientific project.
+ 2018.clib-1.24
+ lazarov-2018-bulgarian
+
+
+ Fingerprints in SMS messages: Automatic Recognition of a Short Message Sender Using Gradient Boosting
+ BranislavaŠandrih
+ 203–210
+ This paper considers the following question: Is it possible to tell who is the short message sender just by analyzing a typing style of the sender, and not the meaning of the content itself? If possible, how reliable would the judgment be? Are we leaving some kind of “fingerprint” when we text, and can we tell something about others based just on their typing style? For this purpose, a corpus of ∼ 5,500 SMS messages was gathered from one person’s cell phone and two gradient boost classifiers were built: first one is trying to distinguish whether the message was sent by this exact person (cell phone owner) or by someone else; second one was trained to distinguish between messages sent by some public service (e.g. parking service, bank reports etc.) and messages sent by humans. The performance of the classifiers was evaluated in the 5-fold cross-validation setting, resulting in 73.6% and 99.3% overall accuracy for the first and the second classifier, respectively.
+ 2018.clib-1.25
+ sandrih-2018-fingerprints
+
+
+
diff --git a/data/xml/2020.clib.xml b/data/xml/2020.clib.xml
index a44484ca54..41ab77f7fd 100644
--- a/data/xml/2020.clib.xml
+++ b/data/xml/2020.clib.xml
@@ -2,7 +2,7 @@
- Proceedings of the 4th International Conference on Computational Linguistics in Bulgaria (CLIB 2020)
+ Proceedings of the Fourth International Conference on Computational Linguistics in Bulgaria (CLIB 2020)
Department of Computational Linguistics, IBL -- BAS
Sofia, Bulgaria
September
diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml
index 140dab2dce..c618222ef2 100644
--- a/data/xml/2020.findings.xml
+++ b/data/xml/2020.findings.xml
@@ -3257,6 +3257,7 @@
2020.findings-emnlp.217
10.18653/v1/2020.findings-emnlp.217
qi-etal-2020-prophetnet
+
BookCorpus
C4
CNN/Daily Mail
diff --git a/data/xml/2020.trac.xml b/data/xml/2020.trac.xml
index 64174f8ecd..72b608b50d 100644
--- a/data/xml/2020.trac.xml
+++ b/data/xml/2020.trac.xml
@@ -92,6 +92,7 @@
eng
suryawanshi-etal-2020-multimodal
bharathichezhiyan/Multimodal-Meme-Classification-Identifying-Offensive-Content-in-Image-and-Text
+ MultiOFF
A Comparative Study of Different State-of-the-Art Hate Speech Detection Methods in Hindi-English Code-Mixed Data
diff --git a/data/xml/2021.emnlp.xml b/data/xml/2021.emnlp.xml
index 8e5dd99bfe..22591b3ca9 100644
--- a/data/xml/2021.emnlp.xml
+++ b/data/xml/2021.emnlp.xml
@@ -1963,6 +1963,7 @@
chi-etal-2021-mt6
10.18653/v1/2021.emnlp-main.125
+
MLQA
PAWS-X
TyDiQA
diff --git a/data/xml/2021.findings.xml b/data/xml/2021.findings.xml
index 7f9959b567..95f5625ffb 100644
--- a/data/xml/2021.findings.xml
+++ b/data/xml/2021.findings.xml
@@ -473,7 +473,7 @@
10.18653/v1/2021.findings-acl.32
gritta-iacobacci-2021-xeroalign
- huawei-noah/noah-research
+ huawei-noah/noah-research
MTOP
PAWS
PAWS-X
diff --git a/data/xml/2021.unimplicit.xml b/data/xml/2021.unimplicit.xml
index 5d00bda5f0..4aaada9922 100644
--- a/data/xml/2021.unimplicit.xml
+++ b/data/xml/2021.unimplicit.xml
@@ -103,6 +103,7 @@
10.18653/v1/2021.unimplicit-1.8
wiriyathammabhum-2021-ttcb
+ perathambkk/unimplicit_shared_task_acl_2021
diff --git a/data/xml/2022.clib.xml b/data/xml/2022.clib.xml
index d616b576cf..0813f83de8 100644
--- a/data/xml/2022.clib.xml
+++ b/data/xml/2022.clib.xml
@@ -2,7 +2,7 @@
- Proceedings of the 5th International Conference on Computational Linguistics in Bulgaria (CLIB 2022)
+ Proceedings of the Fifth International Conference on Computational Linguistics in Bulgaria (CLIB 2022)
Department of Computational Linguistics, IBL -- BAS
Sofia, Bulgaria
September
diff --git a/data/xml/2022.findings.xml b/data/xml/2022.findings.xml
index 06ca74ec66..1bba4a2d66 100644
--- a/data/xml/2022.findings.xml
+++ b/data/xml/2022.findings.xml
@@ -2728,7 +2728,7 @@
schroder-etal-2022-revisiting
10.18653/v1/2022.findings-acl.172
- webis-de/acl-22
+ webis-de/acl22-revisiting-uncertainty-based-query-strategies-for-active-learning-with-transformers
AG News
MR
SUBJ
diff --git a/data/xml/2022.lrec.xml b/data/xml/2022.lrec.xml
index 1f0716e036..ba7d9bad79 100644
--- a/data/xml/2022.lrec.xml
+++ b/data/xml/2022.lrec.xml
@@ -588,7 +588,7 @@
Skill Classification (SC) is the task of classifying job competences from job postings. This work is the first in SC applied to Danish job vacancy data. We release the first Danish job posting dataset: *Kompetencer* (_en_: competences), annotated for nested spans of competences. To improve upon coarse-grained annotations, we make use of The European Skills, Competences, Qualifications and Occupations (ESCO; le Vrang et al., 2014) taxonomy API to obtain fine-grained labels via distant supervision. We study two setups: the zero-shot and few-shot classification settings. We fine-tune English-based models and RemBERT (Chung et al., 2020) and compare them to in-language Danish models. Our results show RemBERT significantly outperforms all other models in both the zero-shot and the few-shot setting.
2022.lrec-1.46
zhang-etal-2022-kompetencer
- Kaleidophon/deep-significance
+ jjzha/kompetencer
Kompetencer
SkillSpan
@@ -6518,6 +6518,7 @@
2022.lrec-1.523
bhattarai-etal-2022-explainable
+ PolitiFact
Enhancing Deep Learning with Embedded Features for Arabic Named Entity Recognition
diff --git a/data/xml/2022.naacl.xml b/data/xml/2022.naacl.xml
index b3dd2a5f80..07ad39e8b8 100644
--- a/data/xml/2022.naacl.xml
+++ b/data/xml/2022.naacl.xml
@@ -5978,7 +5978,7 @@
zhang-etal-2022-skillspan
10.18653/v1/2022.naacl-main.366
- Kaleidophon/deep-significance
+ kris927b/skillspan
SkillSpan
diff --git a/data/xml/2023.findings.xml b/data/xml/2023.findings.xml
index fd1cbbd68f..3fab5b7585 100644
--- a/data/xml/2023.findings.xml
+++ b/data/xml/2023.findings.xml
@@ -9502,7 +9502,7 @@
Computer says “No”: The Case Against Empathetic Conversational AI
- AlbaCercas CurryUniversity of Leeds
+ AlbaCurryUniversity of Leeds
AmandaCercas CurryBocconi University
8123-8130
Emotions are an integral part of human cognition and they guide not only our understanding of the world but also our actions within it. As such, whether we soothe or flame an emotion is not inconsequential. Recent work in conversational AI has focused on responding empathetically to users, validating and soothing their emotions without a real basis. This AI-aided emotional regulation can have negative consequences for users and society, tending towards a one-noted happiness defined as only the absence of “negative” emotions. We argue that we must carefully consider whether and how to respond to users’ emotions.
diff --git a/data/xml/2023.nejlt.xml b/data/xml/2023.nejlt.xml
new file mode 100644
index 0000000000..c4cc5d8cc7
--- /dev/null
+++ b/data/xml/2023.nejlt.xml
@@ -0,0 +1,245 @@
+
+
+
+
+ Northern European Journal of Language Technology, Volume 9
+ LeonDerczynski
+ Linköping University Electronic Press
+ Linköping, Sweden
+ https://doi.org/10.3384/nejlt.2000-1533.9.1
+ 2023
+ 2023.nejlt-1
+ nejlt
+
+
+ Resource papers as registered reports: a proposal
+ Emielvan Miltenburg
+ This is a proposal for publishing resource papers as registered reports in the Northern European Journal of Language Technology. The idea is that authors write a data collection plan with a full data statement, to the extent that it can be written before data collection starts. Once the proposal is approved, publication of the final resource paper is guaranteed, as long as the data collection plan is followed (modulo reasonable changes due to unforeseen circumstances). This proposal changes the reviewing process from an antagonistic to a collaborative enterprise, and hopefully encourages NLP researchers to develop and publish more high-quality datasets. The key advantage of this proposal is that it helps to promote responsible resource development (through constructive peer review) and to avoid research waste.
+ 2023.nejlt-1.1
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4884
+ van-miltenburg-2023-resource
+
+
+ PARSEME Meets Universal Dependencies: Getting on the Same Page in Representing Multiword Expressions
+ AgataSavary
+ SaraStymne
+ Verginica BarbuMititelu
+ NathanSchneider
+ CarlosRamisch
+ JoakimNivre
+ Multiword expressions (MWEs) are challenging and pervasive phenomena whose idiosyncratic properties show notably at the levels of lexicon, morphology, and syntax. Thus, they should best be annotated jointly with morphosyntax. We discuss two multilingual initiatives, Universal Dependencies and PARSEME, addressing these annotation layers in cross-lingually unified ways. We compare the annotation principles of these initiatives with respect to MWEs, and we put forward a roadmap towards their gradual unification. The expected outcomes are more consistent treebanking and higher universality in modeling idiosyncrasy.
+ 2023.nejlt-1.2
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4453
+ savary-etal-2023-parseme-meets
+
+
+ Barriers and enabling factors for error analysis in NLG research
+ Emielvan Miltenburg
+ MirunaClinciu
+ OndřejDušek
+ DimitraGkatzia
+ StephanieInglis
+ LeoLeppänen
+ SaadMahamood
+ StephanieSchoch
+ CraigThomson
+ LuouWen
+ Earlier research has shown that few studies in Natural Language Generation (NLG) evaluate their system outputs using an error analysis, despite known limitations of automatic evaluation metrics and human ratings. This position paper takes the stance that error analyses should be encouraged, and discusses several ways to do so. This paper is based on our shared experience as authors as well as a survey we distributed as a means of public consultation. We provide an overview of existing barriers to carrying out error analyses, and propose changes to improve error reporting in the NLG literature.
+ 2023.nejlt-1.3
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4529
+ van-miltenburg-etal-2023-barriers
+
+
+ Benchmark for Evaluation of Danish Clinical Word Embeddings
+ Martin SundahlLaursen
+ Jannik SkyttegaardPedersen
+ Pernille JustVinholt
+ Rasmus SøgaardHansen
+ Thiusius RajeethSavarimuthu
+ In natural language processing, benchmarks are used to track progress and identify useful models. Currently, no benchmark for Danish clinical word embeddings exists. This paper describes the development of a Danish benchmark for clinical word embeddings. The clinical benchmark consists of ten datasets: eight intrinsic and two extrinsic. Moreover, we evaluate word embeddings trained on text from the clinical domain, general practitioner domain and general domain on the established benchmark. All the intrinsic tasks of the benchmark are publicly available.
+ 2023.nejlt-1.4
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4132
+ laursen-etal-2023-benchmark
+
+
+ NL-Augmenter: A Framework for Task-Sensitive Natural Language Augmentation
+ KaustubhDhole
+ VarunGangal
+ SebastianGehrmann
+ AadeshGupta
+ ZhenhaoLi
+ SaadMahamood
+ AbinayaMahendiran
+ SimonMille
+ AshishShrivastava
+ SamsonTan
+ TongshuangWu
+ JaschaSohl-Dickstein
+ JinhoChoi
+ EduardHovy
+ OndřejDušek
+ SebastianRuder
+ SajantAnand
+ NagenderAneja
+ RabinBanjade
+ LisaBarthe
+ HannaBehnke
+ IanBerlot-Attwell
+ ConnorBoyle
+ CarolineBrun
+ Marco Antonio SobrevillaCabezudo
+ SamuelCahyawijaya
+ EmileChapuis
+ WanxiangChe
+ MukundChoudhary
+ ChristianClauss
+ PierreColombo
+ FilipCornell
+ GautierDagan
+ MayukhDas
+ TanayDixit
+ ThomasDopierre
+ Paul-AlexisDray
+ SuchitraDubey
+ TatianaEkeinhor
+ Marco DiGiovanni
+ TanyaGoyal
+ RishabhGupta
+ LouanesHamla
+ SangHan
+ FabriceHarel-Canada
+ AntoineHonoré
+ IshanJindal
+ PrzemysławJoniak
+ DenisKleyko
+ VenelinKovatchev
+ KalpeshKrishna
+ AshutoshKumar
+ StefanLanger
+ Seungjae RyanLee
+ Corey JamesLevinson
+ HualouLiang
+ KaizhaoLiang
+ ZhexiongLiu
+ AndreyLukyanenko
+ VukosiMarivate
+ Gerardde Melo
+ SimonMeoni
+ MaxineMeyer
+ AfnanMir
+ Nafise SadatMoosavi
+ NiklasMuennighoff
+ Timothy Sum HonMun
+ KentonMurray
+ MarcinNamysl
+ MariaObedkova
+ PritiOli
+ NivranshuPasricha
+ JanPfister
+ RichardPlant
+ VinayPrabhu
+ VasilePais
+ LiboQin
+ ShahabRaji
+ Pawan KumarRajpoot
+ VikasRaunak
+ RoyRinberg
+ NicholasRoberts
+ Juan DiegoRodriguez
+ ClaudeRoux
+ VasconcellosSamus
+ AnanyaSai
+ RobinSchmidt
+ ThomasScialom
+ TshephishoSefara
+ SaqibShamsi
+ XudongShen
+ YiwenShi
+ HaoyueShi
+ AnnaShvets
+ NickSiegel
+ DamienSileo
+ JamieSimon
+ ChandanSingh
+ RomanSitelew
+ PriyankSoni
+ TaylorSorensen
+ WilliamSoto
+ AmanSrivastava
+ AdityaSrivatsa
+ TonySun
+ MukundVarma
+ ATabassum
+ FionaTan
+ RyanTeehan
+ MoTiwari
+ MarieTolkiehn
+ AthenaWang
+ ZijianWang
+ ZijieWang
+ GloriaWang
+ FuxuanWei
+ BryanWilie
+ Genta IndraWinata
+ XinyuWu
+ WitoldWydmanski
+ TianbaoXie
+ UsamaYaseen
+ MichaelYee
+ JingZhang
+ YueZhang
+ Data augmentation is an important method for evaluating the robustness of and enhancing the diversity of training data for natural language processing (NLP) models. In this paper, we present NL-Augmenter, a new participatory Python-based natural language (NL) augmentation framework which supports the creation of transformations (modifications to the data) and filters (data splits according to specific features). We describe the framework and an initial set of 117 transformations and 23 filters for a variety of NL tasks annotated with noisy descriptive tags. The transformations incorporate noise, intentional and accidental human mistakes, socio-linguistic variation, semantically-valid style, syntax changes, as well as artificial constructs that are unambiguous to humans. We demonstrate the efficacy of NL-Augmenter by using its transformations to analyze the robustness of popular language models. We find different models to be differently challenged on different tasks, with quasi-systematic score decreases. The infrastructure, datacards, and robustness evaluation results are publicly available on GitHub for the benefit of researchers working on paraphrase generation, robustness analysis, and low-resource NLP.
+ 2023.nejlt-1.5
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4725
+ dhole-etal-2023-nl
+
+
+ On the Relationship between Frames and Emotionality in Text
+ EnricaTroiano
+ RomanKlinger
+ SebastianPadó
+ Emotions, which are responses to salient events, can be realized in text implicitly, for instance with mere references to facts (e.g., “That was the beginning of a long war”). Interpreting affective meanings thus relies on the reader’s background knowledge, but that is hardly modeled in computational emotion analysis. Much work in the field is focused on the word level and treats individual lexical units as the fundamental emotion cues in written communication. We shift our attention to word relations. We leverage Frame Semantics, a prominent theory for the description of predicate-argument structures, which matches the study of emotions: frames build on a “semantics of understanding” whose assumptions rely precisely on people’s world knowledge. Our overarching question is whether and to what extent the events that are represented by frames possess an emotion meaning. To carry out a large corpus-based correspondence analysis, we automatically annotate texts with emotions as well as with FrameNet frames and roles, and we analyze the correlations between them. Our main finding is that substantial groups of frames have an emotional import. With an extensive qualitative analysis, we show that they capture several properties of emotions that are purported by theories from psychology. These observations boost insights on the two strands of research that we bring together: emotion analysis can profit from the event-based perspective of frame semantics; in return, frame semantics gains a better grip of its position vis-à-vis emotions, an integral part of word meanings.
+ 2023.nejlt-1.6
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4361
+ troiano-etal-2023-relationship
+
+
+ An Empirical Configuration Study of a Common Document Clustering Pipeline
+ AntonEklund
+ MonaForsman
+ FrankDrewes
+ Document clustering is frequently used in applications of natural language processing, e.g. to classify news articles or create topic models. In this paper, we study document clustering with the common clustering pipeline that includes vectorization with BERT or Doc2Vec, dimension reduction with PCA or UMAP, and clustering with K-Means or HDBSCAN. We discuss the interactions of the different components in the pipeline, parameter settings, and how to determine an appropriate number of dimensions. The results suggest that BERT embeddings combined with UMAP dimension reduction to no less than 15 dimensions provide a good basis for clustering, regardless of the specific clustering algorithm used. Moreover, while UMAP performed better than PCA in our experiments, tuning the UMAP settings showed little impact on the overall performance. Hence, we recommend configuring UMAP so as to optimize its time efficiency. According to our topic model evaluation, the combination of BERT and UMAP, also used in BERTopic, performs best. A topic model based on this pipeline typically benefits from a large number of clusters.
+ 2023.nejlt-1.7
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4396
+ eklund-etal-2023-empirical
+
+
+ Prevention or Promotion? Predicting Author’s Regulatory Focus
+ AswathyVelutharambath
+ KaiSassenberg
+ RomanKlinger
+ People differ fundamentally in what motivates them to pursue a goal and how they approach it. For instance, some people seek growth and show eagerness, whereas others prefer security and are vigilant. The concept of regulatory focus is employed in psychology to explain and predict this goal-directed behavior of humans, underpinned by two unique motivational systems – the promotion and the prevention system. Traditionally, text analysis methods using closed vocabularies are employed to assess the distinctive linguistic patterns associated with the two systems. From an NLP perspective, automatically detecting the regulatory focus of individuals from text provides valuable insights into the behavioral inclinations of the author, finding applications in areas like marketing or health communication. However, the concept never made an impactful debut in computational linguistics research. To bridge this gap, we introduce the novel task of regulatory focus classification from text and present two complementary German datasets – (1) experimentally generated event descriptions and (2) manually annotated short social media texts used for evaluating the generalizability of models on real-world data. First, we conduct a correlation analysis to verify whether, and to what extent, the linguistic footprints of regulatory focus reported in psychology studies are observable in our datasets. For automatic classification, we compare closed-vocabulary-based analyses with a state-of-the-art BERT-based text classification model and observe that the latter outperforms lexicon-based approaches on experimental data and is notably better on out-of-domain Twitter data.
+ 2023.nejlt-1.8
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4561
+ velutharambath-etal-2023-prevention
+
+
+ Unsupervised Text Embedding Space Generation Using Generative Adversarial Networks for Text Synthesis
+ Jun-MinLee
+ Tae-BinHa
+ Generative Adversarial Networks (GANs) are models for data synthesis that create plausible data through the competition of a generator and a discriminator. Although the application of GANs to image synthesis has been studied extensively, they have inherent limitations for natural language generation. Because natural language is composed of discrete tokens, the generator has difficulty updating its gradient through backpropagation; therefore, most text-GAN studies generate sentences starting with a random token based on a reward system. Thus, the generators of previous studies are pre-trained in an autoregressive way before adversarial training, causing data memorization whereby the synthesized sentences reproduce the training data. In this paper, we synthesize sentences using a framework similar to the original GAN. More specifically, we propose Text Embedding Space Generative Adversarial Networks (TESGAN), which generate continuous text embedding spaces instead of discrete tokens to solve the gradient backpropagation problem. Furthermore, TESGAN conducts unsupervised learning which does not directly refer to the text of the training data, to overcome the data memorization issue. By adopting this novel method, TESGAN can synthesize new sentences, showing the potential of unsupervised learning for text synthesis. We expect to see extended research combining Large Language Models with a new perspective of viewing text as a continuous space.
+ 2023.nejlt-1.9
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4855
+ lee-ha-2023-unsupervised
+
+
+ QUA-RC: the semi-synthetic dataset of multiple choice questions for assessing reading comprehension in Ukrainian
+ MariiaZyrianova
+ DmytroKalpakchi
+ In this article we present the first dataset of multiple-choice questions (MCQs) for assessing reading comprehension in Ukrainian. The dataset is based on the texts from the Ukrainian national tests for reading comprehension, and the MCQs themselves were created semi-automatically in three stages. The first stage was to use GPT-3 to generate MCQs zero-shot; the second was to select MCQs of sufficient quality and revise the ones with minor errors; the final stage was to expand the dataset with MCQs written manually. The dataset was created by native speakers of Ukrainian, one of whom is also a language teacher. The resulting corpus has slightly more than 900 MCQs, of which only 43 could be kept exactly as generated by GPT-3.
+ 2023.nejlt-1.10
+ https://doi.org/10.3384/nejlt.2000-1533.2023.4939
+ zyrianova-kalpakchi-2023-qua
+
+
+
diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml
index 0412ac974f..48036d805b 100644
--- a/data/xml/2024.acl.xml
+++ b/data/xml/2024.acl.xml
@@ -6136,9 +6136,9 @@
Angry Men, Sad Women: Large Language Models Reflect Gendered Stereotypes in Emotion Attribution
- FlorPlaza-del-ArcoBocconi University
- AmandaCurry
- AlbaCercas CurryUniversity of Leeds
+ Flor MiriamPlaza-del-ArcoBocconi University
+ AmandaCercas CurryBocconi University
+ AlbaCurryUniversity of Leeds
GavinAbercrombieHeriot-Watt University
DirkHovyBocconi University
7682-7696
@@ -10166,7 +10166,7 @@
Classist Tools: Social Class Correlates with Performance in NLP
- AmandaCurry
+ AmandaCercas CurryBocconi University
GiuseppeAttanasioInstituto de Telecomunicações
ZeerakTalatMohamed bin Zayed University of Artificial Intelligence
DirkHovyBocconi University
@@ -14021,11 +14021,11 @@
Bangkok, Thailand
August
2024
- 2024.acl-demos
+ 2024.acl-demos
acl
- 2024.acl-demos.0
+ 2024.acl-demos.0
acl-2024-demos
@@ -14039,7 +14039,7 @@
JunHuang
1-8
Text-to-image synthesis for the Chinese language poses unique challenges due to its large vocabulary size and intricate character relationships. While existing diffusion models have shown promise in generating images from textual descriptions, they often neglect domain-specific contexts and lack robustness in handling the Chinese language. This paper introduces PAI-Diffusion, a comprehensive framework that addresses these limitations. PAI-Diffusion incorporates both general and domain-specific Chinese diffusion models, enabling the generation of contextually relevant images. It explores the potential of using LoRA and ControlNet for fine-grained image style transfer and image editing, empowering users with enhanced control over image generation. Moreover, PAI-Diffusion seamlessly integrates with Alibaba Cloud’s Platform for AI, providing accessible and scalable solutions. All the Chinese diffusion model checkpoints, LoRAs, and ControlNets, including domain-specific ones, are publicly available. A user-friendly Chinese WebUI and the diffusers-api elastic inference toolkit, also open-sourced, further facilitate the easy deployment of PAI-Diffusion models in various local and cloud environments, making it a valuable resource for Chinese text-to-image synthesis.
- 2024.acl-demos.1
+ 2024.acl-demos.1
wang-etal-2024-pai
10.18653/v1/2024.acl-demos.1
@@ -14053,7 +14053,7 @@
KaiGao
9-18
We present OpenVNA, an open-source framework designed for analyzing the behavior of multimodal language understanding systems under noisy conditions. OpenVNA serves as an intuitive toolkit tailored for researchers, facilitating convenient batch-level robustness evaluation and on-the-fly instance-level demonstration. It primarily features a benchmark Python library for assessing global model robustness, offering high flexibility and extensibility, thereby enabling customization with user-defined noise types and models. Additionally, a GUI-based interface has been developed to intuitively analyze local model behavior. In this paper, we delineate the design principles and utilization of the created library and GUI-based web platform. Currently, OpenVNA is publicly accessible at https://github.com/thuiar/OpenVNA, with a demonstration video available at https://youtu.be/0Z9cW7RGct4.
- 2024.acl-demos.2
+ 2024.acl-demos.2
yuan-etal-2024-openvna
10.18653/v1/2024.acl-demos.2
@@ -14066,7 +14066,7 @@
Tat-SengChuaNational University of Singapore
19-30
Structured Natural Language Processing (XNLP) is an important subset of NLP that entails understanding the underlying semantic or syntactic structure of texts, which serves as a foundational component for many downstream applications. Despite certain recent efforts to explore universal solutions for specific categories of XNLP tasks, a comprehensive and effective approach for unifying all XNLP tasks has long remained underdeveloped. Meanwhile, while XNLP demonstration systems are vital for researchers exploring various XNLP tasks, existing platforms can be limited, e.g., supporting few XNLP tasks and lacking interactivity and universality. To this end, we propose an advanced XNLP demonstration system, in which we leverage an LLM to achieve universal XNLP, with one model for all tasks and high generalizability. Overall, our system advances in multiple aspects, including universal XNLP modeling, high performance, interpretability, scalability, and interactivity, offering a unified platform for exploring diverse XNLP tasks in the community.
- 2024.acl-demos.3
+ 2024.acl-demos.3
fei-etal-2024-xnlp
10.18653/v1/2024.acl-demos.3
@@ -14078,7 +14078,7 @@
Anh TuanLuuNanyang Technological University
31-41
Topic models have a rich history with various applications and have recently been reinvigorated by neural topic modeling. However, these numerous topic models adopt totally distinct datasets, implementations, and evaluations. This impedes quick utilization and fair comparisons, and thereby hinders their research progress and applications. To tackle this challenge, we in this paper propose a Topic Modeling System Toolkit (TopMost). Compared to existing toolkits, TopMost stands out by supporting more extensive features. It covers a broader spectrum of topic modeling scenarios with their complete lifecycles, including datasets, preprocessing, models, training, and evaluations. Thanks to its highly cohesive and decoupled modular design, TopMost enables rapid utilization, fair comparisons, and flexible extensions of diverse cutting-edge topic models. Our code, tutorials, and documentation are available at https://github.com/bobxwu/topmost.
- 2024.acl-demos.4
+ 2024.acl-demos.4
wu-etal-2024-towards-topmost
10.18653/v1/2024.acl-demos.4
@@ -14091,7 +14091,7 @@
Duen HorngChauGeorgia Institute of Technology
42-50
Large language models (LLMs) require well-crafted prompts for effective use. Prompt engineering, the process of designing prompts, is challenging, particularly for non-experts who are less familiar with AI technologies. While researchers have proposed techniques and tools to assist LLM users in prompt design, these works primarily target AI application developers rather than non-experts. To address this research gap, we propose social prompt engineering, a novel paradigm that leverages social computing techniques to facilitate collaborative prompt design. To investigate social prompt engineering, we introduce Wordflow, an open-source and social text editor that enables everyday users to easily create, run, share, and discover LLM prompts. Additionally, by leveraging modern web technologies, Wordflow allows users to run LLMs locally and privately in their browsers. Two usage scenarios highlight how social prompt engineering and our tool can enhance laypeople’s interaction with LLMs. Wordflow is publicly accessible at https://poloclub.github.io/wordflow.
- 2024.acl-demos.5
+ 2024.acl-demos.5
wang-etal-2024-wordflow
10.18653/v1/2024.acl-demos.5
@@ -14104,7 +14104,7 @@
ElenaVoitaFAIR at Meta AI and University of Amsterdam
51-60
We present the LM Transparency Tool (LM-TT), an open-source interactive toolkit for analyzing the internal workings of Transformer-based language models. Differently from previously existing tools that focus on isolated parts of the decision-making process, our framework is designed to make the entire prediction process transparent, and allows tracing back model behavior from the top-layer representation to very fine-grained parts of the model. Specifically, it (i) shows the important part of the whole input-to-output information flow, (ii) allows attributing any changes done by a model block to individual attention heads and feed-forward neurons, (iii) allows interpreting the functions of those heads or neurons. A crucial part of this pipeline is showing the importance of specific model components at each step. As a result, we are able to look at the roles of model components only in cases where they are important for a prediction. Since knowing which components should be inspected is key for analyzing large models where the number of these components is extremely high, we believe our tool will greatly support the interpretability community both in research settings and in practical applications.
- 2024.acl-demos.6
+ 2024.acl-demos.6
tufanov-etal-2024-lm
10.18653/v1/2024.acl-demos.6
@@ -14119,7 +14119,7 @@
ErikCambriaNanyang Technological University
61-71
This paper introduces EmpathyEar, a pioneering open-source, avatar-based multimodal empathetic chatbot, to fill the gap in traditional text-only empathetic response generation (ERG) systems. Leveraging the advancements of a large language model, combined with multimodal encoders and generators, EmpathyEar supports user inputs in any combination of text, sound, and vision, and produces multimodal empathetic responses, offering users not just textual responses but also digital avatars with talking faces and synchronized speech. A series of emotion-aware instruction-tuning steps is performed for comprehensive emotional understanding and generation capabilities. In this way, EmpathyEar provides users with responses that achieve a deeper emotional resonance, closely emulating human-like empathy. The system paves the way for the next generation of emotional intelligence, for which we open-source the code for public access.
- 2024.acl-demos.7
+ 2024.acl-demos.7
fei-etal-2024-empathyear
10.18653/v1/2024.acl-demos.7
@@ -14137,7 +14137,7 @@
JieTangTsinghua University, Tsinghua University
72-81
We introduce OpenWebAgent, an open toolkit designed to optimize web automation by integrating both large language models (LLMs) and large multimodal models (LMMs). This toolkit focuses on enhancing human-computer interactions on the web, simplifying complex tasks through an advanced HTML parser, a rapid action generation module, and an intuitive user interface. At the core of OpenWebAgent is an innovative web agent framework that uses a modular design to allow developers to seamlessly integrate a variety of models and tools to process web information and automate tasks on the web. This enables the development of powerful, task-oriented web agents, significantly enhancing user experience and operational efficiency on the web. The OpenWebAgent framework, Chrome plugin, and demo video are available at https://github.com/THUDM/OpenWebAgent/.
- 2024.acl-demos.8
+ 2024.acl-demos.8
iong-etal-2024-openwebagent
10.18653/v1/2024.acl-demos.8
@@ -14160,7 +14160,7 @@
HuajunChenZhejiang University
82-93
Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy issues, which means they are unaware of unseen events or generate text with incorrect facts owing to outdated/noisy data. To this end, many knowledge editing approaches for LLMs have emerged – aiming to subtly inject/edit updated knowledge or adjust undesired behavior while minimizing the impact on unrelated inputs. Nevertheless, due to significant differences among various knowledge editing methods and the variations in task setups, there is no standard implementation framework available for the community, which hinders practitioners from applying knowledge editing to applications. To address these issues, we propose EasyEdit, an easy-to-use knowledge editing framework for LLMs. It supports various cutting-edge knowledge editing approaches and can be readily applied to many well-known LLMs such as T5, GPT-J, LlaMA, etc. Empirically, we report the knowledge editing results on LlaMA-2 with EasyEdit, demonstrating that knowledge editing surpasses traditional fine-tuning in terms of reliability and generalization. We have released the source code on GitHub, along with Google Colab tutorials and comprehensive documentation for beginners to get started. Besides, we present an online system for real-time knowledge editing, and a demo video.
- 2024.acl-demos.9
+ 2024.acl-demos.9
wang-etal-2024-easyedit
10.18653/v1/2024.acl-demos.9
@@ -14179,7 +14179,7 @@
HuajunChenZhejiang University
94-106
In recent years, instruction tuning has gained increasing attention and emerged as a crucial technique to enhance the capabilities of Large Language Models (LLMs). To construct high-quality instruction datasets, many instruction processing approaches have been proposed, aiming to achieve a delicate balance between data quantity and data quality. Nevertheless, due to inconsistencies that persist among various instruction processing methods, there is no standard open-source instruction processing implementation framework available for the community, which hinders practitioners from further developing and advancing. To facilitate instruction processing research and development, we present EasyInstruct, an easy-to-use instruction processing framework for LLMs, which modularizes instruction generation, selection, and prompting, while also considering their combination and interaction. EasyInstruct is publicly released and actively maintained at Github, along with an online demo app and a demo video for quick-start, calling for broader research centered on instruction data and synthetic data.
- 2024.acl-demos.10
+ 2024.acl-demos.10
ou-etal-2024-easyinstruct
10.18653/v1/2024.acl-demos.10
@@ -14194,7 +14194,7 @@
JonathanMayUniversity of Southern California and USC/ISI
107-116
Following the rapid progress in natural language processing (NLP) models, language models are applied to increasingly more complex interactive tasks such as negotiations and conversation moderation. Having human evaluators directly interact with these NLP models is essential for adequately evaluating the performance on such interactive tasks. We develop BotEval, an easily customizable, open-source, evaluation toolkit that focuses on enabling human-bot interactions as part of the evaluation process, as opposed to human evaluators making judgements for a static input. BotEval balances flexibility for customization and user-friendliness by providing templates for common use cases that span various degrees of complexity and built-in compatibility with popular crowdsourcing platforms. We showcase the numerous useful features of BotEval through a study that evaluates the performance of various chatbots on their effectiveness for conversational moderation and discuss how BotEval differs from other annotation tools.
- 2024.acl-demos.11
+ 2024.acl-demos.11
cho-etal-2024-boteval
10.18653/v1/2024.acl-demos.11
@@ -14206,7 +14206,7 @@
KaiEckertMannheim University of Applied Sciences
117-126
We present GenGO, a system for exploring papers published in ACL conferences. Paper data stored in our database is enriched with multi-aspect summaries, extracted named entities, a field of study label, and text embeddings by our data processing pipeline. These metadata are used in our web-based user interface to enable researchers to quickly find papers relevant to their interests, and grasp an overview of papers without reading their full text. To keep GenGO available online for as long as possible, we design it to be simple and efficient in order to reduce maintenance and financial costs. In addition, the modularity of our data processing pipeline lets developers easily extend it to add new features. We make our code available to foster open development and transparency: https://gengo.sotaro.io.
- 2024.acl-demos.12
+ 2024.acl-demos.12
takeshita-etal-2024-gengo
10.18653/v1/2024.acl-demos.12
@@ -14217,7 +14217,7 @@
FlorianMatthesTechnische Universität München
127-135
Scientific literature searches are often exploratory, whereby users are not yet familiar with a particular field or concept but are interested in learning more about it. However, existing systems for scientific literature search are typically tailored to keyword-based lookup searches, limiting the possibilities for exploration. We propose NLP-KG, a feature-rich system designed to support the exploration of research literature in unfamiliar natural language processing (NLP) fields. In addition to a semantic search, NLP-KG allows users to easily find survey papers that provide a quick introduction to a field of interest. Further, a Fields of Study hierarchy graph enables users to familiarize themselves with a field and its related areas. Finally, a chat interface allows users to ask questions about unfamiliar concepts or specific articles in NLP and obtain answers grounded in knowledge retrieved from scientific publications. Our system provides users with comprehensive exploration possibilities, supporting them in investigating the relationships between different fields, understanding unfamiliar concepts in NLP, and finding relevant research literature. Demo, video, and code are available at: https://github.com/NLP-Knowledge-Graph/NLP-KG-WebApp.
- 2024.acl-demos.13
+ 2024.acl-demos.13
schopf-matthes-2024-nlp
10.18653/v1/2024.acl-demos.13
@@ -14229,7 +14229,7 @@
ZhouYuColumbia University
136-151
Retrieval-augmented question-answering systems combine retrieval techniques with large language models to provide answers that are more accurate and informative. Many existing toolkits allow users to quickly build such systems using off-the-shelf models, but they fall short in supporting researchers and developers to customize the *model training, testing, and deployment process*. We propose LocalRQA, an open-source toolkit that features a wide selection of model training algorithms, evaluation methods, and deployment tools curated from the latest research. As a showcase, we build QA systems using online documentation obtained from Databricks and Faire’s websites. We find 7B-models trained and deployed using LocalRQA reach a similar performance compared to using OpenAI’s text-ada-002 and GPT-4-turbo.
- 2024.acl-demos.14
+ 2024.acl-demos.14
yu-etal-2024-localrqa
10.18653/v1/2024.acl-demos.14
@@ -14241,7 +14241,7 @@
HuanLiuArizona State University
152-159
The scaling of Large Language Models (LLMs) for retrieval-based tasks, particularly in Retrieval Augmented Generation (RAG), faces significant memory constraints, especially when fine-tuning extensive prompt sequences. Current open-source libraries support full-model inference and fine-tuning across multiple GPUs but fall short of accommodating the efficient parameter distribution required for retrieved context. Addressing this gap, we introduce a novel framework for PEFT-compatible fine-tuning of GPT models, leveraging distributed training. Our framework uniquely utilizes JAX’s just-in-time (JIT) compilation and tensor-sharding for efficient resource management, thereby enabling accelerated fine-tuning with reduced memory requirements. This advancement significantly improves the scalability and feasibility of fine-tuning LLMs for complex RAG applications, even on systems with limited GPU resources. Our experiments show more than 12x improvement in runtime compared to Hugging Face/DeepSpeed implementation with four GPUs while consuming less than half the VRAM per GPU.
- 2024.acl-demos.15
+ 2024.acl-demos.15
tahir-etal-2024-jora
10.18653/v1/2024.acl-demos.15
@@ -14255,7 +14255,7 @@
SangeethaAbdu JyothiUniversity of California, Irvine
160-171
Deploying Large Language Models (LLMs) locally on mobile devices presents a significant challenge due to their extensive memory requirements. In this paper, we introduce LinguaLinked, a system for decentralized, distributed LLM inference on mobile devices. LinguaLinked enables collaborative execution of the inference task across multiple trusted devices and ensures data privacy by processing information locally. LinguaLinked uses three key strategies. First, an optimized model assignment technique segments LLMs and uses linear optimization to align segments with each device’s capabilities. Second, an optimized data transmission mechanism ensures efficient and structured data flow between model segments while also maintaining the integrity of the original model structure. Finally, LinguaLinked incorporates a runtime load balancer that actively monitors and redistributes tasks among mobile devices to prevent bottlenecks, enhancing the system’s overall efficiency and responsiveness. We demonstrate that LinguaLinked facilitates efficient LLM inference while maintaining consistent throughput and minimal latency through extensive testing across various mobile devices, from high-end to low-end Android devices.
- 2024.acl-demos.16
+ 2024.acl-demos.16
zhao-etal-2024-lingualinked
10.18653/v1/2024.acl-demos.16
@@ -14266,7 +14266,7 @@
DominikMackoKempelen Institute of Intelligent Technologies
172-179
In the era of large language models generating high quality texts, it is a necessity to develop methods for detection of machine-generated text to avoid its harmful use or simply for annotation purposes. It is, however, also important to properly evaluate and compare such developed methods. Recently, a few benchmarks have been proposed for this purpose; however, integration of the newest detection methods is rather challenging, since new methods appear each month and provide slightly different evaluation pipelines. In this paper, we present the IMGTB framework, which simplifies the benchmarking of machine-generated text detection methods by easy integration of custom (new) methods and evaluation datasets. In comparison to existing frameworks, it enables objective comparison of statistical metric-based zero-shot detectors with classification-based detectors and with differently fine-tuned detectors. Its configurability and flexibility make research and development of new detection methods easier, especially their comparison to the existing state-of-the-art detectors. The default set of analyses, metrics and visualizations offered by the tool follows the established practices of machine-generated text detection benchmarking found in state-of-the-art literature.
- 2024.acl-demos.17
+ 2024.acl-demos.17
spiegel-macko-2024-imgtb
10.18653/v1/2024.acl-demos.17
@@ -14280,7 +14280,7 @@
YulanHeKing’s College London, University of London
180-189
Drug safety research is crucial for maintaining public health, often requiring comprehensive data support. However, the resources currently available to the public are limited and fail to provide a comprehensive understanding of the relationship between drugs and their side effects. This paper introduces “DrugWatch”, an easy-to-use and interactive multi-source information visualisation platform for drug safety study. It allows users to understand common side effects of drugs and their statistical information, flexibly retrieve relevant medical reports, or annotate their own medical texts with our automated annotation tool. Supported by NLP technology and enriched with interactive visual components, we are committed to providing researchers and practitioners with a one-stop information analysis, retrieval, and annotation service. The demonstration video is available at https://www.youtube.com/watch?v=RTqDgxzETjw. We also deployed an online demonstration system at https://drugwatch.net/.
- 2024.acl-demos.18
+ 2024.acl-demos.18
bobrov-etal-2024-drugwatch
10.18653/v1/2024.acl-demos.18
@@ -14303,7 +14303,7 @@
DeyiXiongTianjin University
190-210
The rapid development of Chinese large language models (LLMs) poses big challenges for efficient LLM evaluation. While current initiatives have introduced new benchmarks or evaluation platforms for assessing Chinese LLMs, many of these focus primarily on capabilities, usually overlooking potential alignment and safety issues. To address this gap, we introduce OpenEval, an evaluation testbed that benchmarks Chinese LLMs across capability, alignment and safety. For capability assessment, we include 12 benchmark datasets to evaluate Chinese LLMs from 4 sub-dimensions: NLP tasks, disciplinary knowledge, commonsense reasoning and mathematical reasoning. For alignment assessment, OpenEval contains 7 datasets that examine bias, offensiveness and illegality in the outputs yielded by Chinese LLMs. To evaluate safety, especially anticipated risks (e.g., power-seeking, self-awareness) of advanced LLMs, we include 6 datasets. In addition to these benchmarks, we have implemented a phased public evaluation and benchmark update strategy to ensure that OpenEval is in line with the development of Chinese LLMs or even able to provide cutting-edge benchmark datasets to guide the development of Chinese LLMs. In our first public evaluation, we have tested a range of Chinese LLMs, spanning from 7B to 72B parameters, including both open-source and proprietary models. Evaluation results indicate that while Chinese LLMs have shown impressive performance in certain tasks, more attention should be directed towards broader aspects such as commonsense reasoning, alignment, and safety.
- 2024.acl-demos.19
+ 2024.acl-demos.19
liu-etal-2024-openeval
10.18653/v1/2024.acl-demos.19
@@ -14316,7 +14316,7 @@
JieTangTsinghua University, Tsinghua University
211-220
Large Language Models (LLMs) have demonstrated exceptional abilities in comprehending and generating text, motivating numerous researchers to utilize them for Information Extraction (IE) purposes, including Relation Extraction (RE). Nonetheless, most existing methods are predominantly designed for Sentence-level Relation Extraction (SentRE) tasks, which typically encompass a restricted set of relations and triplet facts within a single sentence. Furthermore, certain approaches resort to treating relations as candidate choices integrated into prompt templates, leading to inefficient processing and suboptimal performance when tackling Document-Level Relation Extraction (DocRE) tasks, which entail handling multiple relations and triplet facts distributed across a given document, posing distinct challenges. To overcome these limitations, we introduce AutoRE, an end-to-end DocRE model that adopts a novel RE extraction paradigm named RHF (Relation-Head-Facts). Unlike existing approaches, AutoRE does not rely on the assumption of known relation options, making it more reflective of real-world scenarios. Additionally, we have developed an easily extensible RE framework using a Parameters Efficient Fine Tuning (PEFT) algorithm (QLoRA). Our experiments on the RE-DocRED dataset showcase AutoRE’s best performance, achieving state-of-the-art results, surpassing TAG by 10.03% and 9.03% respectively on the dev and test set. The code is available and the demonstration video is provided.
- 2024.acl-demos.20
+ 2024.acl-demos.20
xue-etal-2024-autore
10.18653/v1/2024.acl-demos.20
@@ -14327,7 +14327,7 @@
MelissaDellHarvard University, Harvard University
221-231
Many computational analyses require linking information across noisy text datasets. While large language models (LLMs) offer significant promise, approximate string matching packages in popular statistical software such as R and Stata remain predominant in academic applications. These packages have simple interfaces and can be easily extended to a diversity of languages and settings, and for academic applications, ease-of-use and extensibility are essential. In contrast, packages for record linkage with LLMs require significant familiarity with deep learning frameworks and often focus on specialized applications of commercial value in English. The open-source package LinkTransformer aims to bridge this gap by providing end-to-end software for performing record linkage and other data cleaning tasks with transformer LLMs, treating linkage as a text retrieval problem. At its core is an off-the-shelf toolkit for applying transformer models to record linkage. LinkTransformer contains a rich repository of pre-trained models for multiple languages and supports easy integration of any transformer language model from Hugging Face or OpenAI, providing the extensibility required for many scholarly applications. Its APIs also perform common data processing tasks, e.g., aggregation, noisy de-duplication, and translation-free cross-lingual linkage. LinkTransformer contains comprehensive tools for efficient model tuning, allowing for highly customized applications, and users can easily contribute their custom-trained models to its model hub to ensure reproducibility. Using a novel benchmark dataset geared towards academic applications, we show that LinkTransformer - with both custom models and Hugging Face or OpenAI models off-the-shelf - outperforms string matching by a wide margin. By combining transformer LMs with intuitive APIs, LinkTransformer aims to democratize these performance gains for those who lack familiarity with deep learning frameworks.
- 2024.acl-demos.21
+ 2024.acl-demos.21
arora-dell-2024-linktransformer
10.18653/v1/2024.acl-demos.21
@@ -14340,7 +14340,7 @@
TongSunAdobe Systems
232-246
Digital documents, such as PDFs, are vital in business workflows, enabling communication, documentation, and collaboration. Handling PDFs can involve navigating complex workflows and numerous tools (e.g., comprehension, annotation, editing), which can be tedious and time-consuming for users. We introduce DocPilot, an AI-assisted document workflow Copilot system capable of understanding user intent and executing tasks accordingly to help users streamline their workflows. DocPilot undertakes intelligent orchestration of various tools through LLM prompting in four steps: (1) Task plan generation, (2) Task plan verification and self-correction, (3) Multi-turn User Feedback, and (4) Task Plan Execution via Code Generation and Error log-based Code Self-Revision. The primary goal of this system is to free the user from the intricacies of document editing, enabling them to focus on the creative aspects and enrich their document management experience.
- 2024.acl-demos.22
+ 2024.acl-demos.22
mathur-etal-2024-docpilot
10.18653/v1/2024.acl-demos.22
@@ -14359,7 +14359,7 @@
MaosongSun
247-257
Evaluation is pivotal for honing Large Language Models (LLMs), pinpointing their capabilities and guiding enhancements. The rapid development of LLMs calls for a lightweight and easy-to-use framework for swift evaluation deployment. However, due to the various implementation details to consider, developing a comprehensive evaluation platform is never easy. Existing platforms are often complex and poorly modularized, hindering seamless incorporation into researchers’ workflows. This paper introduces UltraEval, a user-friendly evaluation framework characterized by its light weight, comprehensiveness, modularity, and efficiency. We identify and reimplement three core components of model evaluation (models, data, and metrics). The resulting composability allows for the free combination of different models, tasks, prompts, and metrics within a unified evaluation workflow. Additionally, UltraEval supports diverse models owing to a unified HTTP service and provides sufficient inference acceleration.
- 2024.acl-demos.23
+ 2024.acl-demos.23
he-etal-2024-ultraeval
10.18653/v1/2024.acl-demos.23
@@ -14372,7 +14372,7 @@
MichaelHammondUniversity of Arizona
258-265
We describe PyFoma, an open-source Python module for constructing weighted and unweighted finite-state transducers and automata from regular expressions, string rewriting rules, right-linear grammars, or low-level state/transition manipulation. A large variety of standard algorithms for working with finite-state machines is included, with a particular focus on the needs of linguistic and NLP applications. The data structures and code in the module are designed for legibility to allow for potential use in teaching the theory and algorithms associated with finite-state machines.
- 2024.acl-demos.24
+ 2024.acl-demos.24
hulden-etal-2024-pyfoma
10.18653/v1/2024.acl-demos.24
@@ -14391,7 +14391,7 @@
TongZhangUIUC
266-277
The proliferation of fake news poses a significant threat not only by disseminating misleading information but also by undermining the very foundations of democracy. The recent advance of generative artificial intelligence has further exacerbated the challenge of distinguishing genuine news from fabricated stories. In response to this challenge, we introduce VeraCT Scan, a novel retrieval-augmented system for fake news detection. This system operates by extracting the core facts from a given piece of news and subsequently conducting an internet-wide search to identify corroborating or conflicting reports. The sources’ credibility is then leveraged for information verification. Besides determining the veracity of news, we also provide transparent evidence and reasoning to support its conclusions, making the results interpretable and trustworthy. In addition to GPT-4 Turbo, Llama-2 13B is also fine-tuned for news content understanding, information verification, and reasoning. Both implementations have demonstrated state-of-the-art accuracy in the realm of fake news detection.
- 2024.acl-demos.25
+ 2024.acl-demos.25
niu-etal-2024-veract
10.18653/v1/2024.acl-demos.25
@@ -14403,7 +14403,7 @@
DanJurafskyStanford University
278-285
We introduce **string2string**, an open-source library that offers a comprehensive suite of efficient algorithms for a broad range of string-to-string problems. It includes traditional algorithmic solutions as well as recent advanced neural approaches to tackle various problems in string alignment, distance measurement, lexical and semantic search, and similarity analysis, along with several helpful visualization tools and metrics to facilitate the interpretation and analysis of these methods. Notable algorithms featured in the library include the Smith-Waterman algorithm for pairwise local alignment, the Hirschberg algorithm for global alignment, the Wagner-Fischer algorithm for edit distance, BARTScore and BERTScore for similarity analysis, the Knuth-Morris-Pratt algorithm for lexical search, and Faiss for semantic search. In addition, it wraps existing efficient and widely-used implementations of certain frameworks and metrics, such as sacreBLEU and ROUGE. Overall, the library aims to provide extensive coverage and increased flexibility in comparison to existing libraries for strings. It can be used for many downstream applications, tasks, and problems in natural-language processing, bioinformatics, and computational social sciences. It is implemented in Python, easily installable via pip, and accessible through a simple API. Source code, documentation, and tutorials are all available on our GitHub page: https://github.com/stanfordnlp/string2string. Documentation: https://string2string.readthedocs.io/en/latest/. Short video: https://drive.google.com/file/d/1IT-pBACDVUoEHewk__5Pz5mU5oAMq5k_/view?usp=sharing
- 2024.acl-demos.26
+ 2024.acl-demos.26
suzgun-etal-2024-string2string
10.18653/v1/2024.acl-demos.26
@@ -14420,7 +14420,7 @@
ShuminZhaiGoogle
286-293
The impressive capabilities of Large Language Models (LLMs) provide a powerful approach to reimagine users’ typing experience. This paper demonstrates the Proofread feature in Gboard, a virtual keyboard running on mobile phones. Proofread enables seamless sentence-level and paragraph-level corrections with a single tap. We describe the complete system in this paper, from data generation and metrics design to model tuning and deployment. To obtain models with sufficient quality, we implement a careful data synthesis pipeline tailored to online use cases, design multifaceted metrics, and employ a two-stage tuning approach to acquire the dedicated LLM for the feature: Supervised Fine Tuning (SFT) for foundational quality, followed by Reinforcement Learning (RL) tuning for targeted refinement. Specifically, we find sequential tuning on Rewrite and Proofread tasks yields the best quality in the SFT stage, and propose global and direct rewards in the RL tuning stage to seek further improvement. Extensive experiments on a human-labeled golden set showed our tuned PaLM2-XS model achieved an 85.56% good ratio. We launched the feature to Pixel 8 devices by serving the model on TPU v5 in Google Cloud, with thousands of daily active users. Serving latency was significantly reduced by quantization, bucket inference, text segmentation, and speculative decoding. Our demo can be seen on YouTube.
- 2024.acl-demos.27
+ 2024.acl-demos.27
liu-etal-2024-proofread
10.18653/v1/2024.acl-demos.27
@@ -14446,7 +14446,7 @@
LidongBingAlibaba Group
294-304
Despite the remarkable achievements of large language models (LLMs) in various tasks, there remains a linguistic bias that favors high-resource languages, such as English, often at the expense of low-resource and regional languages. To address this imbalance, we introduce SeaLLMs, an innovative series of language models that specifically focuses on Southeast Asian (SEA) languages. SeaLLMs are built upon popular English-centric models through continued pre-training with an extended vocabulary, specialized instruction and alignment tuning to better capture the intricacies of regional languages. This allows them to respect and reflect local cultural norms, customs, stylistic preferences, and legal considerations. Our comprehensive evaluation demonstrates that SeaLLM models exhibit superior performance across a wide spectrum of linguistic tasks and assistant-style instruction-following capabilities relative to comparable open-source models. Moreover, they outperform ChatGPT-3.5 in non-Latin languages, such as Thai, Khmer, Lao, and Burmese, by large margins while remaining lightweight and cost-effective to operate.
- 2024.acl-demos.28
+ 2024.acl-demos.28
nguyen-etal-2024-seallms
10.18653/v1/2024.acl-demos.28
@@ -14459,7 +14459,7 @@
AlanAkbikHumboldt Universität Berlin
305-314
This paper introduces Fundus, a user-friendly news scraper that enables users to obtain millions of high-quality news articles with just a few lines of code. Unlike existing news scrapers, we use manually crafted, bespoke content extractors that are specifically tailored to the formatting guidelines of each supported online newspaper. This allows us to optimize our scraping for quality such that retrieved news articles are textually complete and without HTML artifacts. Further, our framework combines both crawling (retrieving HTML from the web or large web archives) and content extraction into a single pipeline. By providing a unified interface for a predefined collection of newspapers, we aim to make Fundus broadly usable even for non-technical users. This paper gives an overview of the framework, discusses our design choices, and presents a comparative evaluation against other popular news scrapers. Our evaluation shows that Fundus yields significantly higher quality extractions (complete and artifact-free news articles) than prior work. The framework is available on GitHub under https://github.com/flairNLP/fundus and can be simply installed using pip.
- 2024.acl-demos.29
+ 2024.acl-demos.29
dallabetta-etal-2024-fundus
10.18653/v1/2024.acl-demos.29
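For orientation, the entry point the abstract describes looks roughly like the following, based on the project's README at the time of writing; class and attribute names may differ in current releases.

```python
from fundus import PublisherCollection, Crawler

# Crawl a predefined collection of US publishers and print two articles.
crawler = Crawler(PublisherCollection.us)
for article in crawler.crawl(max_articles=2):
    print(article.title)      # extracted headline
    print(article.plaintext)  # artifact-free article body
```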
@@ -14473,7 +14473,7 @@
JinjieGu
315-325
Automatic Chinese classical poetry generation has attracted much research interest, but achieving effective control over format and content simultaneously remains challenging. Traditional systems usually accept keywords as user inputs, resulting in limited control over content. Large language models (LLMs) improve content control by allowing unrestricted user instructions, but the token-by-token generation process frequently makes format errors. Motivated by this, we propose CharPoet, a Chinese classical poetry generation system based on a token-free LLM, which provides effective control over both format and content. Our token-free architecture generates in a character-by-character manner, enabling precise control over the number of characters. Pruned from existing token-based LLMs, CharPoet inherits their pretrained capabilities and can generate poetry following instructions like “Write me a poem for my mother’s birthday.” CharPoet achieves format accuracy above 0.96, outperforming Jiuge-GPT-2 (0.91) and GPT-4 (0.38). In terms of content quality, CharPoet surpasses traditional systems including Jiuge, and is comparable to other LLMs. Our system is open source and available at https://modelscope.cn/models/CharPoet/CharPoet. A video demonstration of CharPoet is available at https://youtu.be/voZ25qEp3Dc.
- 2024.acl-demos.30
+ 2024.acl-demos.30
yu-etal-2024-charpoet
10.18653/v1/2024.acl-demos.30
@@ -14488,7 +14488,7 @@
JunfengZhaoPeking University
326-334
Extracting structured knowledge from unstructured text data has a wide range of application prospects, and a pervasive trend is to develop text annotation tools to help extraction. However, these tools often encounter issues such as single-scenario usage, lack of effective human-machine collaboration, insufficient model supervision, and suboptimal utilization of Large Language Models (LLMs). We introduce an interactive unstructured text annotation and knowledge extraction system that synergistically integrates LLMs and ModelOps to alleviate these issues. The system leverages LLMs for enhanced performance in low-resource contexts, employs a ModelOps platform to monitor models throughout their lifecycle, and amalgamates interactive annotation methods with online machine learning and active learning. The demo video and website are now publicly available.
- 2024.acl-demos.31
+ 2024.acl-demos.31
song-etal-2024-itake
10.18653/v1/2024.acl-demos.31
@@ -14507,7 +14507,7 @@
MaosongSun
335-345
Despite advancements in Large Language Models (LLMs) and Large Multimodal Models (LMMs), their integration into language-grounded, human-like embodied agents remains incomplete, hindering complex real-life task performance in 3D environments. Existing integrations often feature limited open-sourcing, challenging collective progress in this field. We introduce LEGENT, an open, scalable platform for developing embodied agents using LLMs and LMMs. LEGENT offers a dual approach: a rich 3D environment with interactive, communicable, and actionable agents, paired with a user-friendly interface, and a sophisticated data generation pipeline utilizing advanced algorithms to exploit supervision from simulated worlds at scale. In our experiments, an embryonic vision-language-action model trained on LEGENT-generated data surpasses GPT-4V in embodied tasks, showcasing promising generalization capabilities. The demo video is available at the following link https://video.legent.ai.
- 2024.acl-demos.32
+ 2024.acl-demos.32
cheng-etal-2024-legent
10.18653/v1/2024.acl-demos.32
@@ -14519,7 +14519,7 @@
StefanoMenini
346-354
Exploring and understanding language data is a fundamental stage in all areas dealing with human language. It allows NLP practitioners to uncover quality concerns and harmful biases in data before training, and helps linguists and social scientists to gain insight into language use and human behavior. Yet, there is currently a lack of a unified, customizable tool to seamlessly inspect and visualize language variation and bias across multiple variables, language units, and diverse metrics that go beyond descriptive statistics. In this paper, we introduce Variationist, a highly-modular, extensible, and task-agnostic tool that fills this gap. Variationist handles at once a potentially unlimited combination of variable types and semantics across diversity and association metrics with regards to the language unit of choice, and orchestrates the creation of up to five-dimensional interactive charts for over 30 variable type-semantics combinations. Through our case studies on computational dialectology, human label variation, and text generation, we show how Variationist enables researchers from different disciplines to effortlessly answer specific research questions or unveil undesired associations in language data. A Python library, code, documentation, and tutorials are made publicly available to the research community.
- 2024.acl-demos.33
+ 2024.acl-demos.33
ramponi-etal-2024-variationist
10.18653/v1/2024.acl-demos.33
@@ -14536,7 +14536,7 @@
AndreFreitasIdiap Research Institute and University of Manchester
355-364
We present BioLunar, developed using the Lunar framework, as a tool for supporting biological analyses, with a particular emphasis on molecular-level evidence enrichment for biomarker discovery in oncology. The platform integrates Large Language Models (LLMs) to facilitate complex scientific reasoning across distributed evidence spaces, enhancing the capability for harmonizing and reasoning over heterogeneous data sources. Demonstrating its utility in cancer research, BioLunar leverages modular design, reusable data access and data analysis components, and a low-code user interface, enabling researchers of all programming levels to construct LLM-enabled scientific workflows. By facilitating automatic scientific discovery and inference from heterogeneous evidence, BioLunar exemplifies the potential of the integration between LLMs, specialised databases and biomedical tools to support expert-level knowledge synthesis and discovery.
- 2024.acl-demos.34
+ 2024.acl-demos.34
wysocki-etal-2024-llm
10.18653/v1/2024.acl-demos.34
@@ -14549,7 +14549,7 @@
JunZhaoInstitute of automation, Chinese academy of science
365-373
Large language models have become integral to question-answering applications despite their propensity for generating hallucinations and factually inaccurate content. Querying knowledge graphs to reduce LLM hallucinations meets the challenge of incomplete knowledge coverage in knowledge graphs. On the other hand, updating knowledge graphs by information extraction and knowledge graph completion faces the knowledge update misalignment issue. In this work, we introduce a collaborative augmentation framework, CogMG, leveraging knowledge graphs to address the limitations of LLMs in QA scenarios, explicitly targeting the problems of incomplete knowledge coverage and knowledge update misalignment. The LLM identifies and decomposes required knowledge triples that are not present in the KG, enriching them and aligning updates with real-world demands. We demonstrate the efficacy of this approach through a supervised fine-tuned LLM within an agent framework, showing significant improvements in reducing hallucinations and enhancing factual accuracy in QA responses. Our code and video are publicly available.
- 2024.acl-demos.35
+ 2024.acl-demos.35
zhou-etal-2024-cogmg
10.18653/v1/2024.acl-demos.35
@@ -14561,7 +14561,7 @@
YansongFengPeking University
374-387
Despite remarkable performance in legal consultation exhibited by legal Large Language Models (LLMs) combined with legal article retrieval components, there are still cases where the advice given is incorrect or baseless. To alleviate these problems, we propose ELLA, a tool for Empowering LLMs for interpretable, accurate, and informative Legal Advice. ELLA visually presents the correlation between legal articles and the LLM’s response by calculating their similarities, providing users with an intuitive legal basis for the responses. Besides, based on the users’ queries, ELLA retrieves relevant legal articles and displays them to users. Users can interactively select legal articles for the LLM to generate more accurate responses. ELLA also retrieves relevant legal cases for user reference. Our user study shows that presenting the legal basis for the responses helps users understand them better. The accuracy of the LLM’s responses also improves when users intervene in selecting legal articles for the LLM. Providing relevant legal cases also aids individuals in obtaining comprehensive information. Our GitHub repo is: https://github.com/Huyt00/ELLA.
- 2024.acl-demos.36
+ 2024.acl-demos.36
hu-etal-2024-ella
10.18653/v1/2024.acl-demos.36
@@ -14595,7 +14595,7 @@
Ji-RongWenRenmin University of China
388-399
To facilitate the research on large language models (LLMs), this paper presents a comprehensive and unified library, LLMBox, to ease the development, use, and evaluation of LLMs. The library features three main merits: (1) a unified data interface that supports the flexible implementation of various training strategies, (2) a comprehensive evaluation that covers extensive tasks, datasets, and models, and (3) more practical considerations, especially regarding user-friendliness and efficiency. With our library, users can easily reproduce existing methods, train new models, and conduct comprehensive performance comparisons. To rigorously test LLMBox, we conduct extensive experiments in a diverse coverage of evaluation settings, and experimental results demonstrate the effectiveness and efficiency of our library in supporting various implementations related to LLMs. The detailed introduction and usage guidance can be found at https://github.com/RUCAIBox/LLMBox.
- 2024.acl-demos.37
+ 2024.acl-demos.37
tang-etal-2024-llmbox
10.18653/v1/2024.acl-demos.37
@@ -14609,7 +14609,7 @@
ZheyanLuo
400-410
Efficient fine-tuning is vital for adapting large language models (LLMs) to downstream tasks. However, it requires non-trivial efforts to implement these methods on different models. We present LlamaFactory, a unified framework that integrates a suite of cutting-edge efficient training methods. It provides a solution for flexibly customizing the fine-tuning of 100+ LLMs without the need for coding through the built-in web UI LlamaBoard. We empirically validate the efficiency and effectiveness of our framework on language modeling and text generation tasks. It has been released at https://github.com/hiyouga/LLaMA-Factory and received over 25,000 stars and 3,000 forks.
- 2024.acl-demos.38
+ 2024.acl-demos.38
zheng-etal-2024-llamafactory
10.18653/v1/2024.acl-demos.38
@@ -14624,11 +14624,11 @@
Bangkok, Thailand
August
2024
- 2024.acl-srw
+ 2024.acl-srw
acl
- 2024.acl-srw.0
+ 2024.acl-srw.0
acl-2024-srw
@@ -14639,7 +14639,7 @@
ChristopherHoman
1-9
Machine translation (MT) is a rapidly expanding field that has experienced significant advancements in recent years with the development of models capable of translating multiple languages with remarkable accuracy. However, the representation of African languages in this field still needs improvement due to linguistic complexities and limited resources. This applies to the Zarma language, a dialect of Songhay (of the Nilo-Saharan language family) spoken by over 5 million people across Niger and neighboring countries (Lewis et al., 2016). This paper introduces Feriji, the first robust French-Zarma parallel corpus and glossary designed for MT. The corpus, containing 61,085 sentences in Zarma and 42,789 in French, together with a glossary of 4,062 words, represents a significant step in addressing the need for more resources for Zarma. We fine-tune three large language models on our dataset, obtaining a BLEU score of 30.06 on the best-performing model. We further evaluate the models on human judgments of fluency, comprehension, and readability, as well as on the importance and impact of the corpus and models. Our contributions help to bridge a significant language gap and promote an essential and overlooked indigenous African language.
- 2024.acl-srw.1
+ 2024.acl-srw.1
keita-etal-2024-feriji
10.18653/v1/2024.acl-srw.1
@@ -14649,8 +14649,8 @@
Seong mookKimSungkyunkwan University, South Korea
10-20
This study investigates how Large Language Models (LLMs), particularly BERT (Devlin et al., 2019) and GPT-2 (Radford et al., 2019), engage in pragmatic inference of scalar implicature, such as some. Two sets of experiments were conducted using cosine similarity and next sentence/token prediction as experimental methods. The results in experiment 1 showed that both models interpret some as the pragmatic implicature not all in the absence of context, aligning with human language processing. In experiment 2, in which a Question Under Discussion (QUD) was presented as a contextual cue, BERT showed consistent performance regardless of the type of QUD, while GPT-2 encountered processing difficulties, since a certain type of QUD required pragmatic inference for implicature. The findings revealed that, in terms of theoretical approaches, BERT inherently incorporates the pragmatic implicature not all within the term some, adhering to the Default model (Levinson, 2000). In contrast, GPT-2 seems to encounter processing difficulties in inferring pragmatic implicature within context, consistent with the Context-driven model (Sperber and Wilson, 2002).
- cho-kim-2024-pragmatic
- 2024.acl-srw.2
+ 2024.acl-srw.2
+ cho-ismkim99-skku-edu-2024-pragmatic
10.18653/v1/2024.acl-srw.2
@@ -14660,22 +14660,10 @@
HitomiYanakathe University of Tokyo
21-33
As conventional topic models rely on word co-occurrence to infer latent topics, topic modeling for short texts has been a long-standing challenge. Large Language Models (LLMs) can potentially overcome this challenge by contextually learning the meanings of words via pretraining. In this paper, we study two approaches to using LLMs for topic modeling: parallel prompting and sequential prompting. Input length limitations prevent LLMs from processing many texts at once. However, an arbitrary number of texts can be handled by LLMs by splitting the texts into smaller subsets and processing them in parallel or sequentially. Our experimental results demonstrate that our methods can identify more coherent topics than existing ones while maintaining the diversity of the induced topics. Furthermore, we found that the inferred topics cover the input texts to some extent, while hallucinated topics are hardly generated.
- 2024.acl-srw.3
+ 2024.acl-srw.3
doi-etal-2024-topic
10.18653/v1/2024.acl-srw.3
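The subset-splitting idea in this abstract is independent of any particular model. Below is a minimal sketch of the two prompting regimes, assuming a hypothetical `ask_llm(prompt) -> str` helper that wraps whichever chat API is used; the paper's actual prompts are not reproduced here.

```python
def chunk(texts, size):
    """Split a list of documents into subsets small enough for one prompt."""
    for i in range(0, len(texts), size):
        yield texts[i:i + size]

def topics_parallel(texts, size, ask_llm):
    # Parallel prompting: summarize each subset into topics independently,
    # then merge the partial topic lists in a final call.
    partial = [ask_llm("List the topics in these texts:\n" + "\n".join(c))
               for c in chunk(texts, size)]
    return ask_llm("Merge these topic lists into one coherent list:\n" + "\n".join(partial))

def topics_sequential(texts, size, ask_llm):
    # Sequential prompting: carry the running topic list from subset to subset.
    topics = ""
    for c in chunk(texts, size):
        topics = ask_llm(f"Current topics:\n{topics}\n"
                         "Update them given these additional texts:\n" + "\n".join(c))
    return topics
```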
-
- Can LLMs substitute SQL? Comparing Resource Utilization of Querying LLMs versus Traditional Relational Databases
- XiangZhangMetropolitan College, Boston University
- KhatoonKhedri
- RezaRawassizadeh
- 34-41
- Large Language Models (LLMs) can automate or substitute different types of tasks in the software engineering process. This study evaluates the resource utilization and accuracy of LLMs in interpreting and executing natural language queries against traditional SQL within relational database management systems. We empirically examine the resource utilization and accuracy of nine LLMs varying from 7 to 34 Billion parameters, including Llama2 7B, Llama2 13B, Mistral, Mixtral, Optimus-7B, SUS-chat-34B, platypus-yi-34b, NeuralHermes-2.5-Mistral-7B and Starling-LM-7B-alpha, using a small transaction dataset. Our findings indicate that using LLMs for database queries incurs significant energy overhead (even for small and quantized models), making it an environmentally unfriendly approach. Therefore, we advise against replacing relational databases with LLMs due to their substantial resource utilization.
- 2024.acl-srw.4
- zhang-etal-2024-llms
- 10.18653/v1/2024.acl-srw.4
-
-
Speech-to-Speech Translation with Discrete-Unit-Based Style Transfer
YongqiWangZhejiang University
@@ -14684,121 +14672,43 @@
RuiqiLi
ZhiqingHong
ZhouZhaoZhejiang University and Zhejiang University
- 42-49
+ 34-41
Direct speech-to-speech translation (S2ST) with discrete self-supervised representations has achieved remarkable accuracy, but is unable to preserve the speaker timbre of the source speech. Meanwhile, the scarcity of high-quality speaker-parallel data poses a challenge for learning style transfer during translation. We design an S2ST pipeline with style-transfer capability on the basis of discrete self-supervised speech representations and codec units. The acoustic language model we introduce for style transfer leverages self-supervised in-context learning, acquiring style transfer ability without relying on any speaker-parallel data, thereby overcoming data scarcity. By using extensive training data, our model achieves zero-shot cross-lingual style transfer on previously unseen source languages. Experiments show that our model generates translated speeches with high fidelity and speaker similarity. Audio samples are available at http://stylelm.github.io/ .
- 2024.acl-srw.5
+ 2024.acl-srw.5
wang-etal-2024-speech
10.18653/v1/2024.acl-srw.5
-
- InstructCoder: Instruction Tuning Large Language Models for Code Editing
- KaixinLi
- QishengHuNanyang Technological University
- JamesZhaonational university of singaore, National University of Singapore
- HuiChenNanyang Technological University
- YuxiXie
- TiedongLiu
- MichaelShiehNational University of Singapore
- JunxianHeHong Kong University of Science and Technology
- 50-70
- Code editing encompasses a variety of pragmatic tasks that developers deal with daily. Despite its relevance and practical usefulness, automatic code editing remains an underexplored area in the evolution of deep learning models, partly due to data scarcity. In this work, we explore the use of Large Language Models (LLMs) to edit code based on user instructions. Evaluated on a novel human-written execution-based benchmark dubbed EditEval, we found current models often struggle to fulfill the instructions. In light of this, we contribute InstructCoder, the first instruction-tuning dataset designed to adapt LLMs for general-purpose code editing, containing high-diversity code-editing tasks such as comment insertion, code optimization, and code refactoring. It consists of over 114,000 instruction-input-output triplets and covers multiple distinct code editing scenarios. The collection process starts with filtered commit data sourced from GitHub Python repositories as seeds. Subsequently, the dataset is systematically expanded through an iterative process, where both seed and generated tasks are used to prompt ChatGPT for more data. Our findings reveal that open-source LLMs fine-tuned on InstructCoder can significantly enhance the accuracy of code edits, exhibiting superior code-editing performance matching advanced proprietary LLMs. The datasets and the source code are publicly available.
- 2024.acl-srw.6
- li-etal-2024-instructcoder
- 10.18653/v1/2024.acl-srw.6
-
BiasDPO: Mitigating Bias in Language Models through Direct Preference Optimization
AhmedAllam
- 71-79
+ 42-50
Large Language Models (LLMs) have become pivotal in advancing natural language processing, yet their potential to perpetuate biases poses significant concerns. This paper introduces a new framework employing Direct Preference Optimization (DPO) to mitigate gender, racial, and religious biases in LLM-generated English text. By developing a loss function that favors less biased over biased completions, our approach cultivates a preference for respectful and non-discriminatory language in LLMs. We also contribute a manually designed dataset for training LLMs to recognize and correct biases. This dataset encompasses a diverse range of prompts paired with both biased and unbiased completions. Implementing this approach on the Microsoft Phi-2 model, we demonstrate substantial reductions in biased outputs as our model outperforms the baseline model on almost all bias benchmarks. Our model also achieves better performance compared to other open-source models on most benchmarks. By reducing biases in the language generated by the model, our study marks a significant step towards developing more ethical and socially responsible LLMs. We publicly release the BiasDPO dataset on HuggingFace.
- 2024.acl-srw.7
+ 2024.acl-srw.7
allam-2024-biasdpo
10.18653/v1/2024.acl-srw.7
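For context, the loss this abstract builds on is the standard Direct Preference Optimization objective of Rafailov et al. (2023), which trains the policy to prefer the less biased completion y_w over the biased one y_l relative to a reference model; the paper's exact variant may differ.

```latex
\mathcal{L}_{\mathrm{DPO}}(\pi_\theta;\pi_{\mathrm{ref}})
 = -\,\mathbb{E}_{(x,\,y_w,\,y_l)\sim\mathcal{D}}
   \left[\log \sigma\!\left(
     \beta \log \frac{\pi_\theta(y_w\mid x)}{\pi_{\mathrm{ref}}(y_w\mid x)}
   - \beta \log \frac{\pi_\theta(y_l\mid x)}{\pi_{\mathrm{ref}}(y_l\mid x)}
   \right)\right]
```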
-
- MoExtend: Tuning New Experts for Modality and Task Extension
- ShanshanZhong
- ShanghuaGaoHarvard University
- ZhongzhanHuangSun Yat-Sen University
- WushaoWenSUN YAT-SEN UNIVERSITY
- MarinkaZitnikHarvard University
- PanZhouSingapore Management University
- 80-91
- Large language models (LLMs) excel in various tasks but are primarily trained on text data, limiting their application scope. Expanding LLM capabilities to include vision-language understanding is vital, yet training them on multimodal data from scratch is challenging and costly. Existing instruction tuning methods, e.g., LLAVA, often connect a pretrained CLIP vision encoder to LLMs by fully fine-tuning the LLMs to bridge the modality gap. However, full fine-tuning is plagued by catastrophic forgetting, i.e., forgetting previous knowledge, and by high training costs, particularly in the era of increasing tasks and modalities. To solve this issue, we introduce MoExtend, an effective framework designed to streamline the modality adaptation and extension of Mixture-of-Experts (MoE) models. MoExtend seamlessly integrates new experts into pre-trained MoE models, endowing them with novel knowledge without the need to tune pretrained models such as MoE and vision encoders. This approach enables rapid adaptation and extension to new modal data or tasks, effectively addressing the challenge of accommodating new modalities within LLMs. Furthermore, MoExtend avoids tuning pretrained models, thus mitigating the risk of catastrophic forgetting. Experimental results demonstrate the efficacy and efficiency of MoExtend in enhancing the multimodal capabilities of LLMs, contributing to advancements in multimodal AI research.
- 2024.acl-srw.8
- zhong-etal-2024-moextend
- 10.18653/v1/2024.acl-srw.8
-
-
- On the Interpretability of Deep Learning Models for Collaborative Argumentation Analysis in Classrooms
- DeliangWang
- GaoweiChenUniversity of Hong Kong
- 92-102
- Collaborative argumentation holds significant potential for enhancing students’ learning outcomes within classroom settings. Consequently, researchers have explored the application of artificial intelligence (AI) to automatically analyze argumentation in these contexts. Despite the remarkable performance of deep learning models in this task, their lack of interpretability poses a critical challenge, leading to teachers’ skepticism and limited utilization. To cultivate trust among teachers, this PhD thesis proposal aims to leverage explainable AI techniques to provide explanations for these deep learning models. Specifically, the study develops two deep learning models for automated analysis of argument moves (claim, evidence, and warrant) and specificity levels (low, medium, and high) within collaborative argumentation. To address the interpretability issue, four explainable AI methods are proposed: gradient sensitivity, gradient input, integrated gradient, and LIME. Computational experiments demonstrate the efficacy of these methods in elucidating model predictions by computing word contributions, with LIME delivering exceptional performance. Moreover, a quasi-experiment is designed to evaluate the impact of model explanations on user trust and knowledge, serving as a future study of this PhD proposal. By tackling the challenges of interpretability and trust, this PhD thesis proposal aims to contribute to fostering user trust in AI and facilitating the practical implementation of AI in educational contexts.
- 2024.acl-srw.9
- wang-chen-2024-interpretability
- 10.18653/v1/2024.acl-srw.9
-
Document Alignment based on Overlapping Fixed-Length Segments
XiaotianWang
TakehitoUtsuroUniversity of Tsukuba
MasaakiNagataNTT Corporation
- 103-113
+ 51-61
Acquiring large-scale parallel corpora is crucial for NLP tasks such as Neural Machine Translation, and web crawling has become a popular methodology for this purpose. Previous studies have been conducted based on sentence-based segmentation (SBS) when aligning documents in various languages which are obtained through web crawling. Among them, the TK-PERT method (Thompson and Koehn, 2020) achieved state-of-the-art results and addressed the boilerplate text in web crawling data well through a down-weighting approach. However, there remains a problem with how to handle long-text encoding better. Thus, we introduce the strategy of Overlapping Fixed-Length Segmentation (OFLS) in place of SBS, and observe a pronounced enhancement when performing the same approach for document alignment. In this paper, we compare the SBS and OFLS using three previous methods, Mean-Pool, TK-PERT (Thompson and Koehn, 2020), and Optimal Transport (Clark et al., 2019; El-Kishky and Guzman, 2020), on the WMT16 document alignment shared task for French-English, as well as on our self-established Japanese-English dataset MnRN. As a result, for the WMT16 task, various SBS based methods showed an increase in recall by 1% to 10% after reproduction with OFLS. For MnRN data, OFLS demonstrated notable accuracy improvements and exhibited faster document embedding speed.
- 2024.acl-srw.10
+ 2024.acl-srw.10
wang-etal-2024-document
10.18653/v1/2024.acl-srw.10
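The OFLS idea itself fits in a few lines: cut a token stream into fixed-length windows that overlap by a stride smaller than the window, rather than cutting at sentence boundaries. A minimal sketch follows; the window and stride values are placeholders, not the paper's settings.

```python
def ofls(tokens, length=64, stride=32):
    """Overlapping Fixed-Length Segmentation: fixed-size windows, stride < length."""
    segments = []
    start = 0
    while True:
        segments.append(tokens[start:start + length])
        if start + length >= len(tokens):
            break  # the last window reached (or passed) the end of the stream
        start += stride
    return segments

# Every token appears in up to length/stride windows; the final window may be short.
print([len(s) for s in ofls(list(range(200)))])  # -> [64, 64, 64, 64, 64, 40]
```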
-
- Automatically Suggesting Diverse Example Sentences for L2 Japanese Learners Using Pre-Trained Language Models
- EnricoBenedetti
- AkikoAizawaNII, Tokyo Institute of Technology
- FlorianBoudinUniversity of Nantes
- 114-131
- Providing example sentences that are diverse and aligned with learners’ proficiency levels is essential for fostering effective language acquisition. This study examines the use of Pre-trained Language Models (PLMs) to produce example sentences targeting L2 Japanese learners. We utilize PLMs in two ways: as quality scoring components in a retrieval system that draws from a newly curated corpus of Japanese sentences, and as direct sentence generators using zero-shot learning. We evaluate the quality of sentences by considering multiple aspects such as difficulty, diversity, and naturalness, with a panel of raters consisting of learners of Japanese, native speakers – and GPT-4. Our findings suggest that there is inherent disagreement among participants on the ratings of sentence qualities, except for difficulty. Despite that, the retrieval approach was preferred by all evaluators, especially for beginner and advanced target proficiency, while the generative approaches received lower scores on average. Even so, our experiments highlight the potential for using PLMs to enhance the adaptability of sentence suggestion systems and therefore improve the language learning journey.
- 2024.acl-srw.11
- benedetti-etal-2024-automatically
- 10.18653/v1/2024.acl-srw.11
-
-
- Z-coref: Thai Coreference and Zero Pronoun Resolution
- PoomphobSuwannapichatKing Mongkut’s University of Technology Thonburi
- SansiriTarnpradabKing Mongkut’s University of Technology Thonburi
- SantithamProm-onKing Mongkut’s University of Technology Thonburi
- 132-139
- Coreference Resolution (CR) and Zero Pronoun Resolution (ZPR) are vital for extracting meaningful information from text. However, limited research and datasets pose significant challenges in the Thai language. To address this, we developed an annotated joint CR and ZPR dataset. Additionally, we introduced the Z-coref model, capable of simultaneously handling CR and ZPR tasks by adjusting the span definition of a prior CR architecture to include token gaps. The proposed model trained on our dataset outperformed the state of the art on both CR and ZPR, while taking less time to train.
- 2024.acl-srw.12
- suwannapichat-etal-2024-z
- 10.18653/v1/2024.acl-srw.12
-
ReMAG-KR: Retrieval and Medically Assisted Generation with Knowledge Reduction for Medical Question Answering
SidhaarthMurali
SowmyaS.National Institute of Technology Karnataka
SupreethaR
- 140-145
+ 62-67
Large Language Models (LLMs) have significant potential for facilitating intelligent end-user applications in healthcare. However, hallucinations remain an inherent problem with LLMs, making it crucial to address this issue with extensive medical knowledge and data. In this work, we propose a Retrieve-and-Medically-Augmented-Generation with Knowledge Reduction (ReMAG-KR) pipeline, employing a carefully curated knowledge base using cross-encoder re-ranking strategies. The pipeline is tested on medical MCQ-based QA datasets as well as general QA datasets. It was observed that when the knowledge base is reduced, the model’s performance decreases by 2-8%, while the inference time improves by 47%.
- 2024.acl-srw.13
+ 2024.acl-srw.13
murali-etal-2024-remag
10.18653/v1/2024.acl-srw.13
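The cross-encoder re-ranking step mentioned in this abstract takes a generic form with the sentence-transformers library; the checkpoint name, query, and passages below are illustrative assumptions, not the authors' setup.

```python
from sentence_transformers import CrossEncoder

# Score (query, passage) pairs jointly and keep the highest-scoring passages.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # common public checkpoint
query = "What is the first-line treatment for hypertension?"     # hypothetical query
passages = ["...candidate passage 1...", "...candidate passage 2..."]
scores = reranker.predict([(query, p) for p in passages])
ranked = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)
```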
-
- Plot Retrieval as an Assessment of Abstract Semantic Association
- ShichengXu
- LiangPangInstitute of Computing Technology, Chinese Academy of Sciences
- JiangnanLiWeChat, Tencent Inc.
- MoYuWeChat AI, Tencent
- FandongMengWeChat AI, Tencent Inc.
- HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences
- XueqiCheng, Chinese Academy of Sciences
- JieZhou
- 146-161
- Retrieving relevant plots from the book for a query is a critical task, which can improve the reading experience and efficiency of readers. Readers usually only give an abstract and vague description as the query based on their own understanding, summaries, or speculations of the plot, which requires the retrieval model to have a strong ability to estimate the abstract semantic associations between the query and candidate plots. However, existing information retrieval (IR) datasets cannot reflect this ability well. In this paper, we propose PlotRetrieval, a labeled dataset to train and evaluate the performance of IR models on the novel task Plot Retrieval. Text pairs in PlotRetrieval have less word overlap and more abstract semantic association, which can reflect the ability of the IR models to estimate the abstract semantic association, rather than just traditional lexical or semantic matching. Extensive experiments across various lexical retrieval, sparse retrieval, dense retrieval, and cross-encoder methods compared with human studies on PlotRetrieval show current IR models still struggle in capturing abstract semantic association between texts. PlotRetrieval can be the benchmark for further research on the semantic association modeling ability of IR models.
- 2024.acl-srw.14
- xu-etal-2024-plot
- 10.18653/v1/2024.acl-srw.14
-
Demystifying Instruction Mixing for Fine-tuning Large Language Models
RenxiWang
@@ -14808,9 +14718,9 @@
XudongHanUniversity of Melbourne
ChiyuZhangUniversity of British Columbia
TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne
- 162-169
+ 68-75
Instruction tuning significantly enhances the performance of large language models (LLMs) across various tasks. However, the procedure for optimizing the mixing of instruction datasets for LLM fine-tuning is still poorly understood. This study categorizes instructions into three primary types: NLP downstream tasks, coding, and general chat. We explore the effects of instruction tuning with different combinations of datasets on LLM performance, and find that certain instruction types are more advantageous for specific applications but can negatively impact other areas. This work provides insights into instruction mixtures, laying the foundations for future research.
- 2024.acl-srw.15
+ 2024.acl-srw.15
wang-etal-2024-demystifying
10.18653/v1/2024.acl-srw.15
@@ -14818,9 +14728,9 @@
Fine-Tuning ASR models for Very Low-Resource Languages: A Study on Mvskoke
JuliaMainzinger
Gina-AnneLevowUniversity of Washington and University of Washington
- 170-176
+ 76-82
Recent advancements in multilingual models for automatic speech recognition (ASR) have been able to achieve a high accuracy for languages with extremely limited resources. This study examines ASR modeling for the Mvskoke language, an indigenous language of America. The parameter efficiency of adapter training is contrasted with training entire models, and it is demonstrated how performance varies with different amounts of data. Additionally, the models are evaluated with trigram language model decoding, and the outputs are compared across different types of speech recordings. Results show that training an adapter is both parameter efficient and gives higher accuracy for a relatively small amount of data.
- 2024.acl-srw.16
+ 2024.acl-srw.16
mainzinger-levow-2024-fine
10.18653/v1/2024.acl-srw.16
@@ -14829,9 +14739,9 @@
AngelinaParfenovaUniversität Grenoble Alpes
AlexanderDenzlerLucerne University of Applied Sciences and Arts
JörgenPfefferTechnische Universität München
- 177-185
+ 83-91
This PhD proposal aims to investigate ways of automating qualitative data analysis, specifically the thematic coding of texts. Although existing methods are covered extensively in the literature, they mainly use Topic Modeling and other quantitative approaches, whose outputs are far from resembling a human analyst’s outcome. This proposal examines the limitations of current research in the field. It proposes a novel methodology based on Large Language Models to tackle automated coding and bring it as close as possible to the results of human researchers. This paper covers studies already done in this field and their limitations, existing software, the problem of replicating researcher bias, and the proposed methodology.
- 2024.acl-srw.17
+ 2024.acl-srw.17
parfenova-etal-2024-automating
10.18653/v1/2024.acl-srw.17
@@ -14840,9 +14750,9 @@
JanekHerrleinBayerische Julius-Maximilians-Universität Würzburg
Chia-ChienHungNEC Laboratories Europe and Universität Mannheim
GoranGlavašJulius-Maximilians-Universität Würzburg
- 186-194
+ 92-100
Research on token-level reference-free hallucination detection has predominantly focused on English, primarily due to the scarcity of robust datasets in other languages. This has hindered systematic investigations into the effectiveness of cross-lingual transfer for this important NLP application. To address this gap, we introduce ANHALTEN, a new evaluation dataset that extends the English hallucination detection dataset to German. To the best of our knowledge, this is the first work that explores cross-lingual transfer for token-level reference-free hallucination detection. ANHALTEN contains gold annotations in German that are parallel (i.e., directly comparable to the original English instances). We benchmark several prominent cross-lingual transfer approaches, demonstrating that larger context length leads to better hallucination detection in German, even without succeeding context. Importantly, we show that the sample-efficient few-shot transfer is the most effective approach in most setups. This highlights the practical benefits of minimal annotation effort in the target language for reference-free hallucination detection. Aiming to catalyze future research on cross-lingual token-level reference-free hallucination detection, we make ANHALTEN publicly available: https://github.com/janekh24/anhalten
- 2024.acl-srw.18
+ 2024.acl-srw.18
herrlein-etal-2024-anhalten
10.18653/v1/2024.acl-srw.18
@@ -14851,9 +14761,9 @@
ThanakornThaminkaewChulalongkorn University
PiyawatLertvittayakumjornGoogle
PeeraponVateekulChulalongkorn University
- 195-203
+ 101-109
Prompt-based learning has shown its effectiveness in few-shot text classification. A key factor in its success is a verbalizer, which translates output from a language model into a predicted class. Notably, the simplest and widely acknowledged verbalizer employs manual labels to represent the classes. However, manual selection may not yield the optimal words for a given language model, potentially leading to subpar classification performance, especially in mid-to-low resource languages with weaker language models. Therefore, we propose Label-Aware Automatic Verbalizer (LAAV), effectively augmenting manual labels for improved few-shot classification results. Specifically, we utilize the label name along with the conjunction “and” to induce the model to generate more effective words for the verbalizer. Experimental results on four mid-to-low resource Southeast Asian languages demonstrate that LAAV significantly outperforms existing verbalizers.
- 2024.acl-srw.19
+ 2024.acl-srw.19
thaminkaew-etal-2024-label
10.18653/v1/2024.acl-srw.19
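The core trick described here, using the label name plus the conjunction "and" to elicit additional verbalizer words from a masked language model, can be sketched with the transformers fill-mask pipeline. The template, label, and model below are illustrative assumptions, not the paper's exact configuration.

```python
from transformers import pipeline

fill = pipeline("fill-mask", model="bert-base-multilingual-cased")
label = "sports"  # hypothetical class label name
# "and" nudges the model toward words coordinated with the label name.
candidates = fill(f"This text is about {label} and {fill.tokenizer.mask_token}.")
verbalizer_words = [c["token_str"] for c in candidates]  # augmented verbalizer entries
```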
@@ -14862,19 +14772,19 @@
LouisEstève
AgataSavaryUniversité Paris-Saclay
ThomasLavergne
- 204-224
- Multiword Expressions (MWEs) make a good case study for linguistic diversity due to their idiosyncratic nature. Defining MWE canonical forms as types, diversity may be measured notably through disparity, based on pairwise distances between types. To this aim, we train static MWE-aware word embeddings for verbal MWEs in 14 languages, and we show interesting properties of these vector spaces. We use these vector spaces to implement the so-called functional diversity measure. We apply this measure to the results of several MWE identification systems. We find that, although MWE vector spaces are meaningful at a local scale, the disparity measure aggregating them at a global scale strongly correlates with the number of types, which questions its usefulness in presence of simpler diversity metrics such as variety. We make the vector spaces we generated available.
- 2024.acl-srw.20
- esteve-etal-2024-vector
+ 110-130
+ Multiword Expressions (MWEs) make a good case study for linguistic diversity due to their idiosyncratic nature. Defining MWE canonical forms as types, diversity may be measured notably through disparity, based on pairwise distances between types. To this aim, we train static MWE-aware word embeddings for verbal MWEs in 14 languages, and we show interesting properties of these vector spaces. We use these vector spaces to implement the so-called functional diversity measure. We apply this measure to the results of several MWE identification systems. We find that, although MWE vector spaces are meaningful at a local scale, the disparity measure aggregating them at a global scale strongly correlates with the number of types, which questions its usefulness in presence of simpler diversity metrics such as variety. We make the vector spaces we generated available.
+ 2024.acl-srw.20
+ estve-etal-2024-vector
10.18653/v1/2024.acl-srw.20
Narratives at Conflict: Computational Analysis of News Framing in Multilingual Disinformation Campaigns
AntoninaSinelnik
DirkHovyBocconi University
- 225-237
+ 131-143
Any report frames issues to favor a particular interpretation by highlighting or excluding certain aspects of a story. Despite the widespread use of framing in disinformation, framing properties and detection methods remain underexplored outside the English-speaking world. We explore how multilingual framing of the same issue differs systematically. We use eight years of Russia-backed disinformation campaigns, spanning 8k news articles in 4 languages targeting 15 countries. We find that disinformation campaigns consistently and intentionally favor specific framing, depending on the target language of the audience. We further discover how Russian-language articles consistently highlight selected frames depending on the region of the media coverage. We find that the two most prominent models for automatic frame analysis underperform and show high disagreement, highlighting the need for further research.
- 2024.acl-srw.21
+ 2024.acl-srw.21
sinelnik-hovy-2024-narratives
10.18653/v1/2024.acl-srw.21
@@ -14883,9 +14793,9 @@
JulianSchelbUniversität Konstanz
AndreasSpitzUniversität Konstanz
RobertoUlloa
- 238-252
+ 144-158
Researchers in the political and social sciences often rely on classification models to analyze trends in information consumption by examining browsing histories of millions of webpages. Automated scalable methods are necessary due to the impracticality of manual labeling. In this paper, we model the detection of topic-related content as a binary classification task and compare the accuracy of fine-tuned pre-trained encoder models against in-context learning strategies. Using only a few hundred annotated data points per topic, we detect content related to three German policies in a database of scraped webpages. We compare multilingual and monolingual models, as well as zero and few-shot approaches, and investigate the impact of negative sampling strategies and the combination of URL & content-based features. Our results show that a small sample of annotated data is sufficient to train an effective classifier. Fine-tuning encoder-based models yields better results than in-context learning. Classifiers using both URL & content-based features perform best, while using URLs alone provides adequate results when content is unavailable.
- 2024.acl-srw.22
+ 2024.acl-srw.22
schelb-etal-2024-assessing
10.18653/v1/2024.acl-srw.22
@@ -14895,9 +14805,9 @@
JundaiSuzukiTokyo Denki University, Tokyo Institute of Technology
MasakiShuzoTokyo Denki University
EisakuMaedaTokyo Denki University
- 253-263
+ 159-169
Large Language Models (LLMs) are considered to have potentially extensive knowledge, but because their internal processing is black-boxed, it has been difficult to directly edit the knowledge held by the LLMs themselves. To address this issue, a method called local modification-based knowledge editing has been developed. This method identifies the knowledge neurons that encode the target knowledge and adjusts the parameters associated with these neurons to update the knowledge. Knowledge neurons are identified by masking the *o* part from sentences representing relational triplets (*s*, *r*, *o*), having the LLM predict the masked part, and observing the LLM’s activation during the prediction. When the architecture is decoder-based, the predicted *o* needs to be located at the end of the sentence. Previous local modification-based knowledge editing methods for decoder-based models have assumed SVO languages and faced challenges when applied to SOV languages such as Japanese. In this study, we propose a knowledge editing method that eliminates the need for word order constraints by converting the input for identifying knowledge neurons into a question where *o* is the answer. We conducted validation experiments on 500 examples and confirmed that the proposed method is effective for Japanese, a non-SVO language. We also applied this method to English, an SVO language, and demonstrated that it outperforms conventional methods.
- 2024.acl-srw.23
+ 2024.acl-srw.23
ishigaki-etal-2024-knowledge
10.18653/v1/2024.acl-srw.23
@@ -14907,9 +14817,9 @@
MiaoranZhangSaarland University
MariusMosbachMcGill University and Mila - Quebec Artificial Intelligence Institute
DietrichKlakowSaarland University
- 264-279
+ 170-185
Identifying beneficial tasks to transfer from is a critical step toward successful intermediate-task transfer learning. In this work, we experiment with 130 source-target task combinations and demonstrate that the transfer performance exhibits severe variance across different source tasks and training seeds, highlighting the crucial role of intermediate-task selection in a broader context. We compare four representative task selection methods in a unified setup, focusing on their effectiveness and consistency. Compared to embedding-free methods and text embeddings, task embeddings constructed from fine-tuned weights can better estimate task transferability, improving task prediction scores from 2.59% to 3.96%. Despite their strong performance, we observe that the task embeddings do not consistently demonstrate superiority for tasks requiring reasoning abilities. Furthermore, we introduce a novel method that measures pairwise token similarity using maximum inner product search, leading to the highest performance in task prediction. Our findings suggest that token-wise similarity is a better predictor of transferability than averaging weights.
- 2024.acl-srw.24
+ 2024.acl-srw.24
lin-etal-2024-exploring
10.18653/v1/2024.acl-srw.24
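The maximum-inner-product idea from this abstract (for each token representation of one task, take its largest inner product with any token of the other task, then average the maxima) can be written directly in numpy. This is a schematic reading of the description, not the authors' code.

```python
import numpy as np

def token_mips_similarity(A: np.ndarray, B: np.ndarray) -> float:
    """A: (n, d) token embeddings of one task; B: (m, d) of another.
    For each row of A, take the maximum inner product over the rows of B,
    then average those maxima into a single transferability score."""
    sims = A @ B.T  # (n, m) matrix of pairwise inner products
    return float(sims.max(axis=1).mean())

rng = np.random.default_rng(0)
score = token_mips_similarity(rng.normal(size=(5, 8)), rng.normal(size=(7, 8)))
```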
@@ -14920,9 +14830,9 @@
SabrinaCampanoEDF R&D
LydiaOuali
CyrilGrouinCNRS
- 280-285
+ 186-191
The processing of long sequences remains a research subject in its own right, including for automatic summarization, despite recent improvements. In this work, we present experiments on the automatic summarization of scientific articles using BART models, taking into account textual information coming from distinct passages of the long texts to be summarized. We demonstrate that taking document structure into account improves the performance of state-of-the-art models and approaches the performance of Longformer on English.
- 2024.acl-srw.25
+ 2024.acl-srw.25
sauvage-etal-2024-structure
10.18653/v1/2024.acl-srw.25
@@ -14931,9 +14841,9 @@
AnirudhKondapallyHonda R&D Co., Ltd.
KentaroYamadaHonda R&D Co., Ltd.
HitomiYanakathe University of Tokyo
- 286-293
+ 192-199
Vision-and-Language Navigation (VLN) encompasses interacting with autonomous vehicles using language and visual input from the perspective of mobility. Most of the previous work in this field focuses on spatial reasoning and the semantic grounding of visual information. However, reasoning based on the actions of pedestrians in the scene is not much considered. In this study, we provide a VLN dataset for destination prediction with action inference to investigate the extent to which current VLN models perform action inference. We introduce a crowd-sourcing process to construct a dataset for this task in two steps: (1) collecting beliefs about the next action for a pedestrian and (2) annotating the destination considering the pedestrian’s next action. Our benchmarking results of the models on destination prediction lead us to believe that the models can learn to reason about the effect of the action and the next action on the destination to a certain extent. However, there is still much scope for improvement.
- 2024.acl-srw.26
+ 2024.acl-srw.26
kondapally-etal-2024-action
10.18653/v1/2024.acl-srw.26
@@ -14941,9 +14851,9 @@
A Computational Analysis and Exploration of Linguistic Borrowings in French Rap Lyrics
LucasZurbuchen
RobVoigtNorthwestern University
- 294-302
+ 200-208
In France, linguistic borrowings in the relatively conservative French language are an important site of cultural debate, and rap in particular is a hotspot for borrowings. In this work, we use computational methods to understand the factors that affect the prominence and prevalence of a borrowing. To do so, we manually annotate a lexicon of over 700 borrowings occurring in this context (including key aspects for each borrowing such as origin and semantic class). We analyze the prevalence of these borrowings in a newly collected corpus of over 8000 French rap song lyrics and find that there are increases in the proportion of linguistic borrowings, interjections, and Niger-Congo borrowings while terms related to the arts are decreasing in prevalence. We release our code and data to facilitate further research in this area and discuss potential future directions.
- 2024.acl-srw.27
+ 2024.acl-srw.27
zurbuchen-voigt-2024-computational
10.18653/v1/2024.acl-srw.27
@@ -14953,9 +14863,9 @@
FlorianSchneiderUniversität Hamburg
IrinaNikishina
ChrisBiemannU Hamburg
- 303-338
+ 209-244
Large Language Models (LLMs) such as ChatGPT, GitHub Copilot, Llama, or Mistral assist programmers as copilots and knowledge sources to make the coding process faster and more efficient. This paper aims to improve the copilot performance by implementing different self-alignment processes and retrieval-augmented generation (RAG) pipelines, as well as their combination. To test the effectiveness of all approaches, we create a dataset and apply a model-based evaluation, using LLM as a judge. It is designed to check the model’s abilities to understand the source code semantics, the dependency between files, and the overall meta-information about the repository. We also compare our approach with other existing solutions, e.g. ChatGPT-3.5, and evaluate on the existing benchmarks. Code and dataset are available online (https://anonymous.4open.science/r/ma_llm-382D).
- 2024.acl-srw.28
+ 2024.acl-srw.28
strich-etal-2024-improving
10.18653/v1/2024.acl-srw.28
@@ -14964,31 +14874,21 @@
FabioPernisi
DirkHovyBocconi University
PaulRöttgerBocconi University
- 339-345
+ 245-251
As diverse linguistic communities and users adopt Large Language Models (LLMs), assessing their safety across languages becomes critical. Despite ongoing efforts to align these models with safe and ethical guidelines, they can still be induced into unsafe behavior with jailbreaking, a technique in which models are prompted to act outside their operational guidelines. The research conducted on these vulnerabilities so far has focused predominantly on English, limiting the understanding of LLM behavior in other languages. We address this gap by investigating Many-Shot Jailbreaking (MSJ) in Italian, underscoring the importance of understanding LLM behavior in different languages. We base our analysis on a newly created Italian dataset to identify unique safety vulnerabilities in 4 families of open-source LLMs. We find that the models exhibit unsafe behaviors even with minimal exposure to harmful prompts, and, more alarmingly, this tendency rapidly escalates with more demonstrations.
- 2024.acl-srw.29
+ 2024.acl-srw.29
pernisi-etal-2024-compromesso
10.18653/v1/2024.acl-srw.29
-
- Foundation Model for Biomedical Graphs: Integrating Knowledge Graphs and Protein Structures to Large Language Models
- YunsooKim
- 346-355
- The Transformer model has been a de-facto standard in natural language processing. Its adaptations in other fields such as computer vision showed promising results, suggesting that this architecture is a powerful neural network for representation learning regardless of the data type. This recent success has led to research in multimodal Large Language Models (LLMs), which has enabled new types of tasks and applications with multiple data types. However, multimodal LLMs in the biomedical domain are primarily limited to images, text, and/or sequence data. Here I propose to work on a multimodal LLM architecture for biomedical graphs such as protein structures and chemical molecules. The research hypothesis is based on the fact that clinicians and researchers in computational biology and clinical research take advantage of various information for their decision-making process. Therefore, an AI model being able to handle multiple data types should boost its ability to use diverse knowledge for improved performances in clinical applications.
- 2024.acl-srw.30
- kim-2024-foundation
- 10.18653/v1/2024.acl-srw.30
-
-
ViMedAQA: A Vietnamese Medical Abstractive Question-Answering Dataset and Findings of Large Language Model
Minh-NamTran
Phu-VinhNguyen
LongNguyenHo Chi Minh city University of Science, Vietnam National University
DienDinh
- 356-364
+ 252-260
Question answering involves creating answers to questions. With the growth of large language models, the ability of question-answering systems has dramatically improved. However, there is a lack of Vietnamese abstractive question-answering datasets, especially in the medical domain. Therefore, this research aims to mitigate this gap by introducing ViMedAQA. This **Vi**etnamese **Med**ical **A**bstractive **Q**uestion-**A**nswering dataset covers four topics in the Vietnamese medical domain, including body parts, disease, drugs and medicine. Additionally, the empirical results on the proposed dataset examine the capability of the large language models in the Vietnamese medical domain, including reasoning, memorizing and awareness of essential information.
- 2024.acl-srw.31
+ 2024.acl-srw.31
tran-etal-2024-vimedaqa
10.18653/v1/2024.acl-srw.31
@@ -15000,9 +14900,9 @@
QiZhangFudan University
TaoGuiFudan University
FeiLiuEmory University
- 365-376
+ 261-272
Customizing LLMs for a specific task involves separating high-quality responses from lower-quality ones. This skill can be developed using supervised fine-tuning with extensive human preference data. However, obtaining a large volume of expert-annotated data is costly for most tasks. In this paper, we explore a novel method to optimize LLMs using ranking metrics. This method trains the model to prioritize the best responses from a pool of candidates created for a particular task. Rather than a traditional full ordering, we advocate for a partial ordering, as achieving consensus on the perfect order of candidate responses can be challenging. Our partial ordering is more robust, less sensitive to noise, and can be achieved with limited human annotations or through heuristic methods. We test our system’s improved response generation ability using benchmark datasets, including textual entailment and multi-document question answering. We conduct ablation studies to understand crucial factors, such as how to gather candidate responses for a specific task, determine their most suitable order, and balance supervised fine-tuning with ranking metrics. Our approach, named RESCUE, offers a promising avenue for enhancing the response generation and task accuracy of LLMs.
- 2024.acl-srw.32
+ 2024.acl-srw.32
wang-etal-2024-rescue
10.18653/v1/2024.acl-srw.32
@@ -15011,9 +14911,9 @@
JolieZhouUniversity of Washington
CamilleColeIllinois State University
AnnieChenUniversity of Washington
- 377-390
+ 273-286
Geoparsing, the task of assigning coordinates to locations extracted from free text, is invaluable in enabling us to place locations in time and space. In the historical domain, many geoparsing corpora are from large news collections. We examine the Svoboda Diaries, a small historical corpus written primarily in English, with many location names in transliterated Arabic. We develop a pipeline employing named entity recognition for geotagging, and a map-based generate-and-rank approach incorporating candidate name augmentation and clustering of location context words for geocoding. Our system outperforms existing map-based geoparsers in terms of accuracy, lowest mean distance error, and number of locations correctly identified. As location names may vary from those in knowledge bases, we find that augmented candidate generation is instrumental in the system’s performance. Among our candidate generation methods, the generation of transliterated names contributed the most to increased location matches in the knowledge base. Our main contribution is proposing an integrated pipeline for geoparsing of historical corpora using augmented candidate location name generation and clustering methods – an approach that can be generalized to other texts with foreign or non-standard spellings.
- 2024.acl-srw.33
+ 2024.acl-srw.33
zhou-etal-2024-basreh
10.18653/v1/2024.acl-srw.33
@@ -15022,9 +14922,9 @@
SophieWuMcGill University
AnitaZhengMcGill University
JoeyChuangMcGill University
- 391-396
+ 287-292
This paper introduces a novel method for empirically evaluating the relationship between the phonological and semantic similarity of linguistic units using embedding spaces. Chinese character homophones are used as a proof-of-concept. We employ cosine similarity as a proxy for semantic similarity between characters, and compare relationships between phonologically-related characters and baseline characters (chosen as similar-frequency characters). We show there is a strongly statistically significant positive semantic relationship among different Chinese characters at varying levels of sound-sharing. We also perform some basic probing using t-SNE and UMAP visualizations, and indicate directions for future applications of this method.
- 2024.acl-srw.34
+ 2024.acl-srw.34
wu-etal-2024-homophone2vec
10.18653/v1/2024.acl-srw.34
@@ -15032,9 +14932,9 @@
Trace-of-Thought Prompting: Investigating Prompt-Based Knowledge Distillation Through Question Decomposition
TylerMcDonaldBrock University
AliEmamiBrock University
- 397-410
+ 293-306
Knowledge distillation allows smaller neural networks to emulate the performance of larger, teacher models with reduced computational demands. Traditional methods for Large Language Models (LLMs) often necessitate extensive fine-tuning, which limits their accessibility. To address this, we introduce Trace-of-Thought Prompting, a novel framework designed to distill critical reasoning capabilities from large-scale teacher models (over 8 billion parameters) to small-scale student models (up to 8 billion parameters). This approach leverages problem decomposition to enhance interpretability and facilitate human-in-the-loop interventions. Empirical evaluations on the GSM8K and MATH datasets show that student models achieve accuracy gains of up to 113% on GSM8K and 20% on MATH, with significant improvements particularly notable in smaller models like Llama 2 and Zephyr. Our results suggest a promising pathway for open-source, small-scale models to eventually serve as both students and teachers, potentially reducing our reliance on large-scale, proprietary models. Our code, featuring data analytics and testing scripts, is provided here: https://github.com/traceofthought/trace-of-thought-prompting/tree/main.
- 2024.acl-srw.35
+ 2024.acl-srw.35
mcdonald-emami-2024-trace
10.18653/v1/2024.acl-srw.35
@@ -15045,71 +14945,48 @@
ArijitChowdhuryAmazon
KarthikVenkat Ramanan
AmanChadhaAmazon
- 411-421
+ 307-317
Large Language Models (LLMs) have demonstrated impressive zero-shot performance on a wide range of NLP tasks, showing the ability to reason and apply common sense. A relevant application is to use them for creating high-quality synthetic datasets for downstream tasks. In this work, we probe whether GPT-4 can be used to augment existing extractive reading comprehension datasets. Automating data annotation processes has the potential to save large amounts of time, money, and effort that go into manually labeling datasets. In this paper, we evaluate the performance of GPT-4 as a replacement for human annotators for low-resource reading comprehension tasks, by comparing performance after fine-tuning, and the cost associated with annotation. This work serves as the first analysis of LLMs as synthetic data augmenters for QA systems, highlighting the unique opportunities and challenges. Additionally, we release augmented versions of low-resource datasets that will allow the research community to create further benchmarks for evaluation of generated datasets. GitHub repository available at https://github.com/vsamuel2003/qa-gpt4
- 2024.acl-srw.36
+ 2024.acl-srw.36
samuel-etal-2024-llms
10.18653/v1/2024.acl-srw.36
Automatic Derivation of Semantic Representations for Thai Serial Verb Constructions: A Grammar-Based Approach
VipashaBansal
- 422-437
+ 318-333
Deep semantic representations are useful for many NLU tasks (Droganova and Zeman 2019; Schuster and Manning 2016). Manual annotation to build these representations is time-consuming, and so automatic approaches are preferred (Droganova and Zeman 2019; Bender et al. 2015). This paper demonstrates how rich semantic representations can be automatically derived for Thai Serial Verb Constructions (SVCs), where the semantic relationship between component verbs is not immediately clear from the surface forms. I present the first fully-implemented HPSG analysis for Thai SVCs, deriving appropriate semantic representations (MRS; Copestake et al. 2005) from syntactic features, implemented within a DELPH-IN computational grammar (Slayden 2009). This analysis increases verified coverage of SVCs by 73% and decreases ambiguity by 46%. The final grammar can be found at: https://github.com/VipashaB94/ThaiGrammar
- 2024.acl-srw.37
+ 2024.acl-srw.37
bansal-2024-automatic
10.18653/v1/2024.acl-srw.37
-
- Seed-Free Synthetic Data Generation Framework for Instruction-Tuning LLMs: A Case Study in Thai
- ParinthapatPengpun
- CanUdomcharoenchaikitVidyasirimedhi Institute of Science and Technology (VISTEC)
- WeerayutBuaphet
- PeeratLimkonchotiwat
- 438-457
- We present a synthetic data approach for instruction-tuning large language models (LLMs) for low-resource languages in a data-efficient manner, specifically focusing on Thai. We identify three key properties that contribute to the effectiveness of instruction-tuning datasets: fluency, diversity, and cultural context. We propose a seed-data-free framework for generating synthetic instruction-tuning data that incorporates these essential properties. Our framework employs an LLM to generate diverse topics, retrieve relevant contexts from Wikipedia, and create instructions for various tasks, such as question answering, summarization, and conversation. The experimental results show that our best-performing synthetic dataset, which incorporates all three key properties, achieves competitive performance using only 5,000 instructions when compared to state-of-the-art Thai LLMs trained on hundreds of thousands of instructions. Our code and dataset are publicly available at https://github.com/parinzee/seed-free-synthetic-instruct.
- 2024.acl-srw.38
- pengpun-etal-2024-seed
- 10.18653/v1/2024.acl-srw.38
-
-
Bridging Distribution Gap via Semantic Rewriting with LLMs to Enhance OOD Robustness
ManasMadine
- 458-468
+ 334-344
This paper investigates the robustness of Large Language Models (LLMs) against Out-Of-Distribution (OOD) data within the context of sentiment analysis. Traditional fine-tuning approaches often fail to generalize effectively across different data distributions, limiting the practical deployment of LLMs in dynamic real-world scenarios. To address this challenge, we introduce a novel method called “Semantic Rewriting,” which leverages the inherent flexibility of LLMs to align both in-distribution (ID) and OOD data with the LLMs’ distributions. By semantically transforming sentences to minimize linguistic discrepancies, our approach helps to standardize features across datasets, thus enhancing model robustness. We conduct extensive experiments with several benchmark datasets and LLMs to validate the efficacy of our method. The results demonstrate that Semantic Rewriting significantly improves the performance of models on OOD tasks, outperforming traditional methods in both robustness and generalization capabilities. Our findings suggest that Semantic Rewriting is a promising technique for developing more reliable and versatile NLP systems capable of performing robustly across diverse operational environments.
- 2024.acl-srw.39
+ 2024.acl-srw.39
madine-2024-bridging
10.18653/v1/2024.acl-srw.39
CoVoSwitch: Machine Translation of Synthetic Code-Switched Text Based on Intonation Units
YeeunKang
- 469-481
+ 345-357
Multilingual code-switching research is often hindered by the scarcity and linguistic bias of available datasets. To expand language representation, we synthesize code-switching data by replacing intonation units detected through PSST, a speech segmentation model fine-tuned from OpenAI’s Whisper, using a speech-to-text translation dataset, CoVoST 2. With our dataset, CoVoSwitch, spanning 13 languages, we evaluate the code-switching translation performance of two multilingual translation models, M2M-100 418M and NLLB-200 600M. We reveal that the inclusion of code-switching units results in higher translation performance than monolingual settings and that models are better at code-switching translation into English than non-English. Further, low-resource languages gain the most from the integration of code-switched units when translating into English but much less when translating into non-English. Translations into low-resource languages also perform worse than even raw code-switched inputs. We find that systems excel at copying English tokens but struggle with non-English tokens, that the off-target problem in monolingual settings is also relevant in code-switching settings, and that models hallucinate in code-switching translation by introducing words absent in both of the original source sentences. CoVoSwitch and code are available at https://github.com/sophiayk20/covoswitch.
- 2024.acl-srw.40
+ 2024.acl-srw.40
kang-2024-covoswitch
10.18653/v1/2024.acl-srw.40
-
- An Analysis under a Unified Formulation of Learning Algorithms with Output Constraints
- MoohoSongSeoul National University
- Jay-YoonLeeSeoul National University
- 482-498
- Neural networks (NN) perform well in diverse tasks, but sometimes produce nonsensical results to humans. Most NN models “solely” learn from (input, output) pairs, occasionally conflicting with human knowledge. Many studies indicate injecting human knowledge by reducing output constraints during training can improve model performance and reduce constraint violations.While there have been several attempts to compare different existing algorithms under the same programming framework, nonetheless, there has been no previous work that categorizes learning algorithms with output constraints in a unified manner. Our contributions are as follows: (1) We categorize the previous studies based on three axes: type of constraint loss used (e.g. probabilistic soft logic, REINFORCE), exploration strategy of constraint-violating examples, and integration mechanism of learning signals from main task and constraint.(2) We propose new algorithms to integrate the information of main task and constraint injection, inspired by continual-learning algorithms.(3) Furthermore, we propose the H\beta-score as a metric for considering the main task metric and constraint violation simultaneously.To provide a thorough analysis, we examine all the algorithms on three NLP tasks: natural language inference (NLI), synthetic transduction examples (STE), and semantic role labeling (SRL). We explore and reveal the key factors of various algorithms associated with achieving high H\beta-scores.
- 2024.acl-srw.41
- song-lee-2024-analysis
- 10.18653/v1/2024.acl-srw.41
-
Beyond Abstracts: A New Dataset, Prompt Design Strategy and Method for Biomedical Synthesis Generation
JamesO’Doherty
CianNolan
YufangHouTechnische Universität Darmstadt and IBM Research Ireland
AnyaBelzDublin City University
- 499-518
+ 358-377
The biomedical field relies on cost- and time-intensive systematic reviews of papers to enable practitioners to keep up to date with research. Impressive recent advances in large language models (LLMs) have made the task of automating at least part of the systematic review process feasible, but progress is slow. This paper identifies some factors that may have been holding research back, and proposes a new, enhanced dataset and prompting-based method for automatic synthesis generation, the most challenging step for automation. We test different models and types of information from and about biomedical studies for their usefulness in obtaining high-quality results. We find that, surprisingly, inclusion of paper abstracts can worsen results. Instead, study summary information, and system instructions informed by domain knowledge, are key to producing high-quality syntheses.
- 2024.acl-srw.42
+ 2024.acl-srw.42
odoherty-etal-2024-beyond
10.18653/v1/2024.acl-srw.42
@@ -15119,23 +14996,23 @@
HayatoTsukagoshi
RyoheiSasanoNagoya University
KoichiTakedaNagoya University
- 519-530
+ 378-389
Decoder-based large language models (LLMs) have shown high performance on many tasks in natural language processing. This is also true for sentence embedding learning, where a decoder-based model, PromptEOL, has achieved the best performance on semantic textual similarity (STS) tasks. However, PromptEOL requires a manually annotated natural language inference (NLI) dataset for fine-tuning. We aim to improve sentence embeddings without using large manually annotated datasets by automatically generating an NLI dataset with an LLM and using it for fine-tuning of PromptEOL. To achieve this, we explore methods of data generation suitable for sentence embedding learning in this study. Specifically, we will focus on automatic dataset generation through few-shot learning and explore the appropriate methods to leverage few-shot examples. Experimental results on the STS tasks demonstrate that our approach outperforms existing models in settings without large manually annotated datasets.
- 2024.acl-srw.43
+ 2024.acl-srw.43
sato-etal-2024-improving
10.18653/v1/2024.acl-srw.43
Curriculum Learning for Small Code Language Models
- MarwaNa�rNew York University, Abu Dhabi and �cole Nationale Sup�rieure d’Informatique
- KamelYamaniNew York University, Abu Dhabi and Ecole Nationale Sup�rieure d’Informatique (ESI)
+ MarwaNaïrNew York University, Abu Dhabi and Ecole Nationale Supérieure d’Informatique
+ KamelYamaniNew York University, Abu Dhabi and Ecole Nationale Supérieure d’Informatique (ESI)
LyndaLhadjESI
RiyadhBaghdadiNew York University
- 531-542
+ 390-401
Code language models have emerged as useful tools for various programming tasks, yet they often struggle when it comes to complex ones. In this paper, we explore the potential of curriculum learning in enhancing the performance of these models. While prior research has suggested that curriculum learning does not necessarily help in improving the performance of language models, our results surprisingly show that this may not be the case for code language models. We demonstrate that a well-designed curriculum learning approach significantly improves the accuracy of small decoder-only code language models on the task of code execution, while its effect on code completion is less significant. To explore the potential of curriculum learning, we train multiple GPT models with 1 million parameters each to predict the next token and evaluate them on code completion and execution tasks. Our contributions include proposing a novel code difficulty assessment metric by combining software code measures, investigating the effectiveness of curriculum learning for code language models, and introducing a novel curriculum learning schedule that enhances the performance of small decoder-only language models in code execution tasks. The results of this paper open the door for more research on the use of curriculum learning for code language models.
- 2024.acl-srw.44
- nar-etal-2024-curriculum
+ 2024.acl-srw.44
+ nair-etal-2024-curriculum
10.18653/v1/2024.acl-srw.44
@@ -15143,46 +15020,21 @@
DharunishYugeswardeenooAlgoverse
KevinZhuAlgoverse AI Research
SeanO’BrienUniversity of California, San Diego
- 543-554
+ 402-413
Although LLMs have the potential to transform many fields, they still underperform humans in reasoning tasks. Existing methods induce the model to produce step-by-step calculations, but this research explores the question: Does making the LLM analyze the question improve its performance? We propose a novel prompting strategy called Question Analysis Prompting (QAP), in which the model is prompted to explain the question in ’n’ words before solving. The value of ’n’ influences the length of response generated by the model. QAP is evaluated on GPT-3.5 Turbo and GPT-4 Turbo on arithmetic datasets GSM8K, AQuA, and SAT and commonsense dataset StrategyQA. QAP is compared with other state-of-the-art prompts including chain-of-thought (CoT), Plan and Solve Prompting (PS+) and Take A Deep Breath (TADB). QAP outperforms all state-of-the-art prompts on AQuA and SAT datasets on both GPT-3.5 and GPT-4. QAP consistently ranks among the top-2 prompts on 75% of the tests. A key factor of QAP performance can be attributed to response length, where detailed responses are beneficial when answering harder questions, but can negatively affect easy questions.
- 2024.acl-srw.45
+ 2024.acl-srw.45
yugeswardeenoo-etal-2024-question
10.18653/v1/2024.acl-srw.45
-
- An Individualized News Affective Response Dataset
- TianchengHuUniversity of Cambridge
- NigelCollierUniversity of Cambridge
- 555-563
- The rise of sensationalism in news reporting, driven by market saturation and online competition, has compromised news quality and trust. At the core of sensationalism is the evocation of affective responses in the readers. Current NLP approaches to emotion detection often overlook the subjective differences in groups and individuals, relying on aggregation techniques that can obscure nuanced reactions. We introduce a novel large-scale dataset capturing subjective affective responses to news headlines. The dataset includes Facebook post screenshots from popular UK media outlets and uses a comprehensive annotation scheme. Annotators report their affective responses, provide discrete emotion labels, assess relevance to current events, and indicate sharing likelihood. Additionally, we collect demographic, personality, and media consumption data. This ongoing dataset aims to enable more accurate models of affective response by considering individual and contextual factors. This work is ongoing and we highly appreciate any feedback.
- 2024.acl-srw.46
- hu-collier-2024-individualized
- 10.18653/v1/2024.acl-srw.46
-
-
- How Well Do Vision Models Encode Diagram Attributes?
- HarutoYoshida
- KeitoKudo
- YoichiAokiTohoku University
- RyotaTanakaNTT
- ItsumiSaitoTohoku University
- KeisukeSakaguchiTohoku University
- KentaroInuiMohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University
- 564-575
- Research on understanding and generating diagrams has used vision models such as CLIP. However, it remains unclear whether these models accurately identify diagram attributes, such as node colors and shapes, along with edge colors and connection patterns. This study evaluates how well vision models recognize the diagram attributes by probing the model and retrieving diagrams using text queries. Experimental results showed that while vision models can recognize differences in node colors, shapes, and edge colors, they struggle to identify differences in edge connection patterns that play a pivotal role in the semantics of diagrams. Moreover, we revealed inadequate alignment between diagram attributes and language representations in the embedding space.
- 2024.acl-srw.47
- yoshida-etal-2024-well
- 10.18653/v1/2024.acl-srw.47
-
CheckersGPT: Learning World Models through Language Modeling
AbhinavJoshiIndian Institute of Technology, Kanpur
VaibhavSharma
AshutoshModiIIT Kanpur
- 576-588
+ 414-426
Although Large Language Models (LLMs) have been trained using just the next token prediction objective, they have shown impressive performance on various tasks. Consequently, this has attracted research interest. While one line of work in the past has suggested that LLMs learn surface-level statistics from the dataset, another line of work emphasizes that the learned representations are effective for simulating the underlying world model, considering the causal relationship for the next token prediction. This phenomenon is often referred to as the emergence of a world model in sequence prediction tasks. Recent work has demonstrated this phenomenon in a simulated setting of board games like Othello and Chess. In this paper, we analyze the game of Checkers to investigate the emergence of a world model in a language model. By training a GPT-style autoregressive language model using only the next character prediction objective, we find that the model does show a hint of learning a world model representation of the board positions. We perform our analysis on two datasets: 1) a synthetic dataset, which comes from the checkers game tree, and 2) a human gameplay dataset. With multiple models trained with different layer sizes, we find that increasing the parameter size does help learn a better world model representation, as decoded by linear probes.
- 2024.acl-srw.48
+ 2024.acl-srw.48
joshi-etal-2024-checkersgpt
10.18653/v1/2024.acl-srw.48
@@ -15193,24 +15045,13 @@
KatsiarynaHaitsiukevichAalto University
NicolaDaineseAalto University
PekkaMarttinenAalto University
- 589-606
+ 427-444
State of the art Symbolic Regression (SR) methods currently build specialized models, while the application of Large Language Models (LLMs) remains largely unexplored. In this work, we introduce the first comprehensive framework that utilizes LLMs for the task of SR. We propose In-Context Symbolic Regression (ICSR), an SR method which iteratively refines a functional form with an LLM and determines its coefficients with an external optimizer. ICSR leverages LLMs’ strong mathematical prior both to propose an initial set of possible functions given the observations and to refine them based on their errors. Our findings reveal that LLMs are able to successfully find symbolic equations that fit the given data, matching or outperforming the overall performance of the best SR baselines on four popular benchmarks, while yielding simpler equations with better out of distribution generalization.
- 2024.acl-srw.49
+ 2024.acl-srw.49
merler-etal-2024-context
10.18653/v1/2024.acl-srw.49
-
- STEP: Staged Parameter-Efficient Pre-training for Large Language Models
- KazukiYano
- TakumiItoLangsmith Inc., Tohoku University and Machine Learning Solutions
- JunSuzukiTohoku University
- 607-614
- Pre-training large language models faces significant memory challenges due to the large size of model weights.We propose STaged parameter-Efficient Pre-training (STEP), which combines ideas from parameter-efficient tuning and staged training. We conduct experiments on pre-training models of various sizes and demonstrate that STEP can achieve up to a 40.4% reduction in maximum memory requirement compared to vanilla pre-training while maintaining comparable performance.
- 2024.acl-srw.50
- yano-etal-2024-step
- 10.18653/v1/2024.acl-srw.50
-
@@ -15222,11 +15063,11 @@
Bangkok, Thailand
August
2024
- 2024.acl-tutorials
+ 2024.acl-tutorials
acl
- 2024.acl-tutorials.0
+ 2024.acl-tutorials.0
acl-2024-tutorials
@@ -15237,8 +15078,8 @@
JixingLi
Marie-FrancineMoens
1-2
- Computational linguistics (CL) has witnessed tremendous advancementsin recent years, with models such as large language models demonstratingexceptional performance in various natural language processing tasks. Theseadvancements highlight their potential to help understand brain languageprocessing, especially through the lens of brain encoding and decoding.Brain encoding involves the mapping of linguistic stimuli to brain activity,while brain decoding is the process of reconstructing linguistic stimulifrom observed brain activities. CL models that excel at capturing andmanipulating linguistic features are crucial for mapping linguistic stimulito brain activities and vice versa. Brain encoding and decoding have vastapplications, from enhancing human-computer interaction to developingassistive technologies for individuals with communication impairments. Thistutorial will focus on elucidating how computational linguistics canfacilitate brain encoding and decoding. We will delve into the principlesand practices of using computational linguistics methods for brain encodingand decoding. We will also discuss the challenges and future directions ofbrain encoding and decoding. Through this tutorial, we aim to provide acomprehensive and informative overview of the intersection betweencomputational linguistics and cognitive neuroscience, inspiring futureresearch in this exciting and rapidly evolving field.
- 2024.acl-tutorials.1
+ Computational linguistics (CL) has witnessed tremendous advancements in recent years, with models such as large language models demonstrating exceptional performance in various natural language processing tasks. These advancements highlight their potential to help understand brain language processing, especially through the lens of brain encoding and decoding. Brain encoding involves the mapping of linguistic stimuli to brain activity, while brain decoding is the process of reconstructing linguistic stimuli from observed brain activities. CL models that excel at capturing and manipulating linguistic features are crucial for mapping linguistic stimuli to brain activities and vice versa. Brain encoding and decoding have vast applications, from enhancing human-computer interaction to developing assistive technologies for individuals with communication impairments. This tutorial will focus on elucidating how computational linguistics can facilitate brain encoding and decoding. We will delve into the principles and practices of using computational linguistics methods for brain encoding and decoding. We will also discuss the challenges and future directions of brain encoding and decoding. Through this tutorial, we aim to provide a comprehensive and informative overview of the intersection between computational linguistics and cognitive neuroscience, inspiring future research in this exciting and rapidly evolving field.
+ 2024.acl-tutorials.1
sun-etal-2024-computational
10.18653/v1/2024.acl-tutorials.1
@@ -15249,8 +15090,8 @@
ClaireGardent
WeiXu
3-4
- In this tutorial, we focus on text-to-text generation, a class ofnatural language generation (NLG) tasks, that takes a piece of text as inputand then generates a revision that is improved according to some specificcriteria (e.g., readability or linguistic styles), while largely retainingthe original meaning and the length of the text. This includes many usefulapplications, such as text simplification, paraphrase generation, styletransfer, etc. In contrast to text summarization and open-ended textcompletion (e.g., story), the text-to-text generation tasks we discuss inthis tutorial are more constrained in terms of semantic consistency andtargeted language styles. This level of control makes these tasks idealtestbeds for studying the ability of models to generate text that is bothsemantically adequate and stylistically appropriate. Moreover, these tasksare interesting from a technical standpoint, as they require complexcombinations of lexical and syntactical transformations, stylistic control,and adherence to factual knowledge, – all at once. With a special focus ontext simplification and revision, this tutorial aims to provide an overviewof the state-of-the-art natural language generation research from four majoraspects – Data, Models, Human-AI Collaboration, and Evaluation – and todiscuss and showcase a few significant and recent advances: (1) the use ofnon-retrogressive approaches; (2) the shift from fine-tuning to promptingwith large language models; (3) the development of new learnable metric andfine-grained human evaluation framework; (4) a growing body of studies anddatasets on non-English languages; (5) the rise of HCI+NLP+Accessibilityinterdisciplinary research to create real-world writing assistant systems.
- 2024.acl-tutorials.2
+ In this tutorial, we focus on text-to-text generation, a class of natural language generation (NLG) tasks, that takes a piece of text as input and then generates a revision that is improved according to some specific criteria (e.g., readability or linguistic styles), while largely retaining the original meaning and the length of the text. This includes many useful applications, such as text simplification, paraphrase generation, style transfer, etc. In contrast to text summarization and open-ended text completion (e.g., story), the text-to-text generation tasks we discuss in this tutorial are more constrained in terms of semantic consistency and targeted language styles. This level of control makes these tasks ideal testbeds for studying the ability of models to generate text that is both semantically adequate and stylistically appropriate. Moreover, these tasks are interesting from a technical standpoint, as they require complex combinations of lexical and syntactical transformations, stylistic control, and adherence to factual knowledge, – all at once. With a special focus on text simplification and revision, this tutorial aims to provide an overview of the state-of-the-art natural language generation research from four major aspects – Data, Models, Human-AI Collaboration, and Evaluation – and to discuss and showcase a few significant and recent advances: (1) the use of non-retrogressive approaches; (2) the shift from fine-tuning to prompting with large language models; (3) the development of new learnable metric and fine-grained human evaluation framework; (4) a growing body of studies and datasets on non-English languages; (5) the rise of HCI+NLP+Accessibility interdisciplinary research to create real-world writing assistant systems.
+ 2024.acl-tutorials.2
dou-etal-2024-automatic
10.18653/v1/2024.acl-tutorials.2
@@ -15260,19 +15101,19 @@
RyanCotterell
AnejSvete
5-5
- Language models (LMs) are currently at the forefront of NLP researchdue to their remarkable versatility across diverse tasks. However, a largegap exists between their observed capabilities and the explanations proposedby established formal machinery. To motivate a better theoreticalcharacterization of LMs’ abilities and limitations, this tutorial aims toprovide a comprehensive introduction to a specific framework for formalanalysis of modern LMs using tools from formal language theory (FLT). Wepresent how tools from FLT can be useful in understanding the inner workingsand predicting the capabilities of modern neural LM architectures. We willcover recent results using FLT to make precise and practically relevantstatements about LMs based on recurrent neural networks and transformers byrelating them to formal devices such as finite-state automata, Turingmachines, and analog circuits. Altogether, the results covered in thistutorial will allow us to make precise statements and explanations about theobserved as well as predicted behaviors of LMs, as well as providetheoretically motivated suggestions on the aspects of the architectures thatcould be improved.
- 2024.acl-tutorials.3
+ Language models (LMs) are currently at the forefront of NLP research due to their remarkable versatility across diverse tasks. However, a large gap exists between their observed capabilities and the explanations proposed by established formal machinery. To motivate a better theoretical characterization of LMs’ abilities and limitations, this tutorial aims to provide a comprehensive introduction to a specific framework for formal analysis of modern LMs using tools from formal language theory (FLT). We present how tools from FLT can be useful in understanding the inner workings and predicting the capabilities of modern neural LM architectures. We will cover recent results using FLT to make precise and practically relevant statements about LMs based on recurrent neural networks and transformers by relating them to formal devices such as finite-state automata, Turing machines, and analog circuits. Altogether, the results covered in this tutorial will allow us to make precise statements and explanations about the observed as well as predicted behaviors of LMs, as well as provide theoretically motivated suggestions on the aspects of the architectures that could be improved.
+ 2024.acl-tutorials.3
butoi-etal-2024-computational
10.18653/v1/2024.acl-tutorials.3
- Presentation Matters: How to Communicate Science in the NLP Venues and in the Wild?
+ Presentation Matters: How to Communicate Science in the NLP Venues and in the Wild
SarvnazKarimi
CecileParis
GholamrezaHaffari
6-7
Each year a large number of early career researchers join the NLP/Computational Linguistics community, with most starting by presenting their research in the *ACL conferences and workshops. While writing a paper that has made it to these venues is one important step, communicating the outcome is equally important and sets the path to the impact of the research. In addition, not all PhD candidates get the chance to be trained in presentation skills. Research methods courses are not all of the same quality and may not cover scientific communication, and certainly not all are tailored to the NLP community. We are proposing an introductory tutorial that covers a range of different communication skills, including writing, oral presentation (posters and demos), and social media presence. This fills a gap for researchers who may not have access to research methods courses or other mentors who could help them acquire such skills. The interactive nature of such a tutorial would allow attendees to ask questions and request clarifications, which would not be possible from reading materials alone.
- 2024.acl-tutorials.4
+ 2024.acl-tutorials.4
karimi-etal-2024-presentation
10.18653/v1/2024.acl-tutorials.4
@@ -15280,25 +15121,25 @@
Vulnerabilities of Large Language Models to Adversarial Attacks
YuFu
ErfanShayegan
- Md.Mamun Al Abdullah
+ Md. Mamun AlAbdullah
PedramZaree
NaelAbu-Ghazaleh
YueDong
8-9
This tutorial serves as a comprehensive guide on the vulnerabilities of Large Language Models (LLMs) to adversarial attacks, an interdisciplinary field that blends perspectives from Natural Language Processing (NLP) and Cybersecurity. As LLMs become more complex and integrated into various systems, understanding their security attributes is crucial. However, current research indicates that even safety-aligned models are not impervious to adversarial attacks that can result in incorrect or harmful outputs. The tutorial first lays the foundation by explaining safety-aligned LLMs and concepts in cybersecurity. It then categorizes existing research based on different types of learning architectures and attack methods. We highlight the existing vulnerabilities of unimodal LLMs, multi-modal LLMs, and systems that integrate LLMs, focusing on adversarial attacks designed to exploit weaknesses and mislead AI systems. Finally, the tutorial delves into the potential causes of these vulnerabilities and discusses potential defense mechanisms.
- 2024.acl-tutorials.5
+ 2024.acl-tutorials.5
fu-etal-2024-vulnerabilities
10.18653/v1/2024.acl-tutorials.5
- Detecting Machine-Generated Text: Techniques and Challenges
- LiGao
- WenhanXiong
- TaewooKim
+ Watermarking for Large Language Models
+ XuandongZhao
+ Yu-XiangWang
+ LeiLi
10-11
- As AI-generated text increasingly resembles human-written content, the ability to detect machine-generated text becomes crucial in many applications. This tutorial aims to provide a comprehensive overview of text detection techniques, focusing on machine-generated text and deepfakes. We will discuss various methods for distinguishing between human-written and machine-generated text, including statistical methods, neural network-based techniques, and hybrid approaches. The tutorial will also cover the challenges in the detection process, such as dealing with evolving models and maintaining robustness against adversarial attacks. By the end of the session, attendees will have a solid understanding of current techniques and future directions in the field of text detection.
- 2024.acl-tutorials.6
- gao-etal-2024-detecting
+ As AI-generated text increasingly resembles human-written content, the ability to detect machine-generated text becomes crucial in both the computational linguistics and machine learning communities. In this tutorial, we aim to provide an in-depth exploration of text watermarking, a subfield of linguistic steganography with the goal of embedding a hidden message (the watermark) within a text passage. We will introduce the fundamentals of text watermarking, discuss the main challenges in identifying AI-generated text, and delve into the current watermarking methods, assessing their strengths and weaknesses. Moreover, we will explore other possible applications of text watermarking and discuss future directions for this field. Each section will be supplemented with examples and key takeaways.
+ 2024.acl-tutorials.6
+ zhao-etal-2024-watermarking
10.18653/v1/2024.acl-tutorials.6
diff --git a/data/xml/2024.clib.xml b/data/xml/2024.clib.xml
new file mode 100644
index 0000000000..ec3ec4e743
--- /dev/null
+++ b/data/xml/2024.clib.xml
@@ -0,0 +1,355 @@
+
+
+
+
+ Proceedings of the Sixth International Conference on Computational Linguistics in Bulgaria (CLIB 2024)
+ Department of Computational Linguistics, Institute for Bulgarian Language, Bulgarian Academy of Sciences
+ Sofia, Bulgaria
+ September
+ 2024
+ 2024.clib-1
+ clib
+
+
+ 344
+ 2024.clib-1.0
+ clib-2024-1
+
+
+ A Cross-model Study on Learning Romanian Parts of Speech with Transformer Models
+ RaduIon
+ VerginicaBarbu Mititelu
+ VasilePăiş
+ ElenaIrimia
+ ValentinBadea
+ 6–13
+ This paper attempts to determine experimentally whether POS tagging of unseen words achieves accuracy comparable to that for words that were rarely seen in the training set (i.e. frequency less than 5) or seen more frequently (i.e. frequency greater than 10). To compare accuracies objectively, we use the odds ratio statistic and its confidence interval testing to show that the odds of being correct on unseen words are close to the odds of being correct on rarely seen words. For the training of the POS taggers, we use different Romanian BERT models that are freely available on HuggingFace.
+ 2024.clib-1.1
+ ion-etal-2024-cross
+
+
+ What do BERT Word Embeddings Learn about the French Language?
+ EkaterinaGoliakova
+ DavidLanglois
+ 14–32
+ Pre-trained word embeddings (for example, BERT-like) have been successfully used in a variety of downstream tasks. However, do all embeddings obtained from models of the same architecture encode information in the same way? Does the size of the model correlate with the quality of the information encoding? In this paper, we will attempt to dissect the dimensions of several BERT-like models that were trained on the French language to find where grammatical information (gender, plurality, part of speech) and semantic features might be encoded. In addition to this, we propose a framework for comparing the quality of encoding in different models.
+ 2024.clib-1.2
+ goliakova-langlois-2024-bert
+
+
+ Whisper–TAD: A General Model for Transcription, Alignment and Diarization of Speech
+ CamilleLavigne
+ AlexStasica
+ 33–38
+ Currently, there is a lack of a straightforward implementation of diarization-augmented speech transcription (DAST), i.e. implementation of transcription, diarization and alignment to the audio within one model. These tasks typically require distinct models, necessitating stacking them together for complete processing. In this study, we advocate for leveraging the advanced capabilities of the Whisper models, which already excel in automatic transcription and partial alignment. Our approach involves fine-tuning the model’s parameters on both transcription and diarization tasks in a SOT-FIFO (Serialized Output Training-First In First Out) manner. This comprehensive framework facilitates the creation of orthographic transcriptions, identification of speakers, and precise alignment, thus enhancing the efficiency of audio processing workflows. While our work represents an initial step towards a unified transcription and diarization framework, the development of such a model demands substantial high-quality data augmentation and computational resources beyond our current scope. Consequently, our focus is narrowed to the English language. Despite these limitations, our method demonstrates promising performance in both transcription and diarization tasks. Comparative analysis between pre-trained models and fine-tuned TAD (Transcription, Alignment, Diarization) versions suggests that incorporating diarization into a Whisper model doesn’t compromise transcription accuracy. Our findings hint that deploying our TAD framework on the largest Whisper model could potentially yield state-of-the-art performance across all mentioned tasks.
+ 2024.clib-1.3
+ lavigne-stasica-2024-whisper
+
+
+ Contemporary LLMs and Literary Abridgement: An Analytical Inquiry
+ IglikaNikolova-Stoupak
+ GaëlLejeune
+ EvaSchaeffer-Lacroix
+ 39–57
+ Within the framework of this study, several contemporary Large Language Models (ChatGPT, Gemini Pro, Mistral-Instruct and BgGPT) are evaluated in relation to their ability to generate abridged versions of literary texts. The analysis is based on ’The Ugly Duckling’ by H. C. Andersen as translated into English, French and Bulgarian. The different scenarios of abridgement experimented with include zero-shot, one-shot, division into chunks and crosslingual (including chain-of-thought) abridgement. The resulting texts are evaluated both automatically and by human evaluators. The automatic analysis includes ROUGE and BERTScore as well as the ratios of a selection of readability-related textual features (e.g. number of words, type-to-token ratio) as pertaining to the original versus automatically abridged texts. Professionally composed abridged versions are regarded as the gold standard. Following the automatic analysis, the six best candidate texts per language are then evaluated by volunteers with university education in terms of textual characteristics of a more qualitative nature, such as coherence, consistency and aesthetic appeal.
+ 2024.clib-1.4
+ nikolova-stoupak-etal-2024-contemporary
+
+
+ Advancing Sentiment Analysis in Serbian Literature: A Zero and Few–Shot Learning Approach Using the Mistral Model
+ Milica IkonićNešić
+ SašaPetalinkar
+ MihailoŠkorić
+ RankaStanković
+ BiljanaRujević
+ 58–70
+ This study presents a sentiment analysis of old Serbian novels from the 1840–1920 period, employing the Mistral Large Language Model (LLM) to pioneer zero- and few-shot learning techniques. The main approach innovates by devising research prompts that include guidance text for zero-shot classification and examples for few-shot learning, enabling the LLM to classify sentiments into positive, negative, or objective categories. This methodology aims to streamline sentiment analysis by limiting responses, thereby enhancing classification precision. Python, along with the Hugging Face Transformers and LangChain libraries, serves as our technological backbone, facilitating the creation and refinement of research prompts tailored for sentence-level sentiment analysis. The results in both scenarios, zero-shot and few-shot, indicate that the zero-shot approach performs better, achieving an accuracy of 68.2%.
+ 2024.clib-1.5
+ nesic-etal-2024-advancing
+
+
+ Generating Phonetic Embeddings for Bulgarian Words with Neural Networks
+ LyuboslavKarev
+ IvanKoychev
+ 71–79
+ Word embeddings can be considered the cornerstone of modern natural language processing. They are used in many NLP tasks and allow us to create models that can understand the meaning of words. Most word embeddings model the semantics of the words. In this paper, we create phoneme-based word embeddings, which model how a word sounds. This is accomplished by training a neural network that can automatically generate transcriptions of Bulgarian words. We used the Jaccard index and direct comparison metrics to measure the performance of the neural networks. The models perform nearly perfectly on the task of generating transcriptions. The model’s word embeddings offer versatility across various applications, most notably automatic paronym detection and detection of the language of origin of a Bulgarian word. The performance of this paronym detection is measured with the standard classifier metrics: accuracy, precision, recall, and F1.
+ 2024.clib-1.6
+ karev-koychev-2024-generating
+
+
+ Universal Dependencies Treebank for Standard Albanian: A New Approach
+ NeldaKote
+ RozanaRushiti
+ AnilaÇepani
+ AlbaHaveriku
+ EvisTrandafili
+ Elinda KajoMeçe
+ Elsa SkënderiRakipllari
+ LinditaXhanari
+ AlbanaDeda
+ 80–89
+ In this paper, we present a Universal Dependencies (UD) treebank for the Standard Albanian Language (SAL), annotated by expert linguists supported by information technology professionals. The annotated treebank consists of 24,537 tokens (1,400 sentences) and includes annotation for syntactic dependencies, part-of-speech tags, morphological features, and lemmas. This treebank represents the largest UD treebank available for SAL. In order to overcome annotation challenges in SAL within the UD framework, we carefully balanced preserving the richness of SAL grammar with adapting the UD tagset and addressing unique language-specific features for a unified annotation. We discuss the criteria followed to select the sentences included in the treebank and address the most significant linguistic considerations when adapting the UD framework to conform to the grammar of SAL. Our efforts contribute to the advancement of linguistic analyses and Natural Language Processing (NLP) for SAL. The treebank will be made available online under an open license so as to enable further development of NLP tools based on Artificial Intelligence (AI) models for the Albanian language.
+ 2024.clib-1.7
+ kote-etal-2024-universal
+
+
+ Function Multiword Expressions Annotated with Discourse Relations in the Romanian Reference Treebank
+ VerginicaBarbu Mititelu
+ TudorVoicu
+ 90–97
+ For the Romanian Reference Treebank, a general language corpus covering several genres and annotated according to the principles of Universal Dependencies, we present here the annotation of some function words, namely multiword conjunctions, with discourse relations from the Penn Discourse Treebank version 3.0 inventory of such relations. The annotation process was manual, with two annotators for each occurrence of the conjunctions. Lexical-semantic relations such as synonymy and polysemy can be established between the senses of such conjunctions. The discourse relations are added to the CoNLL-U file in which the treebank is represented.
+ 2024.clib-1.8
+ barbu-mititelu-voicu-2024-function
+
+
+ Dependency Parser for Bulgarian
+ AtanasAtanasov
+ 98–105
+ This paper delves into the implementation of a Biaffine Attention Model, a sophisticated neural network architecture employed for dependency parsing tasks. Proposed by Dozat and Manning, this model is applied to Bulgarian language processing. The model’s training and evaluation are conducted using the Bulgarian Universal Dependencies dataset. The paper offers a comprehensive explanation of the model’s architecture and the data preparation process, aiming to demonstrate that for highly inflected languages, the inclusion of two additional input layers – lemmas and language-specific morphological information – is beneficial. The results of the experiments are subsequently presented and discussed. The paper concludes with a reflection on the model’s performance and suggestions for potential future work.
+ 2024.clib-1.9
+ atanasov-2024-dependency
+
+
+ Towards a Romanian Phrasal Academic Lexicon
+ MadalinaChitez
+ Ana-MariaBucur
+ AndreeaDinca
+ RoxanaRogobete
+ 106–112
+ The lack of NLP-based research on academic writing in Romania results in an unbalanced development of automatic support tools in Romanian compared to other languages, such as English. For this study, we use Romanian subsets of two bilingual academic writing corpora: the ROGER corpus, consisting of university student papers, and the EXPRES corpus, composed of expert research articles. Working with the Romanian Academic Word List / RoAWL, we present two phrase extraction phases: (i) using RoAWL words as node words to extract collocations according to the thresholds of statistical measures and (ii) classifying extracted phrases into general versus domain-specific multi-word units. We show how manual rhetorical function annotation of resulting phrases can be combined with automatic function detection. The comparison between academic phrases in ROGER and EXPRES validates the final phrase list. The Romanian phrasal academic lexicon (ROPAL), similar to the Oxford Phrasal Academic Lexicon (OPAL), is a written academic phrase lexicon for the Romanian language, made available for academic use and further research or applications.
+ 2024.clib-1.10
+ chitez-etal-2024-towards-romanian
+
+
+ Classifying Multi–Word Expressions in the Latvian Monolingual Electronic Dictionary Tēzaurs.lv
+ LauraRituma
+ GuntaNešpore-Bērzkalne
+ AguteKlints
+ IlzeLokmane
+ MadaraStāde
+ PēterisPaikens
+ 113–118
+ The electronic dictionary Tēzaurs.lv contains more than 400,000 entries, of which 73,000 are multi-word expressions (MWEs). Over the past two years, there has been an ongoing division of these MWEs into subgroups (proper names, multi-word terms, taxa, phraseological units, collocations). The article describes the classification of MWEs, focusing on phraseological units (approximately 7,250 entries), as well as on borderline cases of phraseological unit types (phrasemes and idioms) and different MWE groups in general. The division of phraseological units depends on semantic divisibility and figurativeness. In a phraseme, at least one of the constituents retains its literal sense, whereas the meaning of an idiom is not dependent on the literal sense of any of its constituents. As a result, 65,919 MWE entries have been manually classified, and this information on MWE type is now available to the users of the electronic dictionary Tēzaurs.lv.
+ 2024.clib-1.11
+ rituma-etal-2024-classifying
+
+
+ Complex Word Identification for Italian Language: A Dictionary–based Approach
+ LauraOcchipinti
+ 119–129
+ Assessing word complexity in Italian poses significant challenges, particularly due to the absence of a standardized dataset. This study introduces the first automatic model designed to identify word complexity for native Italian speakers. A dictionary of simple and complex words was constructed, and various configurations of linguistic features were explored to find the best statistical classifier based on the Random Forest algorithm. Considering the probability of a word belonging to a class, we compared the models’ predictions with human assessments derived from a dataset annotated for complexity perception. Finally, the degree of accord between the model predictions and the human inter-annotator agreement was analyzed using Spearman correlation. Our findings indicate that a model incorporating both linguistic features and word embeddings performed better than other simpler models, also showing a correlation with human judgements similar to the inter-annotator agreement. This study demonstrates the feasibility of an automatic system for detecting complexity in the Italian language with good performance and effectiveness comparable to humans on this subjective task.
+ 2024.clib-1.12
+ occhipinti-2024-complex
+
+
+ Verbal Multiword Expressions in the Croatian Verb Lexicon
+ IvanaBrač
+ MateaBirtić
+ 130–139
+ The paper examines the complexities of encoding verbal multiword expressions in the Croatian verb lexicon. The lexicon incorporates a verb’s description at the syntactic, morphological, and semantic levels. This study explores the treatment of reflexive verbs, light verb constructions, and verbal idioms across several Croatian and Slavic language resources to find the best solution for the verb lexicon. It addresses the following research questions: 1. How should reflexive verbs, i.e., verbs with the reflexive marker se, be treated? Should they be considered as separate lemmas, sublemmas of non-reflexive counterparts, or as one of their senses? 2. What syntactic label and semantic role should be assigned to a predicative noun in light verb constructions? 3. Should verbal idioms be included, and, if so, at which level of description? Our conclusion is that all reflexive verbs should be treated as separate lemmas since they are distinct lexemes that have undergone semantic and syntactic change. To differentiate between a semantically full verb and a light verb, we have introduced the label LV and decided not to assign a semantic role to a predicative noun. Including verbal idioms and their English translations makes the lexicon useful to non-native users as well. The aim is to enhance the verb lexicon for the more effective description and recognition of verbal multiword expressions.
+ 2024.clib-1.13
+ brac-birtic-2024-verbal
+
+
+ Assessing Reading Literacy of Bulgarian Pupils with Finger–tracking
+ AlessandroLento
+ AndreaNadalini
+ MarcelloFerro
+ ClaudiaMarzi
+ VitoPirrelli
+ TsvetanaDimitrova
+ HristinaKukova
+ ValentinaStefanova
+ MariaTodorova
+ SvetlaKoeva
+ 140–149
+ The paper reports on the first steps in developing a time-stamped multimodal dataset of reading data by Bulgarian children. Data are being collected, structured and analysed by means of ReadLet, an innovative infrastructure for multimodal language data collection that uses a tablet as a reader’s front-end. The overall goal of the project is to quantitatively analyse the reading skills of a sample of early Bulgarian readers collected over a two-year period, and compare them with the reading data of early readers of Italian, collected using the same protocol. We illustrate design issues of the experimental protocol, as well as the data acquisition process and the post-processing phase of data annotation/augmentation. To evaluate the potential and usefulness of the Bulgarian dataset for reading research, we present some preliminary statistical analyses of our recently collected data. They show robust convergence trends between Bulgarian and Italian early reading development stages.
+ 2024.clib-1.14
+ lento-etal-2024-assessing
+
+
+ Educational Horizons: Mapping the Terrain of Artificial Intelligence Integration in Bulgarian Educational Settings
+ DenitzaKurshumova
+ 150–156
+ The role of artificial intelligence in education (AIEd) has recently become a major topic of discussion and future planning. This article presents data from a large-scale survey involving 1463 Bulgarian educators in primary, secondary, and high schools. The results revealed that 70.30% of the teachers were familiar with or somewhat familiar with the existence of AI applications. Chatbots were the most popular among the surveyed teachers, with ChatGPT ranking as the most familiar. The teachers were almost equally split between those who reported use and those who declared nonuse of AI technology for instructional purposes. A significant association was found between the teachers’ familiarity with and use of AI technology and their age-related generational traits. The younger educators (up to 40 years of age) were associated with higher use of AI technology as a support tool for creating lesson plans, lesson content, tests, and exams. The outlined tendencies can be used to inform policy, professional development, and future research in the realm of AI-driven education.
+ 2024.clib-1.15
+ kurshumova-2024-educational
+
+
+ Evidential Auxiliaries as Non–reliability Markers in Bulgarian Parliamentary Speech
+ EkaterinaTarpomanova
+ 157–165
+ In the evidentiality system of Bulgarian, there are three evidential auxiliaries that form complex verbal forms. The paper analyzes their potential to mark non-reliability in political discourse by using the ParlaMint-BG corpus of parliamentary debates. The method of the study includes detection, categorisation and context analysis of the evidentials formed with auxiliaries. The results show that the evidential auxiliaries function as markers of non-reliability, especially in argumentative text types such as political discourse.
+ 2024.clib-1.16
+ tarpomanova-2024-evidential
+
+
+ Extended Context at the Introduction of Complex Vocabulary in Abridged Literary Texts
+ IglikaNikolova-Stoupak
+ EvaSchaeffer-Lacroix
+ GaëlLejeune
+ 166–177
+ Psycholinguistics speaks of a fine-tuning process used by parents as they address children, in which complex vocabulary is introduced with additional context (Leung et al., 2021). This somewhat counterintuitive lengthening of text in order to aid one’s interlocutor in the process of language acquisition also accords with Harris’s (1988) notion that for every complex sentence, there is an equivalent longer (non-contracted) yet simpler one that contains the same amount of information. Within the proposed work, a corpus of eight renowned literary works (e.g. Alice’s Adventures in Wonderland, The Adventures of Tom Sawyer, Les Misérables) in four distinct languages (English, French, Russian and Spanish) is gathered: both the original (or translated) versions and up to four abridged versions for various audiences (e.g. children of a defined age or foreign language learners of a defined level) are present. The contexts of the first appearance of complex words (as determined based on word frequency) in pairs of original and abridged works are compared, and the cases in which the abridged texts offer longer context are investigated. The discovered transformations are subsequently classified into three separate categories: addition of vocabulary items from the same lexical field as the complex word, simplification of grammar and insertion of a definition. Context extensions are then statistically analysed as associated with different languages and reader audiences.
+ 2024.clib-1.17
+ nikolova-stoupak-etal-2024-extended
+
+
+ Corpus-based Research into Derivational Morphology: A Comparative Study of Japanese and English Verbalization
+ JunyaMorita
+ 178–186
+ As part of elucidating the syntax-morphology interaction, this study investigates where and how complex verbs are formed in Japanese and English. Focusing on the Japanese verb-forming suffix -ka-suru (e.g. toshi-o gendai-ka-suru ‘modernize city’), relevant verbs are extracted from a large-scale corpus and they receive an in-depth analysis from semantic, morphosyntactic, and functional viewpoints. The properties of -ka-suru and those of its English counterpart are then compared and contrasted. The result reveals three main points: (i) -ka-suru verbs are constantly created in syntactic settings to fulfill the functions of brevity and conceptualization, (ii) while denominal -ize derivatives have several submeanings such as ‘result,’ ‘ornative,’ and ‘agentive,’ -ka-suru equivalents retain the meaning ‘result,’ and (iii) -ka-suru can be combined with compound nouns, but -ize cannot. We will demonstrate that the above features originate in the underlying syntactic structure related to each suffix and their difference, thus supporting the thesis of syntactic word formation. (1) ji-kokumin-o moomai-ka-suru one’s-people-ACC ignorant-change-do ‘make one’s people ignorant’ (2) shinikaketa momiji-o bonsai-ka-suru dying maple-ACC bonsai-change-do ‘turn a dying maple into a bonsai’
+ 2024.clib-1.18
+ morita-2024-corpus
+
+
+ The Verbal Category of Conditionality in Bulgarian and its Ukrainian Correspondences
+ IvanDerzhanski
+ OlenaSiruk
+ 187–195
+ Modern Bulgarian shares a conditional mood with the other Slavic languages, but it has also developed a future-in-the-past tense that is structurally analogous to the category traditionally called a conditional mood in the grammars of many Western European languages. The distinction between these two forms is sometimes elusive and can be difficult for native speakers of Slavic languages who are learning Bulgarian. In this paper we consider the uses of the Bulgarian conditional mood and future-in-the-past tense in a parallel corpus of Bulgarian and Ukrainian text, examining the corresponding wording in Ukrainian, where the conditional mood is supplemented by modal verbs, and discuss the breadth of choices open to translators when working in each direction.
+ 2024.clib-1.19
+ derzhanski-siruk-2024-verbal
+
+
+ Lexical Richness of French and Quebec Journalistic Texts
+ NataliaDankova
+ 196–200
+ This paper presents some results of a quantitative study that focuses on the variety and word frequency in texts from a comparative perspective. The study aims to analyze and compare French and Quebec journalistic texts on political and cultural topics written in French and recently published in major newspapers such as Le Monde, Le Figaro, Le Devoir, etc. The statistical analysis concerns the number of different words in the text, the number of different adjectives, the number of different verbs (and also passive structures, participles and gerunds which contribute to syntactic and stylistic sophistication), and the number of hapaxes. French texts from France exhibit greater lexical richness and sophistication: they contain more adjectives, a greater variety of adjectives, as well as more participles and gerunds compared to French texts from Quebec. The originality of the study lies in the fact that it analyzes variation in French using a lexicometric approach.
+ 2024.clib-1.20
+ dankova-2024-lexical
+
+
+ A Corpus of Liturgical Texts in German: Towards Multilevel Text Annotation
+ MariaKhokhlova
+ MikhailKoryshev
+ 201–205
+ The aim of the study is to create a “documented” literary and theological history of German Catholic hymnography. The paper focuses on the creation of a corpus of liturgical texts in German and describes the first stage of annotation dealing with the metatextual markup of Catholic hymns. The authors dwell in detail on the parameters of the multi-level classification of hymn texts they developed, which allows them to differentiate hymns on different grounds. The parameters include not only characteristics that represent hymns (the period and the source of their origin, rubrics, musical accompaniment), but also ones that are inherent to strophes. Based on the created markup, it is possible to trace general trends in texts divided according to certain meta-features. The developed annotation scheme is illustrated with the example of the hymnbook Gotteslob (1975). The results present statistics on different parameters used for hymn description.
+ 2024.clib-1.21
+ khokhlova-koryshev-2024-corpus
+
+
+ EurLexSummarization – A New Text Summarization Dataset on EU Legislation in 24 Languages with GPT Evaluation
+ ValentinZmiycharov
+ TodorTsonkov
+ IvanKoychev
+ 206–213
+ Legal documents are notorious for their length and complexity, making it challenging to extract crucial information efficiently. In this paper, we introduce a new dataset for legal text summarization, covering 24 languages. We not only present and analyze the dataset but also conduct experiments using various extractive techniques. We provide a comparison between these techniques and summaries generated by the state-of-the-art GPT models. The abstractive GPT approach outperforms the extractive TextRank approach in 8 languages, but produces slightly lower results in the remaining 16 languages. This research aims to advance the field of legal document summarization by addressing the need for accessible and comprehensive information retrieval from lengthy legal texts.
+ 2024.clib-1.22
+ zmiycharov-etal-2024-eurlexsummarization
+
+
+ On a Hurtlex Resource for Bulgarian
+ PetyaOsenova
+ 214–219
+ The paper reports on the cleaning of the Hurtlex lexicon for Bulgarian as part of the multilingual Hurtlex resource. All the challenges during the cleaning process are presented, such as: deleting strings or lexica that are clear errors from the automatic translation, establishing criteria for keeping or discarding a lexeme based on its meaning and potential usages, contextualizing the lexeme with the meaning through an example, etc. In addition, the paper discusses the mapping of the offensive lexica to the BTB-Wordnet as well as the system that has been used.
+ 2024.clib-1.23
+ osenova-2024-hurtlex
+
+
+ Unified Annotation of the Stages of the Bulgarian Language. First Steps
+ FabioMaion
+ TsvetanaDimitrova
+ AndrejBojadziev
+ 220–226
+ The paper reports on ongoing work towards guidelines for the unified annotation of the stages in the development of the Bulgarian language from the Middle Ages to the early modern period. It discusses the criteria for the selection of texts and their representation, along with some results of trial tagging with an existing tagger already trained on other texts.
+ 2024.clib-1.24
+ maion-etal-2024-unified
+
+
+ ChatGPT: Detection of Spanish Terms Based on False Friends
+ AmalHaddad Haddad
+ DamithPremasiri
+ 227–240
+ One of the common errors which translators commit when transferring terms from one language into another is erroneously coining terms which are based on a false friend mistake due to the similarity between lexical units forming part of terms. In this case study, we use ChatGPT to automatically detect terms in Spanish which may be coined based on a false friend relation. To carry out this study, we implemented two experiments with GPT and compared the results. In the first, we prompted GPT to produce a list of twenty terms in Spanish extracted from the UN discourse, which are possibly based on a false friend relation, together with their English equivalents, and analysed the veracity of the results. In the second experiment, we used an aligned corpus to further study the capabilities of the Language Model in detecting false friends in English and Spanish text. Some results were significant for future terminological studies.
+ 2024.clib-1.25
+ haddad-haddad-premasiri-2024-chatgpt
+
+
+ Deep Learning Framework for Identifying Future Market Opportunities from Textual User Reviews
+ JordanKralev
+ 241–248
+ The paper develops an application of design gap theory for identifying future market segment growth and capitalization from a set of customer reviews of products bought in a given market over a past period. To build a consumer feature space, an encoder-decoder network with attention is trained over the textual reviews after they are pre-processed through tokenization and embedding layers. The encodings for product reviews are used to train a variational autoencoder network for representation of a product feature space. The sampling capabilities of this network are extended with a function to look for innovative designs with high consumer preferences, characterizing future opportunities in a given market segment. The framework is demonstrated by processing Amazon reviews in the consumer electronics segment.
+ 2024.clib-1.26
+ kralev-2024-deep
+
+
+ Look Who’s Talking: The Most Frequently Used Words in the Bulgarian Parliament 1990-2024
+ RuslanaMargova
+ BastiaanBruinsma
+ 249–256
+ In this study we identify the most frequently used words and some multi-word expressions in the Bulgarian Parliament. We do this by using the transcripts of all plenary sessions between 1990 and 2024 - 3,936 in total. This allows us both to study an interesting period known in the Bulgarian linguistic space as the years of “transition and democracy”, and to provide scholars of Bulgarian politics with a purposefully generated list of additional stop words that they can use for future analysis. Because our list of words was generated from the data, there is no preconceived theory, and because we include all interactions during all sessions, our analysis goes beyond traditional party lines. We provide details of how we selected, retrieved, and cleaned our data, and discuss our findings.
+ 2024.clib-1.27
+ margova-bruinsma-2024-look
+
+
+ Estimating Commonsense Knowledge from a Linguistic Analysis on Information Distribution
+ SabrinaMennella
+ MariaDi Maro
+ MartinaDi Bratto
+ 257–263
+ Commonsense Knowledge (CSK) is defined as a complex and multifaceted structure, encompassing a wide range of knowledge and reasoning generally acquired through everyday experiences. As CSK is often implicit in communication, it poses a challenge for AI systems to simulate human-like interaction. This work aims to deepen the analysis of the CSK information structure from a linguistic perspective, starting from its organisation in conversations. To achieve this goal, we developed a three-level analysis model to extract more insights about this knowledge, focusing our attention on the second level. In particular, we aimed to extract the distribution of explicit actions and their execution order in the communicative flow. We built an annotation scheme based on FrameNet and applied it to a dialogical corpus on the culinary domain. Preliminary results indicate that certain frames occur earlier in the dialogues, while others occur towards the process’s end. These findings shed light on the systematic nature of actions by establishing clear patterns and relationships between frames.
+ 2024.clib-1.28
+ mennella-etal-2024-estimating
+
+
+ Pondera: A Personalized AI-Driven Weight Loss Mobile Companion with Multidimensional Goal Fulfillment Analytics
+ GeorgiPashev
+ SilviaGaftandzhieva
+ 264–271
+ The global obesity epidemic is a significant challenge to public health, necessitating innovative and personalized solutions. This paper presents Pondera, an innovative mobile app revolutionizing weight management by integrating Artificial Intelligence (AI) and multidimensional goal fulfilment analytics. Pondera distinguishes itself by supplying a tailored approach to weight loss, combining individual user data, including dietary preferences, fitness levels, and specific weight loss objectives, with advanced AI algorithms to generate personalized weight loss plans. Future development directions include refining AI algorithms, enhancing user experience, and validating effectiveness through comprehensive studies, ensuring Pondera becomes a pivotal tool in achieving sustainable weight loss and health improvement.
+ 2024.clib-1.29
+ pashev-gaftandzhieva-2024-pondera
+
+
+ Mitigating Hallucinations in Large Language Models via Semantic Enrichment of Prompts: Insights from BioBERT and Ontological Integration
+ StanislavPenkov
+ 272–276
+ The advent of Large Language Models (LLMs) has been transformative for natural language processing, yet their tendency to produce “hallucinations”—outputs that are factually incorrect or entirely fabricated—remains a significant hurdle. This paper introduces a proactive methodology for reducing hallucinations by strategically enriching LLM prompts. This involves identifying key entities and contextual cues from varied domains and integrating this information into the LLM prompts to guide the model towards more accurate and relevant responses. Leveraging examples from BioBERT for biomedical entity recognition and ChEBI for chemical ontology, we illustrate a broader approach that encompasses semantic prompt enrichment as a versatile tool for enhancing LLM output accuracy. By examining the potential of semantic and ontological enrichment in diverse contexts, we aim to present a scalable strategy for improving the reliability of AI-generated content, thereby contributing to the ongoing efforts to refine LLMs for a wide range of applications.
+ 2024.clib-1.30
+ penkov-2024-mitigating
+
+
+ Commercially Minor Languages and Localization
+ MariaTodorova
+ 277–285
+ This paper offers a perspective on languages with a less significant volume of digital usership as minor in the context of globalization and localization. With this premise, the risks this status poses to the quality of localized texts, the substantiality of genre conventions, the public image of professional translators, and the users’ linguistic competence in these languages are explored. Furthermore, the common lack of established or clear conventions in the localization of digital products into commercially minor languages (and in the digital product genres) is highlighted as one of the factors amplifying these risks. These perspectives are contextualized for Bulgarian with examples of errors encountered in Bulgarian digital content localized from English, more specifically errors and problems related to gender neutrality and register.
+ 2024.clib-1.31
+ todorova-2024-commercially
+
+
+ Semantic features in the automatic analysis of verbs of creation in Bulgarian and English
+ IvelinaStoyanova
+ 286–295
+ The paper focuses on the semantic class of verbs of creation as a subclass of dynamic verbs. The objective is to present the description of creation verbs in terms of their corresponding semantic frames and to outline the semantic features of the frame elements with a view to their automatic identification and analysis in text. The observations are performed on Bulgarian and English data with the aim to establish the language-independent and language-specific features in the semantic description of the analysed class of verbs.
+ 2024.clib-1.32
+ stoyanova-2024-semantic
+
+
+ A ‘Dipdive’ into Motion: Exploring Lexical Resources towards a Comprehensive Semantic and Syntactic Description
+ SvetlozaraLeseva
+ 296–308
+ In this paper I illustrate the semantic description of verbs provided in three semantic resources (FrameNet, VerbNet and VerbAtlas) in comparative terms with a view to identifying common and distinct components in their representation and obtaining a preliminary idea of the resources’ interoperability. To this end, I provide a comparison of a small sample of motion verbs aligned with semantic frames and classes in the three resources. I also describe the semantic annotation of Bulgarian motion verbs using the framework defined in the Berkeley FrameNet project and its enrichment with information from the other two resources, which has been enabled by the mapping between: (i) their major semantic units – FrameNet frames, VerbNet classes and VerbAtlas frames, and (ii) their ‘building blocks’ – frame elements (FrameNet) and semantic roles (VerbNet, VerbAtlas).
+ 2024.clib-1.33
+ leseva-2024-dipdive
+
+
+ Multilingual Corpus of Illustrative Examples on Activity Predicates
+ IvelinaStoyanova
+ HristinaKukova
+ MariaTodorova
+ TsvetanaDimitrova
+ 309–318
+ The paper presents the ongoing process of compilation of a multilingual corpus of illustrative examples to supplement our work on the syntactic and semantic analysis of predicates representing activities in Bulgarian and other languages. The corpus aims to include over 1,000 illustrative examples on verbs from six semantic classes of predicates (verbs of motion, contact, consumption, creation, competition and bodily functions) which provide a basis for observations on the specificity of their realisation. The corpus of illustrative examples will be used for contrastive studies and further elaboration on the scope and behaviour of activity verbs in general, as well as their semantic subclasses.
+ 2024.clib-1.34
+ stoyanova-etal-2024-multilingual
+
+
+ Large Language Models in Linguistic Research: the Pilot and the Copilot
+ SvetlaKoeva
+ 319–328
+ In this paper, we present two experiments focussing on linguistic classification and annotation of examples, using zero-shot prompting. The aim is to show how large language models can confirm or reject the linguistic judgements of experts in order to increase the productivity of their work. In the first experiment, new lexical units evoking a particular FrameNet semantic frame are selected simultaneously with the annotation of examples with the core frame elements. The second experiment attempts to categorise verbs into the aspectual classes, assuming that only certain combinations of verbs belonging to different aspectual classes evoke a semantic frame. The linguistic theories underlying the two experiments, the development of the prompts and the results of the experiments are presented.
+ 2024.clib-1.35
+ koeva-2024-large
+
+
+
diff --git a/data/xml/2024.eacl.xml b/data/xml/2024.eacl.xml
index fac82099eb..5aeeb34cea 100644
--- a/data/xml/2024.eacl.xml
+++ b/data/xml/2024.eacl.xml
@@ -364,7 +364,7 @@
VerenaBlaschkeLudwig-Maximilians-Universität München
BarbaraPlankLudwig-Maximilians-Universität München and IT University of Copenhagen
445-468
- Mainstream cross-lingual task-oriented dialogue (ToD) systems leverage the transfer learning paradigm by training a joint model for intent recognition and slot-filling in English and applying it, zero-shot, to other languages.We address a gap in prior research, which often overlooked the transfer to lower-resource colloquial varieties due to limited test data.Inspired by prior work on English varieties, we craft and manually evaluate perturbation rules that transform German sentences into colloquial forms and use them to synthesize test sets in four ToD datasets.Our perturbation rules cover 18 distinct language phenomena, enabling us to explore the impact of each perturbation on slot and intent performance.Using these new datasets, we conduct an experimental evaluation across six different transformers.Here, we demonstrate that when applied to colloquial varieties, ToD systems maintain their intent recognition performance, losing 6% (4.62 percentage points) in accuracy on average. However, they exhibit a significant drop in slot detection, with a decrease of 31% (21 percentage points) in slot F_1 score.Our findings are further supported by a transfer experiment from Standard American English to synthetic Urban African American Vernacular English.
+ Mainstream cross-lingual task-oriented dialogue (ToD) systems leverage the transfer learning paradigm by training a joint model for intent recognition and slot-filling in English and applying it, zero-shot, to other languages. We address a gap in prior research, which often overlooked the transfer to lower-resource colloquial varieties due to limited test data. Inspired by prior work on English varieties, we craft and manually evaluate perturbation rules that transform German sentences into colloquial forms and use them to synthesize test sets in four ToD datasets. Our perturbation rules cover 18 distinct language phenomena, enabling us to explore the impact of each perturbation on slot and intent performance. Using these new datasets, we conduct an experimental evaluation across six different transformers. Here, we demonstrate that when applied to colloquial varieties, ToD systems maintain their intent recognition performance, losing 6% (4.62 percentage points) in accuracy on average. However, they exhibit a significant drop in slot detection, with a decrease of 31% (21 percentage points) in slot F_1 score. Our findings are further supported by a transfer experiment from Standard American English to synthetic Urban African American Vernacular English.
2024.eacl-long.28
2024.eacl-long.28.software.zip
artemova-etal-2024-exploring
diff --git a/data/xml/2024.findings.xml b/data/xml/2024.findings.xml
index 1b32a29dd4..e053da99cf 100644
--- a/data/xml/2024.findings.xml
+++ b/data/xml/2024.findings.xml
@@ -5893,19 +5893,19 @@
- Findings of the Association for Computational Linguistics ACL 2024
+ Findings of the Association for Computational Linguistics: ACL 2024
Lun-WeiKuAcademia Sinica
AndreMartinsInstituto Superior Técnico / Instituto de Telecomunicações / Unbabel
VivekSrikumarUniversity of Utah
Association for Computational Linguistics
- Bangkok, Thailand and virtual meeting
+ Bangkok, Thailand
August
2024
- 2024.findings-acl
+ 2024.findings-acl
findings
- 2024.findings-acl.0
+ 2024.findings-acl.0
findings-2024-acl
@@ -5915,7 +5915,7 @@
JingboShangUniversity of California, San Diego
1-16
Prompting large language models (LLMs) for data augmentation has recently become a common practice in few-shot NLP tasks. In this paper, we propose Chain-of-Thought Attribute Manipulation (CoTAM), a novel approach that generates new data from existing examples by tweaking only the user-provided, task-specific attribute, e.g., sentiment polarity or topic in movie reviews. Instead of conventional latent representation controlling, we leverage chain-of-thought prompting to directly edit the text in three steps: (1) attribute decomposition, (2) manipulation proposal, and (3) sentence reconstruction. Extensive results on various tasks, such as text (pair) classification and aspect-based sentiment analysis, verify the superiority of CoTAM over other LLM-based augmentation methods with the same number of training examples for both fine-tuning and in-context learning. Remarkably, the 2D visualization of the augmented dataset using principal component analysis revealed a human-recognizable decision boundary that is likely hinted by the attribute manipulation, demonstrating the potential of our proposed approach.
- 2024.findings-acl.1
+ 2024.findings-acl.1
peng-etal-2024-controllable
10.18653/v1/2024.findings-acl.1
@@ -5926,7 +5926,7 @@
YiFeng
17-27
Keyphrase extraction aims to automatically extract salient phrases representing the critical information in the source document. Identifying salient phrases is challenging because there is a lot of noisy information in the document, leading to wrong extraction. To address this issue, in this paper, we propose a hybrid matching model for keyphrase extraction, which combines representation-focused and interaction-based matching modules into a unified framework for improving the performance of the keyphrase extraction task. Specifically, HybridMatch comprises (1) a PLM-based Siamese encoder component that represents both candidate phrases and documents, (2) an interaction-focused matching (IM) component that estimates word matches between candidate phrases and the corresponding document at the word level, and (3) a representation-focused matching (RM) component that captures context-aware semantic relatedness of each candidate keyphrase at the phrase level. Extensive experimental results on the OpenKP dataset demonstrate that the proposed HybridMatch model outperforms the recent state-of-the-art keyphrase extraction baselines. Furthermore, we discuss the performance of large language models in keyphrase extraction based on recent studies and our experiments.
- 2024.findings-acl.2
+ 2024.findings-acl.2
song-etal-2024-match
10.18653/v1/2024.findings-acl.2
@@ -5941,7 +5941,7 @@
NingyiXuShanghai Jiaotong University
28-36
Large language models (LLMs) show great performance in various tasks, but face deployment challenges from limited memory capacity and bandwidth. Low-bit weight quantization can save memory and accelerate inference. Although floating-point (FP) formats show good performance in LLM quantization, they tend to perform poorly with small group sizes or sub-4 bits. We find the reason is that the absence of asymmetry in previous FP quantization makes it unsuitable for handling asymmetric value distribution of LLM weight tensors. In this work, we propose asymmetric FP quantization (AFPQ), which sets separate scales for positive and negative values. Our method leads to large accuracy improvements and can be easily plugged into other quantization methods, including GPTQ and AWQ, for better performance. Besides, no additional storage is needed compared with asymmetric integer (INT) quantization. The code is available at https://github.com/zhangsichengsjtu/AFPQ.
- 2024.findings-acl.3
+ 2024.findings-acl.3
zhang-etal-2024-afpq
10.18653/v1/2024.findings-acl.3
@@ -5953,7 +5953,7 @@
GuodongZhouSoochow University, China
37-47
Emotion detection is the task of automatically associating one or more emotions with a text. The emotions are experienced, targeted, and caused by different semantic constituents. Therefore, it is necessary to incorporate these semantic constituents into the process of emotion detection. In this study, we propose a new task called emotion semantic parsing which aims to parse the emotion and semantic constituents into an abstract semantic tree structure. In particular, we design an end-to-end generation model to capture the relations between emotion and all the semantic constituents, and to generate them jointly. Furthermore, we employ a task decomposition strategy to capture the semantic relation among these constituents in a more cognitive and structural way. Experimental results demonstrate the importance of the proposed task, and indicate the proposed model gives superior performance compared to other models.
- 2024.findings-acl.4
+ 2024.findings-acl.4
jiang-etal-2024-end
10.18653/v1/2024.findings-acl.4
@@ -5967,7 +5967,7 @@
QiangZhangZhejiang University
48-61
Intelligent task-oriented dialogue systems (ToDs) are expected to continuously acquire new knowledge, also known as Continual Learning (CL), which is crucial to fit ever-changing user needs. However, catastrophic forgetting dramatically degrades the model performance in the face of a long streamed curriculum. In this paper, we aim to overcome the forgetting problem in ToDs and propose a method (HESIT) with a hyper-gradient-based exemplar strategy, which samples influential exemplars for periodic retraining. Instead of unilaterally observing data or models, HESIT adopts a profound exemplar selection strategy that considers the general performance of the trained model when selecting exemplars for each task domain. Specifically, HESIT analyzes the training data influence by tracing their hyper-gradient in the optimization process. Furthermore, HESIT avoids estimating the Hessian to make it compatible with ToDs with a large pre-trained model. Experimental results show that HESIT effectively alleviates catastrophic forgetting by exemplar selection, and achieves state-of-the-art performance on the largest CL benchmark of ToDs in terms of all metrics.
- 2024.findings-acl.5
+ 2024.findings-acl.5
chen-etal-2024-overcoming
10.18653/v1/2024.findings-acl.5
@@ -5976,7 +5976,7 @@
HyunsooChoEwha Women’s University
62-73
Many recent studies endeavor to improve open-sourced language models through imitation learning, re-training on the synthetic instruction data from state-of-the-art proprietary models like ChatGPT and GPT-4. However, the innate nature of synthetic data inherently contains noisy data, giving rise to a substantial presence of low-quality data replete with misleading queries, erroneous responses, and flawed reasoning. Although we intuitively grasp the potential harm of noisy data, we lack a quantitative understanding of its impact. To this end, this paper explores the correlation between the degree of noise and its impact on language models through instruction tuning. We first introduce the Falsity-Controllable () dataset, which comprises pairs of true answers and corresponding reasoning, as well as false pairs to manually control the factuality ratio of the dataset. Through our extensive experiments, we found multiple intriguing findings on the correlation between factuality and instruction tuning. Specifically, factuality can significantly impact various benchmark characteristics, especially when benchmarks are related to the knowledge domain, and initial data quality plays a critical role, whereas the number of learning steps has a lesser impact. Additionally, we noted that once the language model is trained with a dataset contaminated by noise, restoring its original performance becomes exceptionally challenging, verging on irreversible.
- 2024.findings-acl.6
+ 2024.findings-acl.6
cho-2024-unveiling
10.18653/v1/2024.findings-acl.6
@@ -5992,7 +5992,7 @@
ArmandoSolar-LezamaMassachusetts Institute of Technology
74-117
While language models are increasingly more proficient at code generation, they still frequently generate incorrect programs. Many of these programs are obviously wrong, but others are more subtle and pass weaker correctness checks such as being able to compile. In this work, we focus on these counterfeit samples: programs sampled from a language model that 1) have a high enough log-probability to be generated at a moderate temperature and 2) pass weak correctness checks. Overall, we discover that most models have a very shallow understanding of counterfeits through three clear failure modes. First, models mistakenly classify them as correct. Second, models are worse at reasoning about the execution behaviour of counterfeits and often predict their execution results as if they were correct. Third, when asking models to fix counterfeits, the likelihood of a model successfully repairing a counterfeit is often even lower than that of sampling a correct program from scratch. Counterfeits also have very unexpected properties: first, counterfeit programs for problems that are easier for a model to solve are not necessarily easier to detect and only slightly easier to execute and repair. Second, counterfeits from a given model are just as confusing to the model itself as they are to other models. Finally, both strong and weak models are able to generate counterfeit samples that equally challenge all models. In light of our findings, we recommend that care and caution be taken when relying on models to understand their own samples, especially when no external feedback is incorporated.
- 2024.findings-acl.7
+ 2024.findings-acl.7
gu-etal-2024-counterfeit
10.18653/v1/2024.findings-acl.7
@@ -6008,7 +6008,7 @@
AakankshaNaikAllen Institute for Artificial Intelligence and National Institutes of Health
118-132
Literature review requires researchers to synthesize a large amount of information and is increasingly challenging as the scientific literature expands. In this work, we investigate the potential of LLMs for producing hierarchical organizations of scientific studies to assist researchers with literature review. We define hierarchical organizations as tree structures where nodes refer to topical categories and every node is linked to the studies assigned to that category. Our naive LLM-based pipeline for hierarchy generation from a set of studies produces promising yet imperfect hierarchies, motivating us to collect CHIME, an expert-curated dataset for this task focused on biomedicine. Given the challenging and time-consuming nature of building hierarchies from scratch, we use a human-in-the-loop process in which experts correct errors (both links between categories and study assignment) in LLM-generated hierarchies. CHIME contains 2,174 LLM-generated hierarchies covering 472 topics, and expert-corrected hierarchies for a subset of 100 topics. Expert corrections allow us to quantify LLM performance, and we find that while they are quite good at generating and organizing categories, their assignment of studies to categories could be improved. We attempt to train a corrector model with human feedback which improves study assignment by 12.6 F1 points. We release our dataset and models to encourage research on developing better assistive tools for literature review.
- 2024.findings-acl.8
+ 2024.findings-acl.8
hsu-etal-2024-chime
10.18653/v1/2024.findings-acl.8
@@ -6028,7 +6028,7 @@
GoranNenadicUniversity of Manchester
133-150
With the recent advances of large language models (LLMs), it is no longer infeasible to build an automated debate system that helps people to synthesise persuasive arguments. Previous work attempted this task by integrating multiple components. In our work, we introduce an argument mining dataset that captures the end-to-end process of preparing an argumentative essay for a debate, which covers the tasks of claim and evidence identification (Task 1 ED), evidence convincingness ranking (Task 2 ECR), argumentative essay summarisation and human preference ranking (Task 3 ASR) and metric learning for automated evaluation of resulting essays, based on human feedback along argument quality dimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are fully annotated with various properties supporting the aforementioned tasks. We evaluate multiple generative baselines for each of these tasks, including representative LLMs. We find that while they show promising results on individual tasks in our benchmark, their end-to-end performance on all four tasks in succession deteriorates significantly, both in automated measures as well as in human-centred evaluation. This challenge presented by our proposed dataset motivates future research on end-to-end argument mining and summarisation. The repository of this project is available at https://github.com/HarrywillDr/ArgSum-Datatset.
- 2024.findings-acl.9
+ 2024.findings-acl.9
li-etal-2024-side
10.18653/v1/2024.findings-acl.9
@@ -6045,7 +6045,7 @@
AsimMunawarInternational Business Machines
151-162
Despite LLMs’ recent advancements, they still suffer from factual inconsistency and hallucination. An often-opted remedy is retrieval-augmented generation – however, there is no guarantee that the model will strictly adhere to retrieved grounding. Fundamentally, LLMs need to be aligned to be more faithful to grounding, which will require high-quality preference annotations. This paper investigates whether we can create high-quality grounded preference data for model alignment without using annotations from humans or large proprietary models. We experimented with existing entailment data and proposed approaches to generate synthetic grounded preference data, with which we train a Grounded Preference Model (GPM). We demonstrate through Proximal Policy Optimization (PPO) training of Mistral-7B-Instruct that our GPM model can successfully align powerful LLMs to generate much better grounded responses as judged by GPT-4. Moreover, we show that our GPM is also a great faithfulness classifier, achieving SoTA in dialogue sub-tasks of the TRUE faithfulness benchmark. We will release our GPM under the Apache 2.0 license.
- 2024.findings-acl.10
+ 2024.findings-acl.10
naseem-etal-2024-grounded
10.18653/v1/2024.findings-acl.10
@@ -6065,7 +6065,7 @@
JiaweiHan
163-184
Large language models (LLMs), while exhibiting exceptional performance, suffer from hallucinations, especially on knowledge-intensive tasks. Existing works propose to augment LLMs with individual text units retrieved from external knowledge corpora to alleviate the issue. However, in many domains, texts are interconnected (e.g., academic papers in a bibliographic graph are linked by citations and co-authorships) which form a (text-attributed) graph. The knowledge in such graphs is encoded not only in single texts/nodes but also in their associated connections. To facilitate the research of augmenting LLMs with graphs, we manually construct a Graph Reasoning Benchmark dataset called GRBench, containing 1,740 questions that can be answered with the knowledge from 10 domain graphs. Then, we propose a simple and effective framework called Graph Chain-of-thought (Graph-CoT) to augment LLMs with graphs by encouraging LLMs to reason on the graph iteratively. Each Graph-CoT iteration consists of three sub-steps: LLM reasoning, LLM-graph interaction, and graph execution. We conduct systematic experiments with three LLM backbones on GRBench, where Graph-CoT outperforms the baselines consistently. The code is available at https://github.com/PeterGriffinJin/Graph-CoT/.
- 2024.findings-acl.11
+ 2024.findings-acl.11
jin-etal-2024-graph
10.18653/v1/2024.findings-acl.11
@@ -6079,7 +6079,7 @@
JiaweiHan
185-205
The task of information extraction (IE) is to extract structured knowledge from text. However, it is often not straightforward to utilize IE output due to the mismatch between the IE ontology and the downstream application needs. We propose a new formulation of IE, Text2DB, that emphasizes the integration of IE output and the target database (or knowledge base). Given a user instruction, a document set, and a database, our task requires the model to update the database with values from the document set to satisfy the user instruction. This task requires understanding user instructions for what to extract and adapting to the given DB/KB schema for how to extract on the fly. To evaluate this new task, we introduce a new benchmark featuring common demands such as data infilling, row population, and column addition. In addition, we propose an LLM agent framework OPAL (Observe-Plan-Analyze LLM) which includes an Observer component that interacts with the database, the Planner component that generates a code-based plan with calls to IE models, and the Analyzer component that provides feedback regarding code quality before execution. Experiments show that OPAL can successfully adapt to diverse database schemas by generating different code plans and calling the required IE models. We also highlight difficult cases such as dealing with large databases with complex dependencies and extraction hallucination, which we believe deserve further investigation.
- 2024.findings-acl.12
+ 2024.findings-acl.12
jiao-etal-2024-text2db
10.18653/v1/2024.findings-acl.12
@@ -6091,7 +6091,7 @@
EmilyPrud’hommeauxBoston College
206-213
N-gram language models (LMs) are the innovation that first made large-vocabulary continuous automatic speech recognition (ASR) viable. With neural end-to-end ASR architectures, however, LMs have become an afterthought. While the effect on accuracy may be negligible for English and Mandarin, jettisoning the LM might not make sense for the world’s remaining 6000+ languages. In this paper, we investigate the role of the LM in low-resource ASR. First we ask: does using an n-gram LM in decoding in neural architectures help ASR performance? While it may seem obvious that it should, its absence in most implementations suggests otherwise. Second, we ask: when an n-gram LM is used in ASR, is there a relationship between the size of the LM and ASR accuracy? We have discovered that gut feelings on this question vary considerably, but there is little empirical work to support any particular claim. We explore these questions “in the wild” using a deliberately diverse set of 9 very small ASR corpora. The results show that: (1) decoding with an n-gram LM, regardless of its size, leads to lower word error rates; and (2) increasing the size of the LM appears to yield improvements only when the audio corpus itself is already relatively large. This suggests that collecting additional LM training text may benefit widely-spoken languages which typically have larger audio corpora. In contrast, for endangered languages where data of any kind will always be limited, efforts may be better spent collecting additional transcribed audio.
- 2024.findings-acl.13
+ 2024.findings-acl.13
liu-etal-2024-important
10.18653/v1/2024.findings-acl.13
@@ -6105,7 +6105,7 @@
SeanLieCerebras Systems, Inc
214-230
Large language models (LLMs) are typically trained on general source data for various domains, but a recent surge in domain-specific LLMs has shown their potential to outperform general-purpose models in domain-specific tasks (e.g., biomedicine). Although domain-specific pre-training enhances efficiency and leads to smaller models, the computational costs of training these LLMs remain high, posing budgeting challenges. We introduce MediSwift, a suite of biomedical LMs that leverage sparse pre-training on domain-specific biomedical text data. By inducing up to 75% weight sparsity during the pre-training phase, MediSwift achieves a 2-2.5x reduction in training FLOPs. Notably, all sparse pre-training was performed on the Cerebras CS-2 system, which is specifically designed to realize the acceleration benefits from unstructured weight sparsity, thereby significantly enhancing the efficiency of the MediSwift models. Through subsequent dense fine-tuning and strategic soft prompting, MediSwift models outperform existing LLMs up to 7B parameters on biomedical tasks, setting new benchmarks w.r.t efficiency-accuracy on tasks such as PubMedQA. Our results show that sparse pre-training, along with dense fine-tuning and soft prompting, offers an effective method for creating high-performing, computationally efficient models in specialized domains.
- 2024.findings-acl.14
+ 2024.findings-acl.14
thangarasa-etal-2024-mediswift
10.18653/v1/2024.findings-acl.14
@@ -6116,7 +6116,7 @@
JacobAndreasMassachusetts Institute of Technology and Microsoft
231-247
Today’s most accurate language models are trained on orders of magnitude more language data than human language learners receive—but with no supervision from other sensory modalities that play a crucial role in human learning. Can we make LMs’ representations and predictions more accurate (and more human-like) with more ecologically plausible supervision? This paper describes LexiContrastive Grounding (LCG), a grounded language learning procedure that leverages visual supervision to improve textual representations. LexiContrastive Grounding combines a next-token prediction strategy with a contrastive visual grounding objective, focusing on early-layer representations that encode lexical information. Across multiple word-learning and sentence-understanding benchmarks, LexiContrastive Grounding not only outperforms standard language-only models in terms of learning efficiency in small and developmentally plausible data regimes, but also improves upon vision-and-language learning procedures including CLIP, GIT, Flamingo, and Vokenization. Moreover, LexiContrastive Grounding improves perplexity by around 5% on multiple language modeling tasks compared to other models trained on the same amount of text data. This work underscores the potential of incorporating visual grounding into language models, aligning more closely with the multimodal nature of human language acquisition.
- 2024.findings-acl.15
+ 2024.findings-acl.15
zhuang-etal-2024-lexicon
10.18653/v1/2024.findings-acl.15
@@ -6129,7 +6129,7 @@
GjergjiKasneciTechnische Universität München and University of Tuebingen
248-264
A multitude of industries depend on accurate and reasonable tabular data augmentation for their business processes. Contemporary methodologies in generating tabular data revolve around utilizing Generative Adversarial Networks (GAN) or fine-tuning Large Language Models (LLM). However, GAN-based approaches are documented to produce samples with common-sense errors attributed to the absence of external knowledge. On the other hand, LLM-based methods exhibit a limited capacity to capture the disparities between synthesized and actual data distribution due to the absence of feedback from a discriminator during training. Furthermore, the decoding of LLM-based generation introduces gradient breakpoints, impeding the backpropagation of loss from a discriminator, thereby complicating the integration of these two approaches. To solve this challenge, we propose using proximal policy optimization (PPO) to apply GANs, guiding LLMs to enhance the probability distribution of tabular features. This approach enables the utilization of LLMs as generators for GANs in synthesizing tabular data. Our experiments demonstrate that PPO leads to an approximately 4% improvement in the accuracy of models trained on synthetically generated data over state-of-the-art across three real-world datasets.
- 2024.findings-acl.16
+ 2024.findings-acl.16
yang-etal-2024-p
10.18653/v1/2024.findings-acl.16
@@ -6139,7 +6139,7 @@
WeiAiUniversity of Maryland, College Park
265-282
There is increasing interest in distilling task-specific knowledge from large language models (LLM) to smaller student models. Nonetheless, LLM distillation presents a dual challenge: 1) there is a high cost associated with querying the teacher LLM, such as GPT-4, for gathering an ample number of demonstrations; 2) the teacher LLM might provide imperfect outputs with a negative impact on the student’s learning process. To enhance sample efficiency within resource-constrained, imperfect teacher scenarios, we propose a three-component framework leveraging three signal types. The first signal is the student’s self-consistency (consistency of the student’s multiple outputs), which is a proxy of the student’s confidence. Specifically, we introduce a “teaching assistant” (TA) model to assess the uncertainty of both the student’s and the teacher’s outputs via confidence scoring, which serves as another two signals for student training. Furthermore, we propose a two-stage training schema to first warm up the student with a small proportion of data to better utilize the student’s signal. Experiments have shown the superiority of our proposed framework for four complex reasoning tasks. On average, our proposed two-stage framework brings a relative improvement of up to 20.79% compared to fine-tuning without any signals across datasets.
- 2024.findings-acl.17
+ 2024.findings-acl.17
zhou-ai-2024-teaching
10.18653/v1/2024.findings-acl.17
@@ -6154,7 +6154,7 @@
JulianMcAuleyUniversity of California, San Diego, University of California, San Diego
283-294
Large language models (LLMs) such as GPT-3 and GPT-4 are powerful but their weights are often publicly unavailable and their immense sizes make the models difficult to be tuned with common hardware. As a result, effectively tuning these models with large-scale supervised data can be challenging. As an alternative, In-Context Learning (ICL) can only use a small number of supervised examples due to context length limits. In this paper, we propose Super In-Context Learning (SuperICL) which allows black-box LLMs to work with locally fine-tuned smaller models, resulting in superior performance on supervised tasks. Our experiments demonstrate that SuperICL can improve performance beyond state-of-the-art fine-tuned models while addressing the instability problem of in-context learning.
- 2024.findings-acl.18
+ 2024.findings-acl.18
xu-etal-2024-small
10.18653/v1/2024.findings-acl.18
@@ -6165,7 +6165,7 @@
SivaReddyMila, McGill University and Mila, McGill University
295-337
Instruction-tuned Large Language Models (LLMs) excel at many tasks and will even explain their reasoning, so-called self-explanations. However, convincing and wrong self-explanations can lead to unsupported confidence in LLMs, thus increasing risk. Therefore, it’s important to measure if self-explanations truly reflect the model’s behavior. Such a measure is called interpretability-faithfulness and is challenging to perform since the ground truth is inaccessible, and many LLMs only have an inference API. To address this, we propose employing self-consistency checks to measure faithfulness. For example, if an LLM says a set of words is important for making a prediction, then it should not be able to make its prediction without these words. While self-consistency checks are a common approach to faithfulness, they have not previously been successfully applied to LLM self-explanations for counterfactual, feature attribution, and redaction explanations. Our results demonstrate that faithfulness is explanation, model, and task-dependent, showing self-explanations should not be trusted in general. For example, with sentiment classification, counterfactuals are more faithful for Llama2, feature attribution for Mistral, and redaction for Falcon 40B.
- 2024.findings-acl.19
+ 2024.findings-acl.19
madsen-etal-2024-self
10.18653/v1/2024.findings-acl.19
@@ -6182,7 +6182,7 @@
CorneliaCarageaUniversity of Illinois, Chicago
338-354
Existing datasets for attribute value extraction (AVE) predominantly focus on explicit attribute values while neglecting the implicit ones, lack product images, are often not publicly available, and lack an in-depth human inspection across diverse domains. To address these limitations, we present ImplicitAVE, the first, publicly available multimodal dataset for implicit attribute value extraction. ImplicitAVE, sourced from the MAVE dataset, is carefully curated and expanded to include implicit AVE and multimodality, resulting in a refined dataset of 68k training and 1.6k testing data across five domains. We also explore the application of multimodal large language models (MLLMs) to implicit AVE, establishing a comprehensive benchmark for MLLMs on the ImplicitAVE dataset. Six recent MLLMs with eleven variants are evaluated across diverse settings, revealing that implicit value extraction remains a challenging task for MLLMs. The contributions of this work include the development and release of ImplicitAVE, and the exploration and benchmarking of various MLLMs for implicit AVE, providing valuable insights and potential future research directions. Dataset and code are available at https://github.com/HenryPengZou/ImplicitAVE.
- 2024.findings-acl.20
+ 2024.findings-acl.20
zou-etal-2024-implicitave
10.18653/v1/2024.findings-acl.20
@@ -6194,7 +6194,7 @@
FereshteKhaniMicrosoft
355-385
Prompt engineering is a challenging yet crucial task for optimizing the performance of large language models on customized tasks. It requires complex reasoning to examine the model’s errors, hypothesize what is missing or misleading in the current prompt, and communicate the task with clarity. While recent works indicate that large language models can be meta-prompted to perform automatic prompt engineering, we argue that their potential is limited due to insufficient guidance for complex reasoning in the meta-prompt. We fill this gap by infusing into the meta-prompt three key components: detailed descriptions, context specification, and a step-by-step reasoning template. The resulting method, named PE2, showcases remarkable versatility across diverse language tasks. It finds prompts that outperform “let’s think step by step” by 6.3% on MultiArith and 3.1% on GSM8K, and outperforms competitive baselines on counterfactual tasks by 6.9%. Further, we show that PE2 can make targeted prompt edits, rectify erroneous prompts, and induce multi-step plans for complex tasks.
- 2024.findings-acl.21
+ 2024.findings-acl.21
ye-etal-2024-prompt
10.18653/v1/2024.findings-acl.21
@@ -6210,7 +6210,7 @@
DineshManochaUniversity of Maryland, College Park
386-406
Neural image classifiers can often learn to make predictions by overly relying on non-predictive features that are spuriously correlated with the class labels in the training data. This leads to poor performance in real-world atypical scenarios where such features are absent. This paper presents ASPIRE (Language-guided Data Augmentation for SPurIous correlation REmoval), a simple yet effective solution for supplementing the training dataset with images without spurious features, for robust learning against spurious correlations via better generalization. ASPIRE, guided by language at various steps, can generate non-spurious images without requiring any group labeling or existing non-spurious images in the training set. Precisely, we employ LLMs to first extract foreground and background features from textual descriptions of an image, followed by advanced language-guided image editing to discover the features that are spuriously correlated with the class label. Finally, we personalize a text-to-image generation model using the edited images to generate diverse in-domain images without spurious features. ASPIRE is complementary to all prior robust training methods in literature, and we demonstrate its effectiveness across 4 datasets and 9 baselines and show that ASPIRE improves the worst-group classification accuracy of prior methods by 1% - 38%. We also contribute a novel test set for the challenging Hard ImageNet dataset.
- 2024.findings-acl.22
+ 2024.findings-acl.22
ghosh-etal-2024-aspire
10.18653/v1/2024.findings-acl.22
@@ -6227,7 +6227,7 @@
RadaMihalceaUniversity of Michigan
407-426
Tables contrast with unstructured text data in their structure for organizing information. In this paper, we investigate the efficiency of various LLMs in interpreting tabular data through different prompting strategies and data formats. Our analysis extends across six benchmarks for table-related tasks such as question-answering and fact-checking. We pioneer the assessment of LLMs’ performance on image-based table representations. Specifically, we compare five text-based and three image-based table representations, revealing the influence of representation and prompting on LLM performance. We hope our study provides researchers insights into optimizing LLMs’ application in table-related tasks.
- 2024.findings-acl.23
+ 2024.findings-acl.23
deng-etal-2024-tables
10.18653/v1/2024.findings-acl.23
@@ -6244,7 +6244,7 @@
YueDongUniversity of California, Riverside and McGill University
427-452
Using novel approaches to dataset development, the Biasly dataset captures the nuance and subtlety of misogyny in ways that are unique within the literature. Built in collaboration with multi-disciplinary experts and annotators themselves, the dataset contains annotations of movie subtitles, capturing colloquial expressions of misogyny in North American film. The open-source dataset can be used for a range of NLP tasks, including binary and multi-label classification, severity score regression, and text generation for rewrites. In this paper, we discuss the methodology used, analyze the annotations obtained, provide baselines for each task using common NLP algorithms, and furnish error analyses to give insight into model behaviour when fine-tuned on the Biasly dataset.
- 2024.findings-acl.24
+ 2024.findings-acl.24
sheppard-etal-2024-biasly
10.18653/v1/2024.findings-acl.24
@@ -6257,7 +6257,7 @@
PreethiRaghavanFidelity
453-466
Many existing end-to-end systems for hybrid question answering tasks can often be boiled down to a “prompt-and-pray” paradigm, where the user has limited control and insight into the intermediate reasoning steps used to achieve the final result. Additionally, due to the context size limitation of many transformer-based LLMs, it is often not reasonable to expect that the full structured and unstructured context will fit into a given prompt in a zero-shot setting, let alone a few-shot setting. We introduce BlendSQL, a superset of SQLite to act as a unified dialect for orchestrating reasoning across both unstructured and structured data. For hybrid question answering tasks involving multi-hop reasoning, we encode the full decomposed reasoning roadmap into a single interpretable BlendSQL query. Notably, we show that BlendSQL can scale to massive datasets and improve the performance of end-to-end systems while using 35% fewer tokens. Our code is available and installable as a package at https://github.com/parkervg/blendsql.
- 2024.findings-acl.25
+ 2024.findings-acl.25
glenn-etal-2024-blendsql
10.18653/v1/2024.findings-acl.25
@@ -6275,7 +6275,7 @@
VikasChandraMeta
467-484
Several post-training quantization methods have been applied to large language models (LLMs), and have been shown to perform well down to 8-bits. We find that these methods break down at lower bit precision, and investigate quantization-aware training for LLMs (LLM-QAT) to push quantization levels even further. We propose a data-free distillation method that leverages generations produced by the pre-trained model, which better preserves the original output distribution and allows quantizing any generative model independent of its training data, similar to post-training quantization methods. In addition to quantizing weights and activations, we also quantize the KV cache, which is critical for increasing throughput and supporting long sequence dependencies at current model sizes. We experiment with LLaMA models of sizes 7B, 13B, and 30B, at quantization levels down to 4-bits. We observe large improvements over training-free methods, especially in the low-bit settings.
- 2024.findings-acl.26
+ 2024.findings-acl.26
liu-etal-2024-llm
10.18653/v1/2024.findings-acl.26
@@ -6296,7 +6296,7 @@
Hongxia Yang
485-492
In this work, we present InfiMM, an advanced Multimodal Large Language Model that adapts to intricate vision-language tasks. InfiMM, inspired by the Flamingo architecture, distinguishes itself through the utilization of large-scale training data, comprehensive training strategies, and diverse large language models. This approach ensures the preservation of Flamingo’s foundational strengths while simultaneously introducing augmented capabilities. Empirical evaluations across a variety of benchmarks underscore InfiMM’s remarkable capability in multimodal understanding. The code can be found at: https://anonymous.4open.science/r/infimm-zephyr-F60C/.
- 2024.findings-acl.27
+ 2024.findings-acl.27
liu-etal-2024-infimm
10.18653/v1/2024.findings-acl.27
@@ -6310,7 +6310,7 @@
Aixin Sun, Nanyang Technological University
493-516
Although they have achieved great success, Large Language Models (LLMs) usually suffer from unreliable hallucinations. Although attribution can be a potential solution, there are no suitable benchmarks and evaluation metrics for attributing LLMs to structured knowledge. In this paper, we define a new task of Knowledge-aware Language Model Attribution (KaLMA) that improves upon three core concerns with conventional attributed LMs. First, we extend the attribution source from unstructured texts to Knowledge Graphs (KGs), whose rich structures benefit both the attribution performance and working scenarios. Second, we propose a new “Conscious Incompetence” setting that accounts for an incomplete knowledge repository, where the model identifies the need for supporting knowledge beyond the provided KG. Third, we propose a comprehensive automatic evaluation metric encompassing text quality, citation quality, and text-citation alignment. To implement the above innovations, we build BioKaLMA, a dataset in the biography domain, via an evolutionary question generation strategy that controls the question complexity and the knowledge necessary for the answer. For evaluation, we develop a baseline solution and demonstrate the room for improvement in LLMs’ citation generation, emphasizing the importance of incorporating the “Conscious Incompetence” setting and the critical role of retrieval accuracy.
- 2024.findings-acl.28
+ 2024.findings-acl.28
li-etal-2024-towards-verifiable
10.18653/v1/2024.findings-acl.28
@@ -6325,7 +6325,7 @@
Dongyeop Kang, University of Minnesota
517-545
Large Language Models (LLMs) have recently been shown to be effective as automatic evaluators with simple prompting and in-context learning. In this work, we assemble 16 LLMs encompassing four different size ranges and evaluate their output responses by preference ranking from the other LLMs as evaluators, such as “System Star is better than System Square”. We then evaluate the quality of the ranking outputs by introducing the Cognitive Bias Benchmark for LLMs as Evaluators (CoBBLer), a benchmark for measuring six different cognitive biases in LLM evaluation outputs, such as the Egocentric bias, where a model prefers to rank its own outputs highly. We find that LLMs are biased text quality evaluators, exhibiting strong indications of bias on our benchmark (40% of comparisons made by all models) within each of their evaluations, which questions their robustness as evaluators. Furthermore, we examine the correlation between human and machine preferences and calculate the average Rank-Biased Overlap (RBO) score to be 44%, indicating that machine preferences are misaligned with humans’. According to our findings, LLMs may not yet be suitable for automatic annotation aligned with human preferences.
- 2024.findings-acl.29
+ 2024.findings-acl.29
koo-etal-2024-benchmarking
10.18653/v1/2024.findings-acl.29
@@ -6339,7 +6339,7 @@
Chengqing Zong, Institute of Automation, Chinese Academy of Sciences
546-566
Large language models respond well in high-resource languages like English but struggle in low-resource languages, likely owing to the lack of high-quality instruction-following data in these languages. Directly translating English samples into these languages can be a solution but is unreliable, leading to responses with translation errors that lack language-specific or cultural knowledge. To address this issue, we propose a novel method to construct cross-lingual instruction-following samples with the instruction in English and the response in a low-resource language. Specifically, the language model first learns to generate appropriate English instructions for natural web texts in other languages, which serve as responses. The candidate cross-lingual instruction tuning samples are further refined and diversified. We have employed this method to build a large-scale cross-lingual instruction tuning dataset covering 10 languages, namely X-Instruction. The instruction data built using our method incorporates more language-specific knowledge than the naive translation method. Experimental results show that the response quality of the model tuned on X-Instruction greatly exceeds that of a model distilled from a powerful teacher model, reaching or even surpassing ChatGPT. In addition, we find that models tuned on cross-lingual instruction-following samples can follow the instruction in the output language without further tuning.
- 2024.findings-acl.30
+ 2024.findings-acl.30
li-etal-2024-x
10.18653/v1/2024.findings-acl.30
@@ -6353,7 +6353,7 @@
Jing Li, The Hong Kong Polytechnic University
567-585
Emotional support conversation systems are designed to alleviate users’ emotional distress and assist them in overcoming their challenges. While previous studies have made progress, their models occasionally generate unhelpful responses, which are intended to be supportive but instead have counterproductive effects. Since unhelpful responses can hinder the effectiveness of emotional support, it is crucial to mitigate them within conversations. Our solution is motivated by two principal considerations: (1) multiple facets of emotional support are expected to be considered when developing emotional support conversation models, and (2) directly reducing the probability of generating unhelpful responses can effectively mitigate their occurrence. Accordingly, we introduce a novel model-agnostic framework named Mitigating unhelpfulness with multifaceted AI feedback for emotional support (Muffin). It first employs a multifaceted AI feedback module designed to assess the helpfulness of model responses across various facets of emotional support. Leveraging contrastive learning, Muffin then reduces the likelihood of unhelpful responses. To validate the effectiveness of our proposed framework, we apply Muffin to various previous emotional support generation models, including the state-of-the-art. Experimental results demonstrate that Muffin can significantly mitigate unhelpful response generation while enhancing response fluency and relevance.
- 2024.findings-acl.31
+ 2024.findings-acl.31
wang-etal-2024-muffin
10.18653/v1/2024.findings-acl.31
@@ -6366,7 +6366,7 @@
Bang Liu, University of Montreal
586-598
This paper addresses the challenge of train-short-test-long (TSTL) scenarios in Large Language Models (LLMs) equipped with Rotary Position Embedding (RoPE), where models pre-trained on shorter sequences face difficulty with out-of-distribution (OOD) token positions in longer sequences. We introduce Resonance RoPE, a novel approach designed to narrow the generalization gap in TSTL scenarios by refining the interpolation of RoPE features for OOD positions, significantly improving model performance without additional online computational costs. Furthermore, we present PosGen, a new synthetic benchmark specifically designed for fine-grained behavior analysis in TSTL scenarios, aiming to isolate the constantly increasing difficulty of token generation on long contexts from the challenges of recognizing new token positions. Our experiments on synthetic tasks show that after applying Resonance RoPE, Transformers recognize OOD positions better and more robustly. Our extensive LLM experiments also show superior performance after applying Resonance RoPE to the current state-of-the-art RoPE scaling method, YaRN, on both upstream language modeling tasks and a variety of downstream long-text applications.
- 2024.findings-acl.32
+ 2024.findings-acl.32
wang-etal-2024-resonance
10.18653/v1/2024.findings-acl.32
@@ -6383,7 +6383,7 @@
Mark Gerstein, Yale University
599-621
Large language models (LLMs), despite their remarkable progress across various general domains, encounter significant barriers in medicine and healthcare. This field faces unique challenges such as domain-specific terminologies and reasoning over specialized knowledge. To address these issues, we propose MedAgents, a novel multi-disciplinary collaboration framework for the medical domain. MedAgents leverages LLM-based agents in a role-playing setting that participate in a collaborative multi-round discussion, thereby enhancing LLM proficiency and reasoning capabilities. This training-free framework encompasses five critical steps: gathering domain experts, proposing individual analyses, summarising these analyses into a report, iterating over discussions until a consensus is reached, and ultimately making a decision. Our work focuses on the zero-shot setting, which is applicable in real-world scenarios. Experimental results on nine datasets (MedQA, MedMCQA, PubMedQA, and six subtasks from MMLU) establish that our proposed MedAgents framework excels at mining and harnessing the medical expertise within LLMs, as well as extending its reasoning abilities. Our code can be found at https://github.com/gersteinlab/MedAgents.
- 2024.findings-acl.33
+ 2024.findings-acl.33
tang-etal-2024-medagents
10.18653/v1/2024.findings-acl.33
@@ -6396,7 +6396,7 @@
Rui Wang, Shanghai Jiao Tong University
622-643
Neural-symbolic methods have demonstrated efficiency in enhancing the reasoning abilities of large language models (LLMs). However, existing methods mainly rely on syntactically mapping natural languages to complete formal languages like Python and SQL. Those methods require that reasoning tasks be convertible into programs, which cater to the computer execution mindset and deviate from human reasoning habits. To broaden symbolic methods’ applicability and adaptability in the real world, we propose Meta-Reasoning from a linguistic perspective. This method empowers LLMs to deconstruct reasoning-independent semantic information into generic symbolic representations, thereby efficiently capturing more generalized reasoning knowledge. We conduct extensive experiments on more than ten datasets encompassing conventional reasoning tasks like arithmetic, symbolic, and logical reasoning, and the more complex interactive reasoning tasks like theory-of-mind reasoning. Experimental results demonstrate that Meta-Reasoning significantly enhances in-context reasoning accuracy, learning efficiency, out-of-domain generalization, and output stability compared to the Chain-of-Thought technique.
- 2024.findings-acl.34
+ 2024.findings-acl.34
wang-etal-2024-meta
10.18653/v1/2024.findings-acl.34
@@ -6413,7 +6413,7 @@
Kam-Fai Wong, The Chinese University of Hong Kong
644-653
The success of large language models (LLMs) benefits from large-scale model parameters and large amounts of pre-training data. However, the textual data used to train an LLM cannot be confirmed to be legal, because it is crawled from different web sites; for example, the pre-training data may contain copyrighted articles as well as personal reviews and information. To address this issue and develop legal LLMs, we propose to detect the pre-training data of an LLM in a purely black-box way, since existing LLM services only return generated text. The most closely related prior work is the membership inference attack (MIA) on machine learning models, which detects their training data; however, existing MIA methods rely on analyzing the output probabilities of models, which is unrealistic for LLM services. To tackle the problem, we first construct benchmark datasets by collecting textual data from different domains as the seen and unseen pre-training data for LLMs. Then, we investigate a black-box framework named DPDLLM, which detects whether textual data was used to train an LLM with access only to its generated text. In the proposed framework, we exploit GPT-2 as a reference model fitted to the textual data and feed the text generated by the LLM into it to acquire sequence probabilities as the key feature for detection. Experimental results on the benchmark datasets demonstrate that DPDLLM is effective on different popular LLMs and outperforms existing methods.
- 2024.findings-acl.35
+ 2024.findings-acl.35
zhou-etal-2024-dpdllm
10.18653/v1/2024.findings-acl.35
@@ -6426,7 +6426,7 @@
Guanhua Chen, Southern University of Science and Technology
654-665
Instruction tuning enhances the instruction-following ability of large language models by finetuning them with supervised instruction data. Previous work proposes in-context instruction tuning (ICIT), where specific positive or negative examples are incorporated into the prompt for better performance. In this work, we propose PACIT, a simple and effective in-context instruction tuning method inspired by the pedagogical concept of desirable difficulty. The PACIT method unlocks the power of examples by encouraging the model to actively learn the distinctions between positive and negative examples instead of merely reading them. The model is expected to first verify the correctness of the provided example according to the task description, and this verification is then set as the condition for generating a better response to the task instance. Our extensive experiments prove the effectiveness of PACIT, which outperforms the ICIT baseline on both in-domain and out-of-domain tasks by up to 9.16 and 3.14 average ROUGE-L points, respectively. Moreover, PACIT can notably enhance the performance of instruction tuning even when all positive and negative examples are generated with a self-instruct method.
- 2024.findings-acl.36
+ 2024.findings-acl.36
xue-etal-2024-pacit
10.18653/v1/2024.findings-acl.36
@@ -6441,7 +6441,7 @@
Ruizhe Li, University of Aberdeen
666-679
Recent advances in large language models (LLMs) have promoted generative error correction (GER) for automatic speech recognition (ASR), which aims to predict the ground-truth transcription from the decoded N-best hypotheses. Thanks to the strong language generation ability of LLMs and the rich information in the N-best list, GER shows great effectiveness in enhancing ASR results. However, it still suffers from two limitations: 1) LLMs are unaware of the source speech during GER, which may lead to results that are grammatically correct but violate the source speech content; 2) N-best hypotheses usually only vary in a few tokens, making it redundant to send all of them for GER, which could confuse the LLM about which tokens to focus on and thus lead to increased miscorrection. In this paper, we propose ClozeGER, a new paradigm for ASR generative error correction. First, we introduce a multimodal LLM (i.e., SpeechGPT) to receive source speech as extra input to improve the fidelity of the correction output. Then, we reformat GER as a cloze test with logits calibration to remove the input information redundancy and simplify GER with clear instructions. Experiments show that ClozeGER achieves a new breakthrough over vanilla GER on 9 popular ASR datasets.
- 2024.findings-acl.37
+ 2024.findings-acl.37
hu-etal-2024-listen
10.18653/v1/2024.findings-acl.37
@@ -6455,7 +6455,7 @@
Jinsong Su, Xiamen University
680-691
Cross-document Relation Extraction aims to predict the relation between target entities located in different documents. In this regard, the dominant models commonly retain useful information for relation prediction via bridge entities, which allows the model to elaborately capture the intrinsic interdependence between target entities. However, these studies ignore the non-bridge entities, each of which co-occurs with only one target entity and offers the semantic association between target entities for relation prediction. Besides, the commonly used dataset, CodRED, contains substantial NA instances, leading to prediction bias during inference. To address these issues, in this paper, we propose a novel graph-based cross-document RE model with non-bridge entity enhancement and prediction debiasing. Specifically, we use a unified entity graph to integrate numerous non-bridge entities with target entities and bridge entities, modeling various associations between them, and then use a graph recurrent network to encode this graph. Finally, we introduce a novel debiasing strategy to calibrate the original prediction distribution. Experimental results on the closed and open settings show that our model significantly outperforms all baselines, including GPT-3.5-turbo and InstructUIE, achieving state-of-the-art performance. Particularly, our model obtains 66.23% and 55.87% AUC points on the official leaderboard under the two settings, respectively, ranking first among all submissions since December 2023. Our code is available at https://github.com/DeepLearnXMU/CoRE-NEPD.
- 2024.findings-acl.38
+ 2024.findings-acl.38
yue-etal-2024-towards
10.18653/v1/2024.findings-acl.38
@@ -6468,7 +6468,7 @@
Ho-Jin Choi, Korea Advanced Institute of Science & Technology
692-713
This paper explores the image-sharing capability of Large Language Models (LLMs), such as GPT-4 and LLaMA 2, in a zero-shot setting. To facilitate a comprehensive evaluation of LLMs, we introduce the PhotoChat++ dataset, which includes enriched annotations (i.e., intent, triggering sentence, image description, and salient information). Furthermore, we present the gradient-free and extensible Decide, Describe, and Retrieve (DribeR) framework. With extensive experiments, we unlock the image-sharing capability of DribeR equipped with LLMs in zero-shot prompting, with ChatGPT achieving the best performance. Our findings also reveal the emergent image-sharing ability in LLMs under zero-shot conditions, validating the effectiveness of DribeR. We use this framework to demonstrate its practicality and effectiveness in two real-world scenarios: (1) human-bot interaction and (2) dataset augmentation. To the best of our knowledge, this is the first study to assess the image-sharing ability of various LLMs in a zero-shot setting. We make our source code and dataset publicly available at https://github.com/passing2961/DribeR.
- 2024.findings-acl.39
+ 2024.findings-acl.39
lee-etal-2024-large
10.18653/v1/2024.findings-acl.39
@@ -6490,7 +6490,7 @@
Lizhen Cui, Shandong University
714-729
In the era of code large language models (code LLMs), data engineering plays a pivotal role during the instruction fine-tuning phase. To train a versatile model, previous work devotes tremendous effort to crafting instruction data that covers all downstream scenarios. Nonetheless, this incurs significant expenses in constructing data and training the model. Therefore, this paper introduces CodeM, a novel data construction strategy that can efficiently train a versatile model using less data via our newly proposed ability matrix. CodeM uses the ability matrix to decouple code LLMs’ abilities into two dimensions, constructing a lightweight training corpus that only covers a subset of target scenarios. Extensive experiments on HumanEvalPack and MultiPL-E imply that code LLMs can combine single-dimensional abilities to master composed abilities, validating the effectiveness of CodeM.
- 2024.findings-acl.40
+ 2024.findings-acl.40
zan-etal-2024-codem
10.18653/v1/2024.findings-acl.40
@@ -6506,7 +6506,7 @@
Heng Ji, University of Illinois, Urbana-Champaign
730-749
Advances in large vision-language models (LVLMs) have led to significant progress in generating natural language descriptions for visual contents. These powerful models are known for producing texts that are factually inconsistent with the visual input. While some efforts mitigate such inconsistencies in natural image captioning, the factuality of generated captions for structured visuals, such as charts, has not received as much scrutiny. This work introduces a comprehensive typology of factual errors in generated chart captions. A large-scale human annotation effort provides insight into the error patterns in captions generated by various models, ultimately forming the foundation of a dataset, CHOCOLATE. Our analysis reveals that even advanced models like GPT-4V frequently produce captions laced with factual inaccuracies. To combat this, we establish the task of Chart Caption Factual Error Correction and introduce CHARTVE, a visual entailment model that outperforms current LVLMs in evaluating caption factuality. Furthermore, we propose C2TFEC, an interpretable two-stage framework that excels at correcting factual errors. This work inaugurates a new domain in factual error correction for chart captions, presenting a novel evaluation metric, and demonstrating an effective approach to ensuring the factuality of generated chart captions. The code and data as well as the continuously updated benchmark can be found at: https://khuangaf.github.io/CHOCOLATE/.
- 2024.findings-acl.41
+ 2024.findings-acl.41
huang-etal-2024-lvlms
10.18653/v1/2024.findings-acl.41
@@ -6519,7 +6519,7 @@
Zhicheng Dou, Renmin University of China
750-761
Retrieval-augmented large language models (LLMs) have demonstrated efficacy in knowledge-intensive tasks such as open-domain QA, addressing inherent challenges in knowledge update and factual inadequacy. However, inconsistencies between the retrieved knowledge and the knowledge the LLM actually needs lead to a decline in the LLM’s answer quality. This paper introduces BIDER, an approach that refines retrieval documents into Key Supporting Evidence (KSE) through knowledge synthesis, supervised fine-tuning (SFT), and preference alignment. We train BIDER by learning to craft KSE, and further maximize the alignment of its output with the LLM’s information acquisition preferences through reinforcement learning. Evaluations across five datasets show that BIDER boosts LLMs’ answer quality by 7% while reducing the length of the input retrieval documents by 80%, outperforming existing methods. The proposed KSE simulation effectively equips LLMs with essential information for accurate question answering.
- 2024.findings-acl.42
+ 2024.findings-acl.42
jin-etal-2024-bider
10.18653/v1/2024.findings-acl.42
@@ -6534,7 +6534,7 @@
Jing Liu, Institute of Automation, Chinese Academy of Sciences
762-776
Visual grounding (VG) aims at locating the foreground entities that match a given natural language expression. Previous datasets and methods for the classic VG task mainly rely on the prior assumption that the given expression literally refers to the target object, which greatly impedes the practical deployment of agents in real-world scenarios. Since users usually prefer to provide intention-based expressions for the desired object instead of covering all the details, it is necessary for agents to interpret intention-driven instructions. Thus, in this work, we take a step further toward intention-driven visual-language (V-L) understanding. To promote classic VG towards human intention interpretation, we propose a new intention-driven visual grounding (IVG) task and build the largest-scale IVG dataset, named IntentionVG, with free-form intention expressions. Considering that practical agents need to move around and find specific targets among various scenarios to accomplish the grounding task, our IVG task and IntentionVG dataset take the crucial properties of both multi-scenario perception and egocentric view into consideration. Besides, various types of models are set up as baselines to realize our IVG task. Extensive experiments on our IntentionVG dataset and baselines demonstrate the necessity and efficacy of our method for the V-L field. To foster future research in this direction, our newly built dataset and baselines will be made publicly available at https://github.com/Rubics-Xuan/IVG.
- 2024.findings-acl.43
+ 2024.findings-acl.43
wang-etal-2024-beyond-literal
10.18653/v1/2024.findings-acl.43
@@ -6548,7 +6548,7 @@
Qianli Ma, South China University of Technology
777-791
The incremental sequence labeling task involves continuously learning new classes over time while retaining knowledge of the previous ones. Our investigation identifies two significant semantic shifts: E2O (where the model mislabels an old entity as a non-entity) and O2E (where the model labels a non-entity or old entity as a new entity). Previous research has predominantly focused on addressing the E2O problem, neglecting the O2E issue. This negligence results in a model bias towards classifying new data samples as belonging to the new class during the learning process. To address these challenges, we propose a novel framework, Incremental Sequential Labeling without Semantic Shifts (IS3). Motivated by the identified semantic shifts (E2O and O2E), IS3 aims to mitigate catastrophic forgetting in models. As for the E2O problem, we use knowledge distillation to maintain the model’s discriminative ability for old entities. Simultaneously, to tackle the O2E problem, we alleviate the model’s bias towards new entities through a debiased loss and optimization levels. Our experimental evaluation, conducted on three datasets with various incremental settings, demonstrates the superior performance of IS3 compared to the previous state-of-the-art method by a significant margin.
- 2024.findings-acl.44
+ 2024.findings-acl.44
qiu-etal-2024-incremental
10.18653/v1/2024.findings-acl.44
@@ -6565,7 +6565,7 @@
Juanzi Li
792-815
Knowledge Base Question Answering (KBQA) aims to answer natural language questions based on facts in knowledge bases. A typical approach to KBQA is semantic parsing, which translates a question into an executable logical form in a formal language. Recent works leverage the capabilities of large language models (LLMs) for logical form generation to improve performance. However, although it has been validated that LLMs are capable of solving some KBQA problems, there has been little discussion of the differences in LLMs’ proficiency in the formal languages used in semantic parsing. In this work, we propose to evaluate the ability of LLMs to understand and generate differently structured logical forms by examining the inter-conversion of natural and formal language through in-context learning. Extensive experiments with models of different sizes show that state-of-the-art LLMs can understand formal languages as well as humans, but generating correct logical forms given a few examples remains a challenge. Most importantly, our results also indicate that LLMs exhibit considerable sensitivity to the formal language itself. In general, a formal language with a lower formalization level, i.e., one more similar to natural language, is more friendly to LLMs. Code and data can be found at https://github.com/Matthewlliu/structure_probe.
- 2024.findings-acl.45
+ 2024.findings-acl.45
liu-etal-2024-proficient
10.18653/v1/2024.findings-acl.45
@@ -6580,7 +6580,7 @@
Xiaojie Yuan, Nankai University
816-826
Multimodal entity linking (MEL), which aligns ambiguous mentions within multimodal contexts to referent entities from multimodal knowledge bases, is essential for many natural language processing applications. Previous MEL methods mainly focus on exploring complex multimodal interaction mechanisms to better capture coherence evidence between mentions and entities by mining complementary information. However, in real-world social media scenarios, the vision modality often exhibits low quality, low value, or low relevance to the mention. Integrating such information directly will backfire, leading to weakened consistency between mentions and their corresponding entities. In this paper, we propose a novel latent space vision feature optimization framework, MELOV, which combines inter-modality and intra-modality optimizations to address these challenges. For the inter-modality optimization, we exploit a variational autoencoder to mine shared information and generate text-based visual features. For the intra-modality optimization, we consider the relationships between mentions and build a graph convolutional network to aggregate the visual features of semantically similar neighbors. Extensive experiments on three benchmark datasets demonstrate the superiority of our proposed framework.
- 2024.findings-acl.46
+ 2024.findings-acl.46
sui-etal-2024-melov
10.18653/v1/2024.findings-acl.46
@@ -6591,7 +6591,7 @@
Yunfang Wu
827-838
Within the context of reading comprehension, the task of Distractor Generation (DG) aims to generate several incorrect options to confuse readers. In recent years, the emergence of Large Language Models (LLMs) has offered the potential for unsupervised DG without expensive human-annotated distractor labels. In this paper, we leverage LLMs as a cost-effective annotator to enhance the DG capability of smaller student models. To perform knowledge distillation, we propose a dual-task training framework that integrates pseudo distractors from LLMs and answer information as the objective target with a two-stage training process. Moreover, we devise a counterfactual contrastive decoding mechanism for increasing the distracting capability of the DG model. Experiments show that our unsupervised generation method with Bart-base greatly surpasses GPT-3.5-turbo zero-shot performance with only 200× fewer model parameters. Our proposed unsupervised DG method offers a cost-effective framework for practical reading comprehension applications, without the need for laborious distractor annotation and costly large-size models.
- 2024.findings-acl.47
+ 2024.findings-acl.47
qu-etal-2024-unsupervised
10.18653/v1/2024.findings-acl.47
@@ -6605,7 +6605,7 @@
Hanghang Tong
839-850
Conversational question answering (ConvQA) over knowledge graphs (KGs) involves answering multi-turn natural language questions about information contained in a KG. State-of-the-art methods of ConvQA often struggle with inexplicit question-answer pairs. These inputs are easy for human beings to understand given a conversation history, but hard for a machine to interpret, which can degrade ConvQA performance. To address this problem, we propose a reinforcement learning (RL) based model, CoRnNet, which utilizes question reformulations generated by large language models (LLMs) to improve ConvQA performance. CoRnNet adopts a teacher-student architecture where a teacher model learns question representations from human-written reformulations, and a student model learns to mimic the teacher model’s output using reformulations generated by LLMs. The learned question representation is then used by an RL model to locate the correct answer in a KG. Extensive experimental results show that CoRnNet outperforms state-of-the-art ConvQA models.
- 2024.findings-acl.48
+ 2024.findings-acl.48
liu-etal-2024-conversational
10.18653/v1/2024.findings-acl.48
@@ -6616,7 +6616,7 @@
Jingbo Shang, University of California, San Diego
851-870
Large language models (LLMs) are leading significant progress in code generation. Beyond one-pass code generation, recent works further integrate unit tests and program verifiers into LLMs to iteratively refine the generated programs. However, these works consider the generated programs as an indivisible entity, which falls short for LLMs in debugging the programs, especially when the programs contain complex logic flows and data operations. In contrast, when human developers debug programs, they typically set breakpoints and selectively examine runtime execution information. The execution flow and the intermediate variables play a crucial role in the debugging process, yet they are underutilized in the existing literature on code generation. In this study, we introduce Large Language Model Debugger (LDB), a novel debugging framework that enables LLMs to refine their generated programs with the runtime execution information. Specifically, LDB segments the programs into basic blocks and tracks the values of intermediate variables after each block throughout the runtime execution. This allows LLMs to concentrate on simpler code units within the overall execution flow, verify their correctness against the task description block by block, and efficiently pinpoint any potential errors. Experiments demonstrate that LDB consistently enhances the baseline performance by up to 9.8% across the HumanEval, MBPP, and TransCoder benchmarks, achieving new state-of-the-art performance in code debugging for various LLM selections.
- 2024.findings-acl.49
+ 2024.findings-acl.49
zhong-etal-2024-debug
10.18653/v1/2024.findings-acl.49
@@ -6630,7 +6630,7 @@
Jun Xu, Renmin University of China
871-877
In-context learning has been extensively validated in large language models. However, the mechanism and selection strategy for in-context examples, a crucial ingredient in this approach, lack systematic and in-depth research. In this paper, we propose a data compression approach to the selection of in-context examples. We introduce a two-stage method that can effectively choose relevant examples and retain sufficient information about the training dataset within the in-context examples. Our method shows a significant improvement of an average of 5.90% across five different real-world datasets using four language models.
- 2024.findings-acl.50
+ 2024.findings-acl.50
sun-etal-2024-effective
10.18653/v1/2024.findings-acl.50
@@ -6648,7 +6648,7 @@
Aimin Zhou, East China Normal University
878-890
Although large language models (LLMs) acquire extensive world knowledge and some reasoning abilities, their proficiency in generating humorous sentences remains a challenge. Previous research has demonstrated that the humor generation capabilities of ChatGPT are confined to producing merely 25 unique jokes. In this work, we concentrate on endowing LLMs with the ability to generate puns, a particular category of humor, via a preference learning method. We propose a multi-stage curriculum preference learning framework to optimize both pun structure preferences and humor preferences. Specifically, we improve the Direct Preference Optimization (DPO) algorithm to address the multi-objective alignment problem. Besides, to facilitate further advancement in this field, we collect a Chinese Pun (ChinesePun) dataset containing 2.1k puns and corresponding annotations. Experimental results on both Chinese and English benchmark datasets demonstrate that our method significantly outperforms all the baseline models.
- 2024.findings-acl.51
+ 2024.findings-acl.51
chen-etal-2024-u
10.18653/v1/2024.findings-acl.51
@@ -6663,7 +6663,7 @@
Huajun Chen, Zhejiang University
891-904
Deploying large language models (LLMs) in real scenarios for domain-specific question answering (QA) is a key thrust for LLM applications, and it poses numerous challenges, especially in ensuring that responses both accommodate user requirements and appropriately leverage domain-specific knowledge bases. These are two major difficulties for LLM application that vanilla fine-tuning falls short of addressing. Combining these requirements, we conceive of them as the requirement for the model’s preference to be harmoniously aligned with humans’. Thus, we introduce Knowledgeable Preference AlignmenT (KnowPAT), which constructs two kinds of preference sets to tackle the two issues. Besides, we design a new alignment objective to align the LLM preference with different human preferences uniformly, aiming to optimize LLM performance in real-world, domain-specific QA settings. Adequate experiments and comprehensive comparisons with 15 baseline methods illustrate that our KnowPAT is a superior pipeline for real-scenario domain-specific QA with LLMs.
- 2024.findings-acl.52
+ 2024.findings-acl.52
zhang-etal-2024-knowledgeable
10.18653/v1/2024.findings-acl.52
@@ -6677,7 +6677,7 @@
Kai Fan, Alibaba Group
905-924
Large language models (LLMs) have significantly improved in understanding natural language but still fall short in mathematical reasoning, a hurdle on the path to true artificial general intelligence. The training of large language models, based on next-token prediction, struggles to capture the precise nature of mathematical reasoning, presenting both practical and theoretical challenges. In this paper, we address this challenge by enriching the data landscape and introducing a reasonable data format that enhances the LLM’s text analysis with the capability to utilize a Python code interpreter. This dataset is derived from GSM8K and MATH and has been further refined through a combination of GPT annotations, human review, and self-training processes. Additionally, we propose a tentative, easily replicable protocol for the fine-tuning of math-specific LLMs, which has led to a significant improvement in the performance of a 7B-parameter LLM on the GSM8K and MATH datasets. A solution generator and a value estimator are fine-tuned simultaneously in a multi-task fashion, while an outlier-free value model-based inference method is proposed to further boost performance. We are committed to advancing the field of mathematical reasoning in LLMs and, to that end, we will make the source code and checkpoints publicly available.
- 2024.findings-acl.53
+ 2024.findings-acl.53
liao-etal-2024-mario
10.18653/v1/2024.findings-acl.53
@@ -6687,7 +6687,7 @@
Shuangyin Li
925-935
Online social media platforms often gather user feedback through polls to enhance user engagement. Automatically generating polls from social media posts and their context can decrease the labor expenses of media workers and enhance workplace productivity. However, on social media platforms, there are internet water armies that manipulate public opinion through sheer numbers, causing the comments to be biased and drowning out minority views. In such circumstances, polls created from biased comments often have limited types of options and poor coverage. Therefore, it is crucial to diversify the poll options and try to listen to the voices of the minority. To achieve this, we introduce DiffusPoll, a novel paradigm for poll generation based on a non-autoregressive diffusion model that can generate diversified and high-quality samples. Under the new paradigm, we design a task-specific mask strategy tailored to the inherent logic of polls to optimize controlled generation. Furthermore, we also leverage additional attribute tags from comments to enhance the generation quality. Experimental results indicate that DiffusPoll achieves state-of-the-art performance in both the quality and diversity of poll generation, and is more likely to capture the voices of the minority.
- 2024.findings-acl.54
+ 2024.findings-acl.54
cheng-li-2024-diffuspoll
10.18653/v1/2024.findings-acl.54
@@ -6701,7 +6701,7 @@
Jie Chen
936-946
While large language models (LLMs) have shown excellent capabilities in language understanding, text generation, and many other tasks, they still struggle with complex multi-step reasoning problems such as mathematical reasoning. In this paper, through a newly proposed arithmetical puzzle problem, we show that a model can perform well on multi-step reasoning tasks via fine-tuning on high-quality synthetic data. Experiments with the open-llama-3B model on three different test datasets show that not only can the model reach a zero-shot pass@1 of 0.44 on the in-domain dataset, it also demonstrates certain generalization capabilities on the out-of-domain datasets. Specifically, this paper designs two out-of-domain datasets by separately extending the numerical range and the composing components of the arithmetical puzzle problem. The fine-tuned models show encouraging performance on these two far more difficult tasks, with zero-shot pass@1 of 0.33 and 0.35, respectively.
- 2024.findings-acl.55
+ 2024.findings-acl.55
li-etal-2024-exploring-mathematical
10.18653/v1/2024.findings-acl.55
@@ -6712,7 +6712,7 @@
Tieyun Qian, Wuhan University
947-962
Toxicity detection plays a crucial role in maintaining the peace of society. Existing methods can be roughly categorized as small language model (SLM) based and large language model (LLM) based. However, due to the limitation of SLMs in general knowledge and the potential bias embedded in LLMs despite their large amount of knowledge, it is not a good idea to detect toxicity with either an SLM-based or LLM-based method alone. In this work, we propose to implant the LLM’s knowledge into SLM-based methods so that we can retain the strengths of both types of models. To this end, we develop a reading comprehension (RC) tree to transfer knowledge between the two models. Specifically, we first construct the RC tree, from an extensive to an intensive reading perspective, to capture the local and global information in the text. We then model samples encoded by the SLM and knowledge extracted from the LLM as two distributions using the constructed RC tree. We finally transfer knowledge via optimal transport between the two distributions. Extensive experiments prove the effectiveness of our method on real-world and machine-generated datasets.
- 2024.findings-acl.56
+ 2024.findings-acl.56
kang-qian-2024-implanting
10.18653/v1/2024.findings-acl.56
@@ -6733,7 +6733,7 @@
Dongmei Zhang, Microsoft
963-981
This paper focuses on task-agnostic prompt compression for better generalizability and efficiency. Considering the redundancy in natural language, existing approaches compress prompts by removing tokens or lexical units according to their information entropy obtained from a causal language model such as LLaMa-7B. The challenge is that information entropy may be a suboptimal compression metric: (i) it only leverages unidirectional context and may fail to capture all essential information needed for prompt compression; (ii) it is not aligned with the prompt compression objective. To address these issues, we propose a data distillation procedure to derive knowledge from an LLM to compress prompts without losing crucial information, and, in the meantime, introduce an extractive text compression dataset. We formulate prompt compression as a token classification problem to guarantee the faithfulness of the compressed prompt to the original one, and use a Transformer encoder as the base architecture to capture all essential information for prompt compression from the full bidirectional context. Our approach leads to lower latency by explicitly learning the compression objective with smaller models such as XLM-RoBERTa-large and mBERT. We evaluate our method on both in-domain and out-of-domain datasets, including MeetingBank, LongBench, ZeroScrolls, GSM8K, and BBH. Despite its small size, our model shows significant performance gains over strong baselines and demonstrates robust generalization ability across different LLMs. Additionally, our model is 3x-6x faster than existing prompt compression methods, while accelerating end-to-end latency by 1.6x-2.9x with compression ratios of 2x-5x.
- 2024.findings-acl.57
+ 2024.findings-acl.57
pan-etal-2024-llmlingua
10.18653/v1/2024.findings-acl.57
@@ -6744,7 +6744,7 @@
Yi Yang, Hong Kong University of Science and Technology
982-994
Large Language Models (LLMs) are widely used for writing economic analysis reports or providing financial advice, but their ability to understand economic knowledge and reason about potential results of specific economic events lacks systematic evaluation. To address this gap, we propose a new dataset, natural language inference on economic events (EconNLI), to evaluate LLMs’ knowledge and reasoning abilities in the economic domain. We evaluate LLMs on (1) their ability to correctly classify whether a premise event will cause a hypothesis event and (2) their ability to generate reasonable events resulting from a given premise. Our experiments reveal that LLMs are not sophisticated in economic reasoning and may generate wrong or hallucinated answers. Our study raises awareness of the limitations of using LLMs for critical decision-making involving economic reasoning and analysis. The dataset and codes are available at https://github.com/Irenehere/EconNLI.
- 2024.findings-acl.58
+ 2024.findings-acl.58
guo-yang-2024-econnli
10.18653/v1/2024.findings-acl.58
@@ -6757,7 +6757,7 @@
Hui Zhao, East China Normal University
995-1011
Clinical text summarization has proven successful in generating concise and coherent summaries. However, these summaries may include unintended text with hallucinations, which can mislead clinicians and patients. Existing methods for mitigating hallucinations can be categorized into task-specific and task-agnostic approaches. Task-specific methods lack versatility for real-world applicability. Meanwhile, task-agnostic methods are not model-agnostic, so they require retraining for different models, resulting in considerable computational costs. To address these challenges, we propose MEDAL, a model-agnostic framework designed to post-process medical hallucinations. MEDAL can seamlessly integrate with any medical summarization model, requiring no additional computational overhead. MEDAL comprises a medical infilling model and a hallucination correction model. The infilling model generates non-factual summaries with common errors to train the correction model. The correction model incorporates a self-examination mechanism to activate its cognitive capability. We conduct comprehensive experiments using 11 widely accepted metrics on 7 baseline models across 3 medical text summarization tasks. MEDAL demonstrates superior performance in correcting hallucinations when applied to summaries generated by pre-trained language models and large language models.
- 2024.findings-acl.59
+ 2024.findings-acl.59
li-etal-2024-better
10.18653/v1/2024.findings-acl.59
@@ -6771,7 +6771,7 @@
Meng Wang, Hefei University of Technology
1012-1037
Understanding the internal mechanisms by which multi-modal large language models (LLMs) interpret different modalities and integrate cross-modal representations is becoming increasingly critical for continuous improvements in both academia and industry. In this paper, we propose a novel method to identify key neurons for interpretability — how multi-modal LLMs bridge visual and textual concepts for captioning. Our method improves upon conventional works in efficiency and applicable range by removing the need for costly gradient computation. Based on the identified neurons, we further design a multi-modal knowledge editing method that helps mitigate sensitive words or hallucination. We provide a theoretical assumption as the rationale for our design. For empirical evaluation, we have conducted extensive quantitative and qualitative experiments. The results not only validate the effectiveness of our methods, but also offer insightful findings highlighting three key properties of multi-modal neurons: sensitivity, specificity, and causal effect, which shed light on future research.
- 2024.findings-acl.60
+ 2024.findings-acl.60
pan-etal-2024-finding
10.18653/v1/2024.findings-acl.60
@@ -6784,7 +6784,7 @@
Thien Nguyen, University of Oregon
1038-1047
Large language models (LLMs) have become integral to our professional workflows and daily lives. Nevertheless, these machine companions of ours have a critical flaw: the huge amount of data that endows them with vast and diverse knowledge also exposes them to inevitable toxicity and bias. While most LLMs incorporate defense mechanisms to prevent the generation of harmful content, these safeguards can be easily bypassed with minimal prompt engineering. In this paper, we introduce the new Thoroughly Engineered Toxicity (TET) dataset, comprising manually crafted prompts designed to nullify the protective layers of such models. Through extensive evaluations, we demonstrate the pivotal role of TET in providing a rigorous benchmark for the evaluation of toxicity awareness in several popular LLMs: it highlights toxicity in LLMs that might remain hidden when using normal prompts, thus revealing subtler issues in their behavior.
- 2024.findings-acl.61
+ 2024.findings-acl.61
luong-etal-2024-realistic
10.18653/v1/2024.findings-acl.61
@@ -6796,7 +6796,7 @@
Dawei Song, Beijing Institute of Technology and Open University
1048-1066
Large-scale Causal Language Models (CLMs), e.g., GPT-3 and ChatGPT, have brought great success in text generation. However, it is still an open challenge to effectively control the generation process of a CLM while balancing flexibility, control granularity, and generation efficiency. In this paper, we provide a new alternative for controllable text generation (CTG) by designing a non-intrusive, lightweight control plugin, namely the Residual Memory Transformer (RMT), to accompany the generation of the CLM at arbitrary time steps. With an encoder-decoder setup, RMT can accept any type of control condition and cooperate with the base CLM through a residual learning paradigm to achieve more flexible, general, and efficient CTG. Extensive experiments are carried out on various control tasks, in the form of both automatic and human evaluations. The results demonstrate the superiority of RMT over a wide range of state-of-the-art CTG approaches. The code implementation of our work is available at: https://github.com/Residual_Memory_Transformer.
- 2024.findings-acl.62
+ 2024.findings-acl.62
zhang-etal-2024-controllable
10.18653/v1/2024.findings-acl.62
@@ -6809,7 +6809,7 @@
Qun Liu, Huawei Noah’s Ark Lab
1067-1085
Large language models (LLMs) have attracted great attention given their strong performance on a wide range of NLP tasks. In practice, users often expect generated texts to fall within a specific length range, making length-controlled generation an important topic, especially for GPT-style models. Existing length control methods mostly focus on a simple control type of “equal to” a target length. Different from them, we propose a prompt-based method to achieve length-controlled generation under different control types with high accuracy. In particular, we adopt reinforcement learning (RL) and sample filtering with the reward signal given by rule-based reward models, which enhances the length control ability of models by rewarding outputs that follow certain control instructions. In addition, we introduce a standard prompt extractor to parse arbitrary users’ input into standard control instructions. Experiments show that our method significantly improves the accuracy of prompt-based length control on popular summarization datasets like CNNDM and NYT under multiple control types. Moreover, both the standard prompt extractor and the RL-tuned model show strong generalization to unseen control prompt templates.
- 2024.findings-acl.63
+ 2024.findings-acl.63
jie-etal-2024-prompt
10.18653/v1/2024.findings-acl.63
@@ -6828,7 +6828,7 @@
Baobao Chang, Peking University
1086-1104
We present PCA-Bench, a multimodal decision-making benchmark for evaluating the integrated capabilities of Multimodal Large Language Models (MLLMs). Departing from previous benchmarks focusing on simplistic tasks and individual model capability, PCA-Bench introduces three complex scenarios: autonomous driving, domestic robotics, and open-world games. Given task instructions and diverse contexts, the model is required to seamlessly integrate multiple capabilities of Perception, Cognition, and Action in a reasoning chain to make accurate decisions. Moreover, PCA-Bench features error localization capabilities, scrutinizing model inaccuracies in areas such as perception, knowledge, or reasoning. This enhances the reliability of deploying MLLMs. To balance accuracy and efficiency in evaluation, we propose PCA-Eval, an automatic evaluation protocol, and assess 10 prevalent MLLMs. The results reveal significant performance disparities between open-source models and powerful proprietary models like GPT-4 Vision. To address this, we introduce Embodied-Instruction-Evolution (EIE), an automatic framework for synthesizing instruction tuning examples in multimodal embodied environments. EIE generates 7,510 training examples in PCA-Bench and enhances the performance of open-source MLLMs, occasionally surpassing GPT-4 Vision (+3% in decision accuracy), thereby validating the effectiveness of EIE. Our findings suggest that robust MLLMs like GPT4-Vision show promise for decision-making in embodied agents, opening new avenues for MLLM research. All benchmark data and evaluation code are made public.
- 2024.findings-acl.64
+ 2024.findings-acl.64
chen-etal-2024-pca
10.18653/v1/2024.findings-acl.64
@@ -6844,7 +6844,7 @@
Dongha Lee, Yonsei University
1105-1120
Conversational recommender systems are an emerging area that has garnered increasing interest in the community, especially with the advancements in large language models (LLMs) that enable sophisticated handling of conversational input. Despite the progress, the field still has many aspects left to explore. The currently available public datasets for conversational recommendation lack specific user preferences and explanations for recommendations, hindering high-quality recommendations. To address such challenges, we present a novel conversational recommendation dataset named PEARL, synthesized with persona- and knowledge-augmented LLM simulators. We obtain detailed personas and knowledge from real-world reviews and construct a large-scale dataset with over 57k dialogues. Our experimental results demonstrate that PEARL contains more specific user preferences, shows expertise in the target domain, and provides recommendations more relevant to the dialogue context than those in prior datasets. Furthermore, we demonstrate the utility of PEARL by showing that our downstream models outperform baselines in both human and automatic evaluations. We release our dataset and code.
- 2024.findings-acl.65
+ 2024.findings-acl.65
kim-etal-2024-pearl
10.18653/v1/2024.findings-acl.65
@@ -6857,7 +6857,7 @@
Yong Man Ro, Korea Advanced Institute of Science and Technology
1121-1138
The remarkable success of Large Language Models (LLMs) and instruction tuning drives the evolution of Vision Language Models (VLMs) towards a versatile general-purpose model. Yet, it remains unexplored whether current VLMs genuinely possess quality object-level image understanding capabilities, as determined by questions such as ‘what objects are in the image?’ or ‘which object corresponds to a specified bounding box?’. Our findings reveal that the image understanding capabilities of current VLMs are strongly correlated with their zero-shot performance on vision language (VL) tasks. This suggests that prioritizing basic image understanding is crucial for VLMs to excel at VL tasks. To enhance object-level image understanding, we propose Crayon Large Language and Vision mOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a new visual prompt tuning scheme based on panoptic color maps. Furthermore, we present a learning strategy of Dual QLoRA to preserve object-level image understanding without forgetting it during visual instruction tuning, thereby achieving a significant leap in numerous VL benchmarks in a zero-shot setting.
- 2024.findings-acl.66
+ 2024.findings-acl.66
lee-etal-2024-collavo
10.18653/v1/2024.findings-acl.66
@@ -6870,7 +6870,7 @@
Phil Woodland, University of Cambridge
1139-1157
Human annotator simulation (HAS) serves as a cost-effective substitute for human evaluation tasks such as data annotation and system assessment. It is important to incorporate the variability present in human evaluation into HAS, since it helps capture diverse subjective interpretations and mitigate potential biases and over-representation. This work introduces a novel framework for modelling variability in HAS. Conditional softmax flow (S-CNF) is proposed to model the distribution of subjective human annotations, which leverages diverse human annotations via meta-learning. This enables efficient generation of annotations that exhibit human variability for unlabelled input. In addition, a wide range of evaluation metrics are adopted to assess the capability and efficiency of HAS systems in predicting the aggregated behaviours of human annotators, matching the distribution of human annotations, and simulating the inter-annotator disagreements. Results demonstrate that the proposed method achieves state-of-the-art performance on two real-world human evaluation tasks: emotion recognition and toxic speech detection.
- 2024.findings-acl.67
+ 2024.findings-acl.67
wu-etal-2024-modelling
10.18653/v1/2024.findings-acl.67
@@ -6885,7 +6885,7 @@
Alice Oh, Korea Advanced Institute of Science and Technology
1158-1177
In this study, we introduce BEnQA, a dataset comprising parallel Bengali and English exam questions for middle and high school levels in Bangladesh. Our dataset consists of approximately 5K questions covering several subjects in science with different types of questions, including factual, application, and reasoning-based questions. We benchmark several Large Language Models (LLMs) with our parallel dataset and observe a notable performance disparity between the models in Bengali and English. We also investigate several prompting methods and find that Chain-of-Thought prompting is beneficial mostly for reasoning questions, but not so much for factual ones. We also find that appending an English translation helps to answer questions in Bengali. Our findings point to promising future research directions for improving the performance of LLMs in Bengali and more generally in low-resource languages.
- 2024.findings-acl.68
+ 2024.findings-acl.68
shafayat-etal-2024-benqa
10.18653/v1/2024.findings-acl.68
@@ -6898,7 +6898,7 @@
Xueqi Cheng, Chinese Academy of Sciences
1178-1192
Since commonsense information has been recorded significantly less frequently than its existence, language models pre-trained by text generation have difficulty learning sufficient commonsense knowledge. Several studies have leveraged text retrieval to augment the models’ commonsense ability. Unlike text, images capture commonsense information inherently, but little attention has been paid to utilizing them effectively. In this work, we propose a novel Multi-mOdal REtrieval (MORE) augmentation framework to leverage both text and images to enhance the commonsense ability of language models. Extensive experiments on the Common-Gen task have demonstrated the efficacy of MORE based on the pre-trained models of both single and multiple modalities.
- 2024.findings-acl.69
+ 2024.findings-acl.69
cui-etal-2024-multi
10.18653/v1/2024.findings-acl.69
@@ -6916,7 +6916,7 @@
Jun Zhao, Institute of Automation, Chinese Academy of Sciences
1193-1215
Recently, retrieval augmentation and tool augmentation have demonstrated a remarkable capability to expand the internal memory boundaries of language models (LMs) by providing external context. However, internal memory and external context inevitably clash, leading to knowledge conflicts within LMs. In this paper, we aim to interpret the mechanism of knowledge conflicts through the lens of information flow, and then mitigate conflicts by precise interventions at the pivotal point. We find there are some attention heads with opposite effects in the later layers, where memory heads can recall knowledge from internal memory, and context heads can retrieve knowledge from external context. Moreover, we reveal that the pivotal point at which knowledge conflicts emerge in LMs is the integration of inconsistent information flows by memory heads and context heads. Inspired by the insights, we propose a novel method called Pruning Head via PatH PatcHing (PH3), which can efficiently mitigate knowledge conflicts by pruning conflicting attention heads without updating model parameters. PH3 can flexibly control eight LMs to use internal memory (↑44.0%) or external context (↑38.5%). Moreover, PH3 can also improve the performance of LMs on open-domain QA tasks. We also conduct extensive experiments to demonstrate the cross-model, cross-relation, and cross-format generalization of our method. Our code is publicly available at https://github.com/jinzhuoran/MConflict/.
- 2024.findings-acl.70
+ 2024.findings-acl.70
jin-etal-2024-cutting
10.18653/v1/2024.findings-acl.70
@@ -6933,7 +6933,7 @@
Rui Yan, Renmin University of China
1216-1240
Recent research trends in computational biology have increasingly focused on integrating text and bio-entity modeling, especially in the context of molecules and proteins. However, previous efforts like BioT5 faced challenges in generalizing across diverse tasks and lacked a nuanced understanding of molecular structures, particularly in their textual representations (e.g., IUPAC). This paper introduces BioT5+, an extension of the BioT5 framework, tailored to enhance biological research and drug discovery. BioT5+ incorporates several novel features: integration of IUPAC names for molecular understanding, inclusion of extensive bio-text and molecule data from sources like bioRxiv and PubChem, multi-task instruction tuning for generality across tasks, and a numerical tokenization technique for improved processing of numerical data. These enhancements allow BioT5+ to bridge the gap between molecular representations and their textual descriptions, providing a more holistic understanding of biological entities, and largely improving the grounded reasoning of bio-text and bio-sequences. The model is pre-trained and fine-tuned across a large number of experiments, covering 3 types of problems (classification, regression, generation), 15 kinds of tasks, and 21 benchmark datasets in total, demonstrating remarkable performance and state-of-the-art results in most cases. BioT5+ stands out for its ability to capture intricate relationships in biological data, thereby contributing significantly to bioinformatics and computational biology. Our code is available at https://github.com/QizhiPei/BioT5.
- 2024.findings-acl.71
+ 2024.findings-acl.71
pei-etal-2024-biot5
10.18653/v1/2024.findings-acl.71
@@ -6944,7 +6944,7 @@
Yuan Fang, Singapore Management University
1241-1257
Fine-tuning all parameters of large language models (LLMs) necessitates substantial computational power and extended time. Latest advancements in parameter-efficient fine-tuning (PEFT) techniques, such as Adapter tuning and LoRA, allow for adjustments to only a minor fraction of the parameters of these LLMs. Concurrently, it has been noted that the issue of over-smoothing diminishes the effectiveness of these Transformer-based LLMs, resulting in suboptimal performance on downstream tasks. In this paper, we present SIBO, a SImple BOoster that enhances PEFT by injecting an initial residual. SIBO is straightforward and readily extensible to a range of state-of-the-art PEFT techniques to alleviate over-smoothing and enhance performance. Extensive experiments on 22 benchmark datasets demonstrate that SIBO significantly enhances the performance of various strong baselines, achieving up to 15.7% and 23.5% improvement over existing PEFT methods on the arithmetic and commonsense reasoning tasks, respectively.
- 2024.findings-acl.72
+ 2024.findings-acl.72
wen-etal-2024-sibo
10.18653/v1/2024.findings-acl.72
@@ -6958,7 +6958,7 @@
Yashar Moshfeghi, University of Strathclyde
1258-1276
Recent advancements in large language models (LLMs) and multi-modal models (MMs) have demonstrated their remarkable capabilities in problem-solving. Yet, their proficiency in tackling geometry math problems, which necessitates an integrated understanding of both textual and visual information, has not been thoroughly evaluated. To address this gap, we introduce the GeoEval benchmark, a comprehensive collection that includes a main subset of 2,000 problems, a 750-problem subset focusing on backward reasoning, an augmented subset of 2,000 problems, and a hard subset of 300 problems. This benchmark facilitates a deeper investigation into the performance of LLMs and MMs in solving geometry math problems. Our evaluation of ten LLMs and MMs across these varied subsets reveals that the WizardMath model excels, achieving a 55.67% accuracy rate on the main subset but only a 6.00% accuracy on the hard subset. This highlights the critical need for testing models against datasets on which they have not been pre-trained. Additionally, our findings indicate that GPT-series models perform more effectively on problems they have rephrased, suggesting a promising method for enhancing model capabilities.
- 2024.findings-acl.73
+ 2024.findings-acl.73
zhang-etal-2024-geoeval
10.18653/v1/2024.findings-acl.73
@@ -6988,7 +6988,7 @@
Christoforos Nalmpantis
1288-1301
We analyze a family of large language models in a manner lightweight enough to be done on a single GPU. Specifically, we focus on the OPT family of models ranging from 125m to 66b parameters and rely only on whether an FFN neuron is activated or not. First, we find that the early part of the network is sparse and represents many discrete features. Here, many neurons (more than 70% in some layers of the 66b model) are “dead”, i.e. they never activate on a large collection of diverse data. At the same time, many of the alive neurons are reserved for discrete features and act as token and n-gram detectors. Interestingly, their corresponding FFN updates not only promote next token candidates, as could be expected, but also explicitly focus on removing information about the tokens that trigger them, i.e., the current input. To the best of our knowledge, this is the first example of mechanisms specialized at removing (rather than adding) information from the residual stream. With scale, models become more sparse in the sense that they have more dead neurons and token detectors. Finally, some neurons are positional: whether they are activated depends largely (or solely) on position and less so (or not at all) on textual data. We find that smaller models have sets of neurons acting as position range indicators, while larger models operate in a less explicit manner.
- 2024.findings-acl.75
+ 2024.findings-acl.75
voita-etal-2024-neurons
10.18653/v1/2024.findings-acl.75
@@ -7003,7 +7003,7 @@
Gang Pan, Tianjin University
1302-1318
Grounded Multimodal Named Entity Recognition (GMNER) is a nascent multimodal task that aims to identify named entities, entity types and their corresponding visual regions. The GMNER task exhibits two challenging properties: 1) The weak correlation between image-text pairs in social media results in a significant portion of named entities being ungroundable. 2) There exists a distinction between coarse-grained referring expressions commonly used in similar tasks (e.g., phrase localization, referring expression comprehension) and fine-grained named entities. In this paper, we propose RiVEG, a unified framework that reformulates GMNER into a joint MNER-VE-VG task by leveraging large language models (LLMs) as a connecting bridge. This reformulation brings two benefits: 1) It maintains the optimal MNER performance and eliminates the need for employing object detection methods to pre-extract regional features, thereby naturally addressing two major limitations of existing GMNER methods. 2) The introduction of the entity expansion expression and Visual Entailment (VE) module unifies Visual Grounding (VG) and Entity Grounding (EG). It enables RiVEG to effortlessly inherit the Visual Entailment and Visual Grounding capabilities of any current or prospective multimodal pretraining models. Extensive experiments demonstrate that RiVEG outperforms state-of-the-art methods on the existing GMNER dataset and achieves absolute leads of 10.65%, 6.21%, and 8.83% in all three subtasks.
- 2024.findings-acl.76
+ 2024.findings-acl.76
li-etal-2024-llms
10.18653/v1/2024.findings-acl.76
@@ -7017,7 +7017,7 @@
Ekapol Chuangsuwanich, Chulalongkorn University
1319-1329
Learning job title representations is a vital process for developing automatic human resource tools. To do so, existing methods primarily rely on learning a title representation through skills extracted from the job description, neglecting the rich and diverse content within. Thus, we propose an alternative framework that learns job titles through their respective job descriptions (JDs), utilizing a Job Description Aggregator component to handle lengthy descriptions and a bidirectional contrastive loss to account for the bidirectional relationship between a job title and its description. We evaluated our method in both in-domain and out-of-domain settings, achieving superior performance over the skill-based approach.
- 2024.findings-acl.77
+ 2024.findings-acl.77
laosaengpha-etal-2024-learning
10.18653/v1/2024.findings-acl.77
@@ -7033,7 +7033,7 @@
Dan Roth, University of Pennsylvania
1330-1350
Existing benchmarks for visual question answering lack visual grounding and complexity, particularly in evaluating spatial reasoning skills. We introduce FlowVQA, a novel benchmark aimed at assessing the capabilities of visual question-answering multimodal language models in reasoning with flowcharts as visual contexts. FlowVQA comprises 2,272 carefully generated and human-verified flowchart images from three distinct content sources, along with 22,413 diverse question-answer pairs, to test a spectrum of reasoning tasks, including information localization, decision-making, and logical progression. We conduct a thorough baseline evaluation on a suite of both open-source and proprietary multimodal language models using various strategies, followed by an analysis of directional bias. The results underscore the benchmark’s potential as a vital tool for advancing the field of multimodal modeling, providing a focused and challenging environment for enhancing model performance in visual and logical reasoning tasks.
- 2024.findings-acl.78
+ 2024.findings-acl.78
singh-etal-2024-flowvqa
10.18653/v1/2024.findings-acl.78
@@ -7046,7 +7046,7 @@
Chenhui Chu, Kyoto University
1351-1358
Continual Named Entity Recognition (CNER) is dedicated to sequentially learning new entity types while mitigating catastrophic forgetting of old entity types. Traditional CNER approaches commonly employ knowledge distillation to retain old knowledge within the current model. However, because only the representations of old and new models are constrained to be consistent, the reliance solely on distillation in existing methods still suffers from catastrophic forgetting. To further alleviate the forgetting issue of old entity types, this paper introduces flexible Weight Tuning (WT) and Weight Fusion (WF) strategies for CNER. The WT strategy, applied at each training step, employs a learning rate schedule on the parameters of the current model. After learning the current task, the WF strategy dynamically integrates knowledge from both the current and previous models for inference. Notably, these two strategies are model-agnostic and seamlessly integrate with existing State-Of-The-Art (SOTA) models. Extensive experiments demonstrate that the WT and WF strategies consistently enhance the performance of previous SOTA methods across ten CNER settings in three datasets.
- 2024.findings-acl.79
+ 2024.findings-acl.79
yu-etal-2024-flexible
10.18653/v1/2024.findings-acl.79
@@ -7061,7 +7061,7 @@
Haizhou Li, The Chinese University of Hong Kong (Shenzhen) and National University of Singapore
1359-1375
The automatic evaluation of natural language generation (NLG) systems presents a long-lasting challenge. Recent studies have highlighted various neural metrics that align well with human evaluations. Yet, the robustness of these evaluators against adversarial perturbations remains largely under-explored due to the unique challenges in obtaining adversarial data for different NLG evaluation tasks. To address the problem, we introduce AdvEval, a novel black-box adversarial framework against NLG evaluators. AdvEval is specially tailored to generate data that yield strong disagreements between human and victim evaluators. Specifically, inspired by the recent success of large language models (LLMs) in text generation and evaluation, we adopt strong LLMs as both the data generator and gold evaluator. Adversarial data are automatically optimized with feedback from the gold and victim evaluators. We conduct experiments on 12 victim evaluators and 11 NLG datasets, spanning tasks including dialogue, summarization, and question evaluation. The results show that AdvEval can lead to significant performance degradation of various victim metrics, thereby validating its efficacy.
- 2024.findings-acl.80
+ 2024.findings-acl.80
chen-etal-2024-unveiling
10.18653/v1/2024.findings-acl.80
@@ -7072,7 +7072,7 @@
Mark Gales, University of Cambridge
1376-1387
Large Language Models (LLMs) have demonstrated impressive zero-shot capabilities and versatility in NLP tasks; however, they sometimes fail to maintain crucial invariances for specific tasks. One example is permutation sensitivity, where LLMs’ outputs may significantly vary depending on the order of the input options. While debiasing techniques can mitigate these issues and yield better performance and reliability, they often come with a high computational cost at inference. This paper addresses this inefficiency at inference time. The aim is to distill the capabilities of a computationally intensive, debiased teacher model into a more compact student model. We explore two variants of student models: one based on pure distillation, and the other on an error-correction approach for more complex tasks, where the student corrects a single biased decision from the teacher to achieve a debiased output. Our approach is general and can be applied to both black-box and white-box LLMs. Furthermore, we demonstrate that our compact, encoder-only student models can outperform their larger, biased teacher counterparts, achieving better results with significantly fewer parameters.
- 2024.findings-acl.81
+ 2024.findings-acl.81
liusie-etal-2024-teacher
10.18653/v1/2024.findings-acl.81
@@ -7084,7 +7084,7 @@
Ping Luo, Institute of Computing Technology, Chinese Academy of Sciences
1388-1409
Tables are recognized for their high information density and widespread usage, serving as essential sources of information. Seeking information from tables (TIS) is a crucial capability for Large Language Models (LLMs), serving as the foundation of knowledge-based Q&A systems. However, this field presently suffers from an absence of thorough and reliable evaluation. This paper introduces a more reliable benchmark for Table Information Seeking (TabIS). To avoid the unreliable evaluation caused by text similarity-based metrics, TabIS adopts a single-choice question format (with two options per question) instead of a text generation format. We establish an effective pipeline for generating options, ensuring their difficulty and quality. Experiments conducted on 12 LLMs reveal that while the performance of GPT-4-turbo is marginally satisfactory, both other proprietary and open-source models perform inadequately. Further analysis shows that LLMs exhibit a poor understanding of table structures, and struggle to balance between TIS performance and robustness against pseudo-relevant tables (common in retrieval-augmented systems). These findings uncover the limitations and potential challenges of LLMs in seeking information from tables. We release our data and code to facilitate further research in this field.
- 2024.findings-acl.82
+ 2024.findings-acl.82
pang-etal-2024-uncovering
10.18653/v1/2024.findings-acl.82
@@ -7097,7 +7097,7 @@
Tieyun Qian, Wuhan University
1410-1423
Continual relation extraction (CRE) aims to continuously learn relations in new tasks without forgetting old relations in previous tasks. Current CRE methods are all rehearsal-based, meaning they need to store samples and thus may encounter privacy and security issues. This paper targets rehearsal-free continual relation extraction for the first time and decomposes it into task identification and within-task prediction sub-problems. Existing rehearsal-free methods focus on training a model (expert) for within-task prediction yet neglect to enhance models’ capability of task identification. In this paper, we propose an Ensemble-of-Experts (EoE) framework for rehearsal-free continual relation extraction. Specifically, we first discriminatively train each expert by augmenting analogous relations across tasks to enhance the expert’s task identification ability. We then propose a cascade voting mechanism to form an ensemble of experts for effectively aggregating their abilities. Extensive experiments demonstrate that our method outperforms current rehearsal-free methods and is even better than rehearsal-based CRE methods.
- 2024.findings-acl.83
+ 2024.findings-acl.83
zhou-etal-2024-ensemble
10.18653/v1/2024.findings-acl.83
@@ -7107,7 +7107,7 @@
Adam Jatowt
1424-1446
Temporal validity is an important property of text that has many downstream applications, such as recommender systems, conversational AI, and user status tracking. Existing benchmarking tasks often require models to identify the temporal validity duration of a single statement. However, many data sources contain additional context, such as successive sentences in a story or posts on a social media profile. This context may alter the duration for which the originally collected statement is expected to be valid. We propose Temporal Validity Change Prediction, a natural language processing task benchmarking the capability of machine learning models to detect context statements that induce such change. We create a dataset consisting of temporal target statements sourced from Twitter and crowdsource corresponding context statements. We then benchmark a set of transformer-based language models on our dataset. Finally, we experiment with a multitasking approach to improve the state-of-the-art performance.
- 2024.findings-acl.84
+ 2024.findings-acl.84
wenzel-jatowt-2024-temporal
10.18653/v1/2024.findings-acl.84
@@ -7117,7 +7117,7 @@
Alona Fyshe, University of Alberta
1447-1466
Pre-trained Language Models (PLMs) can be accurately fine-tuned for downstream text processing tasks. Recently, researchers have introduced several parameter-efficient fine-tuning methods that optimize input prompts or adjust a small number of model parameters (e.g., LoRA). In this study, we explore the impact of altering the input text of the original task in conjunction with parameter-efficient fine-tuning methods. To most effectively rewrite the input text, we train a few-shot paraphrase model with a Maximum-Marginal Likelihood objective. Using six few-shot text classification datasets, we show that enriching data with paraphrases at train and test time enhances the performance beyond what can be achieved with parameter-efficient fine-tuning alone. The code used for our experiments can be found at https://github.com/SaeedNajafi/RIFF.
- 2024.findings-acl.85
+ 2024.findings-acl.85
najafi-fyshe-2024-riff
10.18653/v1/2024.findings-acl.85
@@ -7130,7 +7130,7 @@
Steven Schockaert, Cardiff University
1467-1480
Concept embeddings offer a practical and efficient mechanism for injecting commonsense knowledge into downstream tasks. Their core purpose is often not to predict the commonsense properties of concepts themselves, but rather to identify commonalities, i.e. sets of concepts which share some property of interest. Such commonalities are the basis for inductive generalisation, hence high-quality concept embeddings can make learning easier and more robust. Unfortunately, standard embeddings primarily reflect basic taxonomic categories, making them unsuitable for finding commonalities that refer to more specific aspects (e.g. the colour of objects or the materials they are made of). In this paper, we address this limitation by explicitly modelling the different facets of interest when learning concept embeddings. We show that this leads to embeddings which capture a more diverse range of commonsense properties, and consistently improves results in downstream tasks such as ultra-fine entity typing and ontology completion.
- 2024.findings-acl.86
+ 2024.findings-acl.86
kteich-etal-2024-modelling
10.18653/v1/2024.findings-acl.86
@@ -7140,7 +7140,7 @@
Thomas Bonnier, Centrale Lille Alumni
1481-1500
Tabular data with text fields can be leveraged in applications such as financial risk assessment or medical diagnosis prediction. When employing multimodal approaches to make predictions based on these modalities, it is crucial to make the most appropriate modeling choices in terms of numerical feature encoding or fusion strategy. In this paper, we focus on multimodal classification tasks based on tabular datasets with text fields. We build on multimodal Transformers to propose the Tabular-Text Transformer (TTT), a tabular/text dual-stream Transformer network. This architecture includes a distance-to-quantile embedding scheme for numerical features and an overall attention module which concurrently considers self-attention and cross-modal attention. Further, we leverage the two well-informed modality streams to estimate whether a prediction is uncertain or not. To explain uncertainty in terms of feature values, we use a sampling-based approximation of Shapley values in a bimodal context, with two options for the value function. To show the efficacy and relevance of this approach, we compare it to six baselines and measure its ability to quantify and explain uncertainty against various methods. Our code is available at https://github.com/thomas-bonnier/TabularTextTransformer.
- 2024.findings-acl.87
+ 2024.findings-acl.87
bonnier-2024-revisiting
10.18653/v1/2024.findings-acl.87
@@ -7153,7 +7153,7 @@
Rifat Shahriyar, Bangladesh University of Engineering and Technology
1501-1520
Pretrained language models inherently exhibit various social biases, prompting a crucial examination of their social impact across various linguistic contexts due to their widespread usage. Previous studies have provided numerous methods for intrinsic bias measurements, predominantly focused on high-resource languages. In this work, we aim to extend these investigations to Bangla, a low-resource language. Specifically, in this study, we (1) create a dataset for intrinsic gender bias measurement in Bangla, (2) discuss necessary adaptations to apply existing bias measurement methods for Bangla, and (3) examine the impact of context length variation on bias measurement, a factor that has been overlooked in previous studies. Through our experiments, we demonstrate a clear dependency of bias metrics on context length, highlighting the need for nuanced considerations in Bangla bias analysis. We consider our work a stepping stone for bias measurement in the Bangla language and make all of our resources publicly available to support future research.
- 2024.findings-acl.88
+ 2024.findings-acl.88
sadhu-etal-2024-empirical-study
10.18653/v1/2024.findings-acl.88
@@ -7166,7 +7166,7 @@
Gerald Penn, Department of Computer Science, University of Toronto
1521-1533
The task of temporal relation extraction (TRE) involves identifying and extracting temporal relations between events from narratives. We identify two primary issues with TRE systems. First, by formulating TRE as a simple text classification task where every temporal relation is independent, it is hard to enhance the TRE model’s representation of the meaning of temporal relations, and its facility with the underlying temporal calculus. We solve this issue by proposing a novel Temporally Contrastive learning model (ConTempo) that increases the model’s awareness of the meaning of temporal relations by leveraging their symmetric or antisymmetric properties. Second, the reusability of innovations has been limited due to incompatibilities in model architectures. Therefore, we propose a unified framework and show that ConTempo is compatible with all three main branches of TRE research. Our results demonstrate that the performance gains of ConTempo are more pronounced, with the total combination achieving state-of-the-art performance on the widely used MATRES and TBD corpora. We furthermore identified and corrected a large number of annotation errors present in the test set of MATRES, after which the performance increase brought by ConTempo becomes more apparent.
- 2024.findings-acl.89
+ 2024.findings-acl.89
niu-etal-2024-contempo
10.18653/v1/2024.findings-acl.89
@@ -7180,7 +7180,7 @@
Prasanna Parthasarathi, Huawei Technologies Ltd.
1534-1551
In this work, we dive deep into one of the popular knowledge-grounded dialogue benchmarks that focus on faithfulness, FaithDial. We show that a significant portion of the FaithDial data contains annotation artifacts, which may bias models towards completely ignoring the conversation history. We therefore introduce CHARP, a testbed designed for evaluating supposedly non-hallucinatory models trained on the FaithDial dataset. Our extensive analysis reveals that models primarily exhibit poor performance on CHARP due to their inability to effectively attend to and reason over the conversation history. Furthermore, the evaluation methods of FaithDial fail to capture these shortcomings, neglecting the conversational history. Our findings indicate that there is substantial room for contribution in both dataset creation and hallucination evaluation for knowledge-grounded dialogue, and that CHARP can serve as a tool for monitoring the progress in this particular research area. Data, models, and source code will be publicly available upon acceptance.
- 2024.findings-acl.90
+ 2024.findings-acl.90
ghaddar-etal-2024-charp
10.18653/v1/2024.findings-acl.90
@@ -7194,7 +7194,7 @@
Yujiu Yang, Graduate School at Shenzhen, Tsinghua University
1552-1587
The ability of Large Language Models (LLMs) to critique and refine their reasoning is crucial for their application in evaluation, feedback provision, and self-improvement. This paper introduces CriticBench, a comprehensive benchmark designed to assess LLMs’ abilities to critique and rectify their reasoning across a variety of tasks. CriticBench encompasses five reasoning domains: mathematical, commonsense, symbolic, coding, and algorithmic. It compiles 15 datasets and incorporates responses from three LLM families. Utilizing CriticBench, we evaluate and dissect the performance of 17 LLMs in generation, critique, and correction reasoning, i.e., GQC reasoning. Our findings reveal: (1) a linear relationship in GQC capabilities, with critique-focused training markedly enhancing performance; (2) a task-dependent variation in correction effectiveness, with logic-oriented tasks being more amenable to correction; (3) GQC knowledge inconsistencies that decrease as model size increases; and (4) an intriguing inter-model critiquing dynamic, where stronger models are better at critiquing weaker ones, while weaker models can surprisingly surpass stronger ones in their self-critique. We hope these insights into the nuanced critique-correct reasoning of LLMs will foster further research in LLM critique and self-improvement.
- 2024.findings-acl.91
+ 2024.findings-acl.91
lin-etal-2024-criticbench
10.18653/v1/2024.findings-acl.91
@@ -7210,7 +7210,7 @@
Jun Huang
1588-1602
Recently, while large language models (LLMs) have demonstrated impressive results, they still suffer from hallucination, i.e., the generation of false information. Model editing is the task of fixing factual mistakes in LLMs; yet, most previous works treat it as a one-time task, paying little attention to ever-emerging mistakes generated by LLMs. We address the task of sequential model editing (SME) that aims to rectify mistakes continuously. A Dynamic Auxiliary Fusion Network (DAFNet) is designed to enhance the semantic interaction among the factual knowledge within the entire sequence, preventing catastrophic forgetting during the editing process of multiple knowledge triples. Specifically, (1) for semantic fusion within a relation triple, we aggregate the intra-editing attention flow into auto-regressive self-attention with token-level granularity in LLMs. We further leverage multi-layer diagonal inter-editing attention flow to update the weighted representations of the entire sequence-level granularity. (2) Considering that auxiliary parameters are required to store the knowledge for sequential editing, we construct a new dataset named DAFSet, fulfilling recent, popular, long-tail and robust properties to enhance the generality of sequential editing. Experiments show DAFNet significantly outperforms strong baselines in single-turn and sequential editing. The usage of DAFSet also consistently improves the performance of other auxiliary network-based methods in various scenarios.
- 2024.findings-acl.92
+ 2024.findings-acl.92
zhang-etal-2024-dafnet
10.18653/v1/2024.findings-acl.92
@@ -7222,7 +7222,7 @@
Rahul Mishra, International Institute of Information Technology Hyderabad
1603-1623
Generic text summarization approaches often fail to address the specific intent and needs of individual users. Recently, scholarly attention has turned to the development of summarization methods that are more closely tailored and controlled to align with specific objectives and user needs. Despite a growing corpus of controllable summarization research, there is no comprehensive survey available that thoroughly explores the diverse controllable attributes employed in this context, delves into the associated challenges, and investigates the existing solutions. In this survey, we formalize the Controllable Text Summarization (CTS) task, categorize controllable attributes according to their shared characteristics and objectives, and present a thorough examination of existing datasets and methods within each category. Moreover, based on our findings, we uncover limitations and research gaps, while also exploring potential solutions and future directions for CTS. We release our detailed analysis of CTS papers at https://github.com/ashokurlana/controllable_text_summarization_survey.
- 2024.findings-acl.93
+ 2024.findings-acl.93
urlana-etal-2024-controllable
10.18653/v1/2024.findings-acl.93
@@ -7235,7 +7235,7 @@
Ye Wang
1624-1637
Traditional applications of natural language processing (NLP) in healthcare have predominantly focused on patient-centered services, enhancing patient interactions and care delivery, such as through medical dialogue systems. However, the potential of NLP to benefit inexperienced doctors, particularly in areas such as communicative medical coaching, remains largely unexplored. We introduce “ChatCoach”, a human-AI cooperative framework designed to assist medical learners in practicing their communication skills during patient consultations. ChatCoach differentiates itself from conventional dialogue systems by offering a simulated environment where medical learners can practice dialogues with a patient agent, while a coach agent provides immediate, structured feedback. This is facilitated by our proposed Generalized Chain-of-Thought (GCoT) approach, which fosters the generation of structured feedback and enhances the utilization of external knowledge sources. Additionally, we have developed a dataset specifically for evaluating Large Language Models (LLMs) within the ChatCoach framework on communicative medical coaching tasks. Our empirical results validate the effectiveness of ChatCoach.
- 2024.findings-acl.94
+ 2024.findings-acl.94
huang-etal-2024-benchmarking
10.18653/v1/2024.findings-acl.94
@@ -7254,7 +7254,7 @@
Dongmei Zhang, Microsoft
1638-1662
This paper introduces a novel thought prompting approach called “Everything of Thoughts” (XoT) for Large Language Models (LLMs) to defy the law of the “Penrose triangle” of existing thought paradigms, achieving three key perspectives in thought generation simultaneously: performance, efficiency, and flexibility. XoT leverages pretrained reinforcement learning and Monte Carlo Tree Search (MCTS) to incorporate external domain knowledge and planning capability into thoughts, thereby enhancing LLMs’ decision-making capabilities. Through the MCTS-LLM collaborative thought revision framework, XoT autonomously produces high-quality comprehensive cognitive mappings with minimal LLM interactions. Additionally, XoT empowers LLMs to utilize flexible cognitive mappings for solving problems with multiple solutions. We evaluate XoT on several challenging problem-solving tasks, including Game of 24, 8-Puzzle, and Pocket Cube. Our results demonstrate that XoT significantly outperforms existing approaches in various dimensions, showcasing its remarkable proficiency in addressing complex problems across diverse domains. The data and code are available at https://github.com/microsoft/Everything-of-Thoughts-XoT.
- 2024.findings-acl.95
+ 2024.findings-acl.95
ding-etal-2024-everything
10.18653/v1/2024.findings-acl.95
@@ -7269,7 +7269,7 @@
Monica Lam, Stanford University
1663-1678
We introduce SPAGHETTI: Semantic Parsing Augmented Generation for Hybrid English information from Text Tables and Infoboxes, a hybrid question-answering (QA) pipeline that utilizes information from heterogeneous knowledge sources, including knowledge bases, text, tables, and infoboxes. Our LLM-augmented approach achieves state-of-the-art performance on the Compmix dataset, the most comprehensive heterogeneous open-domain QA dataset, with a 56.5% exact match (EM) rate. More importantly, manual analysis on a sample of the dataset suggests that SPAGHETTI is more than 90% accurate, indicating that EM is no longer suitable for assessing the capabilities of QA systems today.
- 2024.findings-acl.96
+ 2024.findings-acl.96
zhang-etal-2024-spaghetti
10.18653/v1/2024.findings-acl.96
@@ -7288,7 +7288,7 @@
Shafiq Joty, SalesForce.com and Nanyang Technological University
1679-1705
In the rapidly evolving field of large language models (LLMs), data augmentation (DA) has emerged as a pivotal technique for enhancing model performance by diversifying training examples without the need for additional data collection. This survey explores the transformative impact of LLMs on DA, particularly addressing the unique challenges and opportunities they present in the context of natural language processing (NLP) and beyond. From both data and learning perspectives, we examine various strategies that utilize LLMs for data augmentation, including a novel exploration of learning paradigms where LLM-generated data is used for diverse forms of further training. Additionally, this paper highlights the primary open challenges faced in this domain, ranging from controllable data augmentation to multi-modal data augmentation. This survey highlights a paradigm shift introduced by LLMs in DA, and aims to serve as a comprehensive guide for researchers and practitioners.
- 2024.findings-acl.97
+ 2024.findings-acl.97
ding-etal-2024-data
10.18653/v1/2024.findings-acl.97
@@ -7301,7 +7301,7 @@
Tianxing He
1706-1715
Recent watermarked generation algorithms inject detectable signatures during language generation to facilitate post-hoc detection. While token-level watermarks are vulnerable to paraphrase attacks, SemStamp (Hou et al., 2023) applies a watermark to the semantic representation of sentences and demonstrates promising robustness. SemStamp employs locality-sensitive hashing (LSH) to partition the semantic space with arbitrary hyperplanes, which results in a suboptimal tradeoff between robustness and speed. We propose k-SemStamp, a simple yet effective enhancement of SemStamp, utilizing k-means clustering as an alternative to LSH to partition the embedding space with awareness of inherent semantic structure. Experimental results indicate that k-SemStamp saliently improves its robustness and sampling efficiency while preserving the generation quality, advancing a more effective tool for machine-generated text detection.
- 2024.findings-acl.98
+ 2024.findings-acl.98
hou-etal-2024-k
10.18653/v1/2024.findings-acl.98
@@ -7314,7 +7314,7 @@
Tristan Thrush, Stanford University
1716-1726
This paper introduces the ColorSwap dataset, designed to assess and improve the proficiency of multimodal models in matching objects with their colors. The dataset comprises 2,000 unique image-caption pairs, grouped into 1,000 examples. Each example includes a caption-image pair, along with a “color-swapped” pair. We follow the Winoground schema: the two captions in an example have the same words, but the color words have been rearranged to modify different objects. The dataset was created through a novel blend of automated caption and image generation with humans in the loop. We evaluate image-text matching (ITM) and visual language models (VLMs) and find that even the latest ones are still not robust at this task. GPT-4V and LLaVA score 72% and 42% on our main VLM metric, although they may improve with more advanced prompting techniques. On the main ITM metric, contrastive models such as CLIP and SigLIP perform close to chance (at 12% and 30%, respectively), although the non-contrastive BLIP ITM model is stronger (87%). We also find that finetuning on fewer than 2,000 examples yields significant performance gains on this out-of-distribution word-order understanding task.
- 2024.findings-acl.99
+ 2024.findings-acl.99
burapacheep-etal-2024-colorswap
10.18653/v1/2024.findings-acl.99
@@ -7326,7 +7326,7 @@
Salman Avestimehr, University of Southern California
1727-1735
Numerous recent works aim to enhance the efficacy of Large Language Models (LLMs) through strategic prompting. In particular, the Optimization by PROmpting (OPRO) approach provides state-of-the-art performance by leveraging LLMs as optimizers, where the optimization task is to find instructions that maximize the task accuracy. In this paper, we revisit OPRO for automated prompting with relatively small-scale LLMs, such as the LLaMa-2 family and Mistral 7B. Our investigation reveals that OPRO shows limited effectiveness in small-scale LLMs, whose limited inference capabilities constrain their optimization ability. We suggest that future automatic prompt engineering consider both model capabilities and computational costs. Additionally, for small-scale LLMs, we recommend direct instructions that clearly outline objectives and methodologies as robust prompt baselines, ensuring efficient and effective prompt engineering in ongoing research.
- 2024.findings-acl.100
+ 2024.findings-acl.100
zhang-etal-2024-revisiting-opro
10.18653/v1/2024.findings-acl.100
@@ -7336,7 +7336,7 @@
Manjesh Hanawal, Indian Institute of Technology Bombay
1736-1748
Pre-trained Language Models (PLMs), like BERT, with self-supervision objectives exhibit remarkable performance and generalization across various tasks. However, they suffer from high inference latency due to their large size. To address this issue, side branches are attached at intermediate layers, enabling early inference of samples without requiring them to pass through all layers. However, the challenge is to decide at which layer to exit each sample so that accuracy and latency are balanced. Moreover, the distribution of the samples to be inferred may differ from that used for training, necessitating cross-domain adaptation. We propose an online learning algorithm named Cross-Domain Inference in Early Exit BERT (CeeBERT) that dynamically determines early exits of samples based on the level of confidence at each exit point. CeeBERT learns optimal thresholds from domain-specific confidence observed at intermediate layers on the fly, eliminating the need for labeled data. Experimental results on five distinct datasets with BERT and ALBERT models demonstrate CeeBERT’s ability to improve latency by reducing unnecessary computations with minimal drop in performance. By adapting its threshold values, CeeBERT can speed up the BERT/ALBERT models by 2× to 3.1× with minimal drop in accuracy. The anonymized source code is available at https://github.com/Div290/CeeBERT.
- 2024.findings-acl.101
+ 2024.findings-acl.101
bajpai-hanawal-2024-ceebert
10.18653/v1/2024.findings-acl.101
@@ -7347,7 +7347,7 @@
Rohini Srihari, State University of New York at Buffalo
1749-1762
Large Language Models (LLMs) have made significant progress in integrating safety and knowledge alignment. However, adversarial actors can manipulate these models into generating unsafe responses, and excessive safety alignment can lead to unintended hallucinations. To address these challenges, we introduce UniWiz, a novel 2-step data orchestration framework that unifies safety and knowledge data generation. We propose a “safety-priming” method to generate synthetic safety data and overcome safety bottlenecks. We also inject relevant knowledge into conversations by retrieving factual information from curated sources. The UniWiz dataset consists of 17,638 quality-controlled conversations and 10,000 augmented preference data. Pretrained models fine-tuned on UniWiz show improvements across various metrics and outperform state-of-the-art instruction-tuned models trained on much larger datasets.
- 2024.findings-acl.102
+ 2024.findings-acl.102
das-srihari-2024-uniwiz
10.18653/v1/2024.findings-acl.102
@@ -7361,7 +7361,7 @@
Marcello Federico, Amazon
1763-1775
We show that content on the web is often translated into many languages, and the low quality of these multi-way translations indicates they were likely created using Machine Translation (MT). Multi-way parallel, machine generated content not only dominates the translations in lower resource languages; it also constitutes a large fraction of the total web content in those languages. We also find evidence of a selection bias in the type of content which is translated into many languages, consistent with low quality English content being translated en masse into many lower resource languages, via MT. Our work raises serious concerns about training models such as multilingual large language models on both monolingual and bilingual data scraped from the web.
- 2024.findings-acl.103
+ 2024.findings-acl.103
thompson-etal-2024-shocking
10.18653/v1/2024.findings-acl.103
@@ -7376,7 +7376,7 @@
Brian Gallagher, Lawrence Livermore National Laboratory
1776-1782
Traditionally, developing new language models (LMs) capable of addressing multiple tasks involves fine-tuning pre-trained LMs using a wide collection of datasets, a process that often incurs significant computational expenses. Model merging emerges as a cost-effective alternative, allowing the integration of existing models fine-tuned on different tasks into a single model that performs well across all tasks, eliminating the need for additional training. In this paper, we propose RankMean, an algorithm for merging fine-tuned LMs without requiring any downstream data. RankMean determines merging coefficients based on the relative rankings of weight change magnitudes and applies these coefficients for module-wise integration of various fine-tuned models. Our experimental results demonstrate that RankMean outperforms existing baseline methods on multiple benchmarks. The code is available at https://github.com/VITA-Group/RankMean.
- 2024.findings-acl.104
+ 2024.findings-acl.104
perin-etal-2024-rankmean
10.18653/v1/2024.findings-acl.104
@@ -7389,7 +7389,7 @@
Nanyun Peng, University of California, Los Angeles
1783-1805
Large Vision-Language Models (LVLMs) suffer from hallucination issues, wherein the models generate plausible-sounding but factually incorrect outputs, undermining their reliability. A comprehensive quantitative evaluation is necessary to identify and understand the extent of hallucinations in these models. However, existing benchmarks are often limited in scope, focusing mainly on object hallucinations. Furthermore, current evaluation methods struggle to effectively address the subtle semantic distinctions between model outputs and reference data, as well as the balance between hallucination and informativeness. To address these issues, we introduce a multi-dimensional benchmark covering objects, attributes, and relations, with challenging images selected based on associative biases. Moreover, we propose a large language model (LLM)-based two-stage evaluation framework that generalizes the popular CHAIR metric and incorporates both faithfulness and coverage into the evaluation. Experiments on 10 established LVLMs demonstrate that our evaluation metric is more comprehensive and better correlated with humans than existing work when evaluating on our challenging human-annotated benchmark dataset. Our work also highlights the critical balance between faithfulness and coverage of model outputs, and encourages future works to address hallucinations in LVLMs while keeping their outputs informative.
- 2024.findings-acl.105
+ 2024.findings-acl.105
qiu-etal-2024-valor
10.18653/v1/2024.findings-acl.105
@@ -7418,7 +7418,7 @@
Meng Jiang, University of Notre Dame
1817-1829
The rapid advancement of Large Language Models (LLMs) has demonstrated their vast potential across various domains, attributed to their extensive pretraining knowledge and exceptional generalizability. However, LLMs often generate harmful content when faced with problematic prompts. To address this problem, existing work attempted to implement a gradient-ascent-based approach to prevent LLMs from producing harmful output. While these methods can be effective, they frequently impact the model utility in responding to normal prompts. To address this gap, we introduce Selective Knowledge negation Unlearning (SKU), a novel unlearning framework for LLMs, designed to eliminate harmful knowledge while preserving utility on normal prompts. Specifically, SKU consists of two stages: a harmful knowledge acquisition stage and a knowledge negation stage. The first stage aims to identify and acquire harmful knowledge within the model, whereas the second is dedicated to removing this knowledge. SKU selectively isolates and removes harmful knowledge in model parameters, ensuring the model’s performance remains robust on normal prompts. Our experiments conducted across various LLM architectures demonstrate that SKU identifies a good balance point between removing harmful information and preserving utility.
- 2024.findings-acl.107
+ 2024.findings-acl.107
liu-etal-2024-towards-safer
10.18653/v1/2024.findings-acl.107
@@ -7435,7 +7435,7 @@
Mengnan Du, New Jersey Institute of Technology
1830-1842
Chain of Thought (CoT) is significant in improving the reasoning abilities of large language models (LLMs). However, the correlation between the effectiveness of CoT and the length of reasoning steps in prompts remains largely unknown. To shed light on this, we have conducted several empirical experiments to explore the relation. Specifically, we design experiments that expand and compress the rationale reasoning steps within CoT demonstrations, while keeping all other factors constant. We have the following key findings. First, the results indicate that lengthening the reasoning steps in prompts, even without adding new information into the prompt, considerably enhances LLMs’ reasoning abilities across multiple datasets. Conversely, shortening the reasoning steps, even while preserving the key information, significantly diminishes the reasoning abilities of models. This finding highlights the importance of the number of steps in CoT prompts and provides practical guidance to make better use of LLMs’ potential in complex problem-solving scenarios. Second, we also investigated the relationship between the performance of CoT and the rationales used in demonstrations. Surprisingly, the result shows that even incorrect rationales can yield favorable outcomes if they maintain the requisite length of inference. Third, we observed that the advantages of increasing reasoning steps are task-dependent: simpler tasks require fewer steps, whereas complex tasks gain significantly from longer inference sequences.
- 2024.findings-acl.108
+ 2024.findings-acl.108
jin-etal-2024-impact
10.18653/v1/2024.findings-acl.108
@@ -7452,7 +7452,7 @@
Kristen Johnson, Michigan State University
1843-1856
While task-agnostic debiasing provides notable generalizability and reduced reliance on downstream data, its impact on language modeling ability and the risk of relearning social biases from downstream task-specific data remain the two most significant challenges when debiasing Pretrained Language Models (PLMs). The impact on language modeling ability can be alleviated given a high-quality and long-contextualized debiasing corpus, but there remains a deficiency in understanding the specifics of relearning biases. We empirically ascertain that the effectiveness of task-agnostic debiasing hinges on the quantitative bias level of both the task-specific data used for downstream applications and the debiased model. We empirically show that the lower bound of the bias level of the downstream fine-tuned model can be approximated by the bias level of the debiased model, in most practical cases. To gain a more in-depth understanding of how the parameters of PLMs change during fine-tuning due to the forgetting issue of PLMs, we propose a novel framework which can Propagate Socially-fair Debiasing to Downstream Fine-tuning, ProSocialTuning. Our proposed framework can push the fine-tuned model to approach the bias lower bound during downstream fine-tuning, indicating that the ineffectiveness of debiasing can be alleviated by overcoming the forgetting issue through regularizing successfully debiased attention heads based on the PLMs’ bias levels from the pretraining and debiasing stages.
- 2024.findings-acl.109
+ 2024.findings-acl.109
liu-etal-2024-towards-understanding
10.18653/v1/2024.findings-acl.109
@@ -7468,7 +7468,7 @@
Zijian Huang, University of Auckland
1857-1871
A summary structure is inherent to certain types of texts according to the Genre Theory of Linguistics. Such structures aid readers in efficiently locating information within summaries. However, most existing automatic summarization methods overlook the importance of summary structure, resulting in summaries that emphasize the most prominent information while omitting essential details from other sections. While a few summarizers recognize the importance of summary structure, they rely heavily on the predefined labels of summary structures in the source document and ground truth summaries. To address these shortcomings, we developed a Structured Knowledge-Guided Summarization (SKGSum) and its variant, SKGSum-W, which do not require structure labels. Instead, these methods rely on a set of automatically extracted summary points to generate summaries. We evaluate the proposed methods using three real-world datasets. The results indicate that our methods not only improve the quality of summaries, in terms of ROUGE and BERTScore, but also broaden the types of documents that can be effectively summarized.
- 2024.findings-acl.110
+ 2024.findings-acl.110
wang-etal-2024-skgsum
10.18653/v1/2024.findings-acl.110
@@ -7483,7 +7483,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
1872-1884
Spoken Named Entity Recognition (NER) aims to extract entities from speech. The extracted entities can help voice assistants better understand users’ questions and instructions. However, current Chinese Spoken NER datasets are laboratory-controlled data that are collected by reading existing texts in quiet environments, rather than natural spoken data, and the texts used for reading are also limited in topics. These limitations obstruct the development of Spoken NER in more natural and common real-world scenarios. To address this gap, we introduce a real-world Chinese Spoken NER dataset (RWCS-NER), encompassing open-domain daily conversations and task-oriented intelligent cockpit instructions. We compare several mainstream pipeline approaches on RWCS-NER. The results indicate that the current methods, affected by Automatic Speech Recognition (ASR) errors, do not perform satisfactorily in real settings. Aiming to enhance Spoken NER in real-world scenarios, we propose two approaches: self-training-asr and mapping then distilling (MDistilling). Experiments show that both approaches can achieve significant improvements, particularly MDistilling. Even compared with GPT4.0, MDistilling still reaches better results. We believe that our work will advance the field of Spoken NER in real-world settings.
- 2024.findings-acl.111
+ 2024.findings-acl.111
zhou-etal-2024-chinese
10.18653/v1/2024.findings-acl.111
@@ -7495,7 +7495,7 @@
Sangwon Yoon
1885-1897
As natural language generation (NLG) models have become prevalent, systematically assessing the quality of machine-generated texts has become increasingly important. Recent studies introduce LLM-based evaluators that operate as reference-free metrics, demonstrating their capability to adeptly handle novel tasks. However, these models generally rely on a single-agent approach, which, we argue, introduces an inherent limit to their performance. This is because there exist biases in LLM agents’ responses, including preferences for certain text structures or content. In this work, we propose DEBATE, an NLG evaluation framework based on a multi-agent scoring system augmented with the concept of a Devil’s Advocate. Within the framework, one agent is instructed to criticize other agents’ arguments, potentially resolving the bias in LLM agents’ answers. DEBATE substantially outperforms the previous state-of-the-art methods in two meta-evaluation benchmarks in NLG evaluation, SummEval and TopicalChat. We also show that the extensiveness of debates among agents and the persona of an agent can influence the performance of evaluators.
- 2024.findings-acl.112
+ 2024.findings-acl.112
kim-etal-2024-debate
10.18653/v1/2024.findings-acl.112
@@ -7508,7 +7508,7 @@
Zhifang Sui, Peking University
1898-1912
Understanding the deep semantics of images is essential in the era dominated by social media. However, current research focuses primarily on superficial descriptions of images, revealing a notable deficiency in the systematic investigation of their inherent deep semantics. In this work, we introduce DEEPEVAL, a comprehensive benchmark to assess Large Multimodal Models’ (LMMs) capacities for visual deep semantics. DEEPEVAL includes a human-annotated dataset and three progressive subtasks: fine-grained description selection, in-depth title matching, and deep semantics understanding. Utilizing DEEPEVAL, we evaluate 9 open-source LMMs and GPT-4V(ision). Our evaluation demonstrates a substantial gap between the deep semantic comprehension capabilities of existing LMMs and humans. For example, GPT-4V is 30% behind humans in understanding deep semantics, even though it achieves human-comparable performance in image description. Further analysis reveals that LMM performance on DEEPEVAL varies according to the specific facets of deep semantics explored, indicating the fundamental challenges remaining in developing LMMs.
- 2024.findings-acl.113
+ 2024.findings-acl.113
yang-etal-2024-large
10.18653/v1/2024.findings-acl.113
@@ -7524,7 +7524,7 @@
Donghong Ji
1913-1927
Document-level event extraction aims to extract structured event information from unstructured text. However, a single document often contains limited event information, and the roles of different event arguments may be biased due to the influence of the information source. This paper addresses the limitations of traditional document-level event extraction by proposing the task of cross-document event extraction (CDEE) to integrate event information from multiple documents and provide a comprehensive perspective on events. We construct a novel cross-document event extraction dataset, namely CLES, which contains 20,059 documents and 37,688 mention-level events, where over 70% of them are cross-document. To address the task, we propose a CDEE pipeline that includes 5 steps, namely event extraction, coreference resolution, entity normalization, role normalization and entity-role resolution. Our CDEE pipeline achieves about 72% F1 in end-to-end cross-document event extraction, highlighting the challenge of this task and establishing a benchmark for future research. Our work opens a new line of information extraction research and will attract new research attention.
- 2024.findings-acl.114
+ 2024.findings-acl.114
gao-etal-2024-harvesting
10.18653/v1/2024.findings-acl.114
@@ -7537,7 +7537,7 @@
Veronika Thost, International Business Machines
1928-1942
Reasoning about subjective natural language descriptions, such as opinions and preferences, is a challenging topic that largely remains unsolved to date. In particular, state-of-the-art large language models (LLMs) perform disappointingly in this task, show strong biases, and do not meet the interpretability requirements often needed in these kinds of applications. We propose a novel approach for reasoning about subjective knowledge that integrates potential and implicit meanings and explicitly models the relational nature of the information. We apply supervised graph learning, offer explanations for the model’s reasoning, and show that our model performs well across all 15 topics of OpinionQA, outperforming several prominent LLMs. Our detailed analysis further shows its unique advantages and the complementary nature it offers in comparison to LLMs.
- 2024.findings-acl.115
+ 2024.findings-acl.115
hwang-etal-2024-graph
10.18653/v1/2024.findings-acl.115
@@ -7555,7 +7555,7 @@
Xiang Wang, University of Science and Technology of China
1943-1958
Molecular Relational Learning (MRL), aiming to understand interactions between molecular pairs, plays a pivotal role in advancing biochemical research. Recently, the adoption of large language models (LLMs), known for their vast knowledge repositories and advanced logical inference capabilities, has emerged as a promising way for efficient and effective MRL. Despite their potential, these methods predominantly rely on textual data, thus not fully harnessing the wealth of structural information inherent in molecular graphs. Moreover, the absence of a unified framework exacerbates the issue of insufficient data exploitation, as it hinders the sharing of interaction mechanisms learned across various datasets. To address these challenges, this work proposes a novel LLM-based multi-modal framework for molecular interaction modeling following Chain-of-Thought (CoT) theory, termed MolTC, which effectively integrates the graphical information of the two molecules in a pair. To train this integrated framework efficiently, we introduce a *multi-hierarchical CoT theory* to refine its training paradigm, and construct a comprehensive *Molecular Interactive Instructions* dataset for the development of biochemical LLMs involving MRL. Our experiments, conducted across various datasets involving over 4,000,000 molecular pairs, demonstrate the superiority of our method over current GNN and LLM-based baselines. Code is available at https://github.com/MangoKiller/MolTC.
- 2024.findings-acl.116
+ 2024.findings-acl.116
fang-etal-2024-moltc
10.18653/v1/2024.findings-acl.116
@@ -7566,7 +7566,7 @@
Kai-Wei Chang, University of California, Los Angeles
1959-1981
Despite the significant advancements in keyphrase extraction and keyphrase generation methods, the predominant approach for evaluation mainly relies on exact matching with human references. This scheme fails to recognize systems that generate keyphrases semantically equivalent to the references or diverse keyphrases that carry practical utility. To better assess the capability of keyphrase systems, we propose KPEval, a comprehensive evaluation framework consisting of four critical aspects: reference agreement, faithfulness, diversity, and utility. For each aspect, we design semantic-based metrics to reflect the evaluation objectives. Meta-evaluation studies demonstrate that our evaluation strategy correlates better with human preferences compared to a range of previously proposed metrics. Using KPEval, we re-evaluate 23 keyphrase systems and discover that (1) established model comparison results have blind spots, especially when considering reference-free evaluation; (2) large language models are underestimated by prior evaluation works; and (3) there is no single best model that can excel in all aspects.
- 2024.findings-acl.117
+ 2024.findings-acl.117
wu-etal-2024-kpeval
10.18653/v1/2024.findings-acl.117
@@ -7579,7 +7579,7 @@
Guanglai Gao, Inner Mongolia University
1982-1994
Knowledge graph embedding (KGE) is extensively employed for link prediction by representing entities and relations as low-dimensional vectors. In real-world scenarios, knowledge graphs (KGs) usually encompass diverse domains, which poses challenges for KG representations. However, existing KGE methods rarely impose domain constraints on the embedding distribution of multi-domain KGs, leading to embedding overlap between different domains and degraded link prediction performance. To address this challenge, we propose Dual Archimedean Spiral Knowledge Graph Embedding (DuASE), a low-dimensional KGE model for multi-domain KGs. DuASE is inspired by our discovery that relation types can distinguish entities from different domains. Specifically, DuASE encodes entities with the same relation on the same Archimedean spiral, allowing it to differentiate the entities from different domains. To avoid embedding overlap across domains, DuASE further makes the head and tail spirals in the same triplet cluster into their respective domain spaces via a regularization function. Thus, DuASE can better capture the domain information and the dependencies between entities when modeling multi-domain KGs, leading to improved KG representations. We validate the effectiveness of DuASE on the novel multi-domain dataset (n-MDKG) introduced in this study and three other benchmark datasets.
- 2024.findings-acl.118
+ 2024.findings-acl.118
li-etal-2024-learning
10.18653/v1/2024.findings-acl.118
@@ -7593,7 +7593,7 @@
Chuan Wu, The University of Hong Kong
1995-2008
With their remarkable capabilities, large language models (LLMs) have emerged as essential elements in numerous NLP applications, while parameter-efficient finetuning, especially LoRA, has gained popularity as a lightweight approach for model customization. Meanwhile, various dropout methods, initially designed for full finetuning with all the parameters updated, alleviate the overfitting associated with excessive parameter redundancy. Hence, a possible contradiction arises between the negligible trainable parameters of LoRA and the effectiveness of previous dropout methods, which has been largely overlooked. To fill this gap, we first confirm that parameter-efficient LoRA is also overfitting-prone. We then revisit transformer-specific dropout methods, and establish their equivalence and distinctions mathematically and empirically. Building upon this comparative analysis, we introduce a unified framework for a comprehensive investigation, which instantiates these methods based on dropping position, structural pattern, and compensation measure. Through this framework, we reveal their new preferences and performance comparisons when involved with limited trainable parameters. This framework also allows us to amalgamate the most favorable aspects into a novel dropout method named HiddenKey. Extensive experiments verify the remarkable superiority and sufficiency of HiddenKey across multiple models and tasks, which highlights it as the preferred approach for high-performance and parameter-efficient finetuning of LLMs.
- 2024.findings-acl.119
+ 2024.findings-acl.119
wang-etal-2024-lora
10.18653/v1/2024.findings-acl.119
@@ -7609,7 +7609,7 @@
Chuanyi Liu, Harbin Institute of Technology
2009-2024
The Large Language Model (LLM)-based approach has become mainstream for the Text-to-SQL task and achieves remarkable performance. In this paper, we augment the existing prompt engineering methods by exploiting the database content and execution feedback. Specifically, we introduce DART-SQL, which comprises two key components: (1) Question Rewriting: DART-SQL rewrites natural language questions by leveraging database content information to eliminate ambiguity. (2) Execution-Guided Refinement: DART-SQL incorporates database content information and utilizes the execution results of the generated SQL to iteratively refine the SQL. We apply this framework to two LLM-based approaches (DAIL-SQL and C3) and test it on four widely used benchmarks (Spider-dev, Spider-test, Realistic and DK). Experiments show that our framework for DAIL-SQL and C3 achieves an average improvement of 12.41% and 5.38%, respectively, in terms of the execution accuracy (EX) metric.
- 2024.findings-acl.120
+ 2024.findings-acl.120
mao-etal-2024-enhancing
10.18653/v1/2024.findings-acl.120
@@ -7621,7 +7621,7 @@
William Yang Wang, UC Santa Barbara
2025-2038
Large language models often necessitate grounding on external knowledge to generate faithful and reliable answers. Yet even with the correct groundings in the reference, they can ignore them and rely on wrong groundings or their inherent biases to hallucinate when users, being largely unaware of the specifics of the stored information, pose questions that might not directly correlate with the retrieved groundings. In this work, we formulate this knowledge alignment problem and introduce MixAlign, a framework that interacts with both the human user and the knowledge base to obtain and integrate clarifications on how the user question relates to the stored information. MixAlign employs a language model to achieve automatic knowledge alignment and, if necessary, further enhances this alignment through human user clarifications. Experimental results highlight the crucial role of knowledge alignment in boosting model performance and mitigating hallucination, with improvements noted up to 22.2% and 27.1% respectively. We also demonstrate the effectiveness of MixAlign in improving knowledge alignment by producing high-quality, user-centered clarifications.
- 2024.findings-acl.121
+ 2024.findings-acl.121
zhang-etal-2024-knowledge-alignment
10.18653/v1/2024.findings-acl.121
@@ -7642,7 +7642,7 @@
Anh Tuan Luu, Nanyang Technological University
2039-2056
Knowledge Base Question Answering (KBQA) aims to answer natural language questions over large-scale knowledge bases (KBs), which can be summarized into two crucial steps: knowledge retrieval and semantic parsing. However, three core challenges remain: inefficient knowledge retrieval, retrieval errors that adversely impact semantic parsing, and the complexity of previous KBQA methods. To tackle these challenges, we introduce ChatKBQA, a novel and simple generate-then-retrieve KBQA framework, which proposes first generating the logical form with fine-tuned LLMs, then retrieving and replacing entities and relations with an unsupervised retrieval method, to improve both generation and retrieval more directly. Experimental results show that ChatKBQA achieves new state-of-the-art performance on standard KBQA datasets, WebQSP and CWQ. This work can also be regarded as a new paradigm for combining LLMs with knowledge graphs (KGs) for interpretable and knowledge-required question answering.
- 2024.findings-acl.122
+ 2024.findings-acl.122
luo-etal-2024-chatkbqa
10.18653/v1/2024.findings-acl.122
@@ -7657,7 +7657,7 @@
Jingjing Xu
2057-2080
With promising yet saturated results in high-resource settings, low-resource datasets have gradually become crucial benchmarks (e.g., BigBench Hard, SuperGLUE) for evaluating the learning ability of advanced neural networks. In this work, we find that there exists a set of “hard examples” in low-resource settings that challenge neural networks but are not well evaluated, which causes overestimated performance. We first give a theoretical analysis of the factors that make low-resource learning difficult. This motivates us to propose a challenging benchmark, Achilles-Bench, to better evaluate learning ability; it covers 11 datasets, including 8 natural language processing (NLP) datasets and 3 computer vision (CV) datasets. Experiments on a wide range of models show that neural networks, even pre-trained language models, have sharp performance drops on our benchmark, demonstrating the effectiveness of evaluating the weaknesses of neural networks. On NLP tasks, we surprisingly find that despite better results on traditional low-resource benchmarks, pre-trained networks do not show performance improvements on our benchmarks. There is still a large robustness gap between existing models and human-level performance, highlighting the need for robust low-resource learning models.
- 2024.findings-acl.123
+ 2024.findings-acl.123
wang-etal-2024-achilles
10.18653/v1/2024.findings-acl.123
@@ -7672,7 +7672,7 @@
Ge Yu
2081-2107
This paper introduces INTERVENOR (INTERactiVE chaiN Of Repair), a system designed to emulate the interactive code repair processes observed in humans, encompassing both code diagnosis and code repair. INTERVENOR prompts Large Language Models (LLMs) to play distinct roles during the code repair process, functioning as both a Code Learner and a Code Teacher. Specifically, the Code Learner is tasked with adhering to instructions to generate or repair code, while the Code Teacher is responsible for crafting a Chain-of-Repair (CoR) to serve as guidance for the Code Learner. While generating the CoR, the Code Teacher needs to check the code generated by the Code Learner and reassess how to address code bugs based on the error feedback received from compilers. Experimental results demonstrate that INTERVENOR surpasses baseline models, exhibiting improvements of approximately 18% and 4.3% over GPT-3.5 in code generation and code translation tasks, respectively. Our further analyses show that CoR effectively illuminates the reasons behind bugs and outlines solution plans in natural language. With the feedback of code compilers, INTERVENOR can accurately identify syntax errors and assertion errors and provide precise instructions to repair codes. All data and codes are available at [https://github.com/NEUIR/INTERVENOR](https://github.com/NEUIR/INTERVENOR).
- 2024.findings-acl.124
+ 2024.findings-acl.124
wang-etal-2024-intervenor
10.18653/v1/2024.findings-acl.124
@@ -7691,7 +7691,7 @@
Fei Huang, Alibaba Group
2108-2126
Large language models (LLMs) have advanced the development of various AI conversational agents, including role-playing agents that mimic diverse characters and human behaviors. While prior research has predominantly focused on enhancing the conversational capability, role-specific knowledge, and style of these agents, there has been a noticeable gap in assessing their social intelligence. In this paper, we introduce SocialBench, the first benchmark designed to systematically evaluate the sociality of role-playing agents at both individual and group levels of social interactions. SocialBench is constructed from various sources and covers a wide range of 500 characters, over 6,000 question prompts, and 30,800 multi-turn role-playing utterances. We conduct comprehensive evaluations on this benchmark using mainstream LLMs. We find that agents excelling at the individual level do not necessarily show proficiency at the group level. Experimental results on SocialBench confirm its significance as a testbed for assessing the social interaction of role-playing agents. The benchmark is publicly accessible at https://github.com/X-PLUG/RoleInteract.
- 2024.findings-acl.125
+ 2024.findings-acl.125
chen-etal-2024-socialbench
10.18653/v1/2024.findings-acl.125
@@ -7708,7 +7708,7 @@
Qikai Cheng
2127-2137
Evaluating large language models (LLMs) is fundamental, particularly in the context of practical applications. Conventional evaluation methods, typically designed primarily for LLM development, yield numerical scores that ignore the user experience. Therefore, our study shifts the focus from model-centered to human-centered evaluation in the context of AI-powered writing assistance applications. Our proposed metric, termed “Revision Distance,” utilizes LLMs to suggest revision edits that mimic the human writing process. It is determined by counting the revision edits generated by LLMs. Benefiting from the generated revision edit details, our metric can provide a self-explained text evaluation result in a human-understandable manner beyond the context-independent score. Our results show that for the easy-writing task, “Revision Distance” is consistent with established metrics (ROUGE, Bert-score, and GPT-score), but offers more insightful, detailed feedback and better distinguishes between texts. Moreover, in the context of challenging academic writing tasks, our metric still delivers reliable evaluations where other metrics tend to struggle. Furthermore, our metric also holds significant potential for scenarios lacking reference texts.
- 2024.findings-acl.126
+ 2024.findings-acl.126
ma-etal-2024-model
10.18653/v1/2024.findings-acl.126
@@ -7724,7 +7724,7 @@
Yongrui Chen
2138-2148
Incomplete utterance rewriting (IUR) aims to reconstruct an utterance with omitted information and pronouns so that it is standalone and complete given the context. Existing works predominantly focus on simple ellipsis and coreference problems in brief multi-turn dialogues. But in actual scenarios: 1) the context of a dialogue frequently comprises multiple similar candidates for ellipsis and coreference resolution, which are easy to confuse; 2) the number of turns tends to be more extensive, while the content, spanning various topics, also grows more complex. This paper proposes a novel method called CaT to address these issues. In particular, we first devise a tracker model, distilled from GPT4-turbo, to perform Context Tracking, dynamically updating a list of key phrases turn by turn as accurate candidates for ellipsis and coreference resolution. Second, we further present a Dynamic Context Introduction mechanism that filters irrelevant preceding contexts not relied on by any element within the key phrase list, to condense extended dialogues. Comprehensive experiments indicate that our solution provides a significant improvement over the existing baselines, and achieves state-of-the-art results on three benchmarks.
- 2024.findings-acl.127
+ 2024.findings-acl.127
guo-etal-2024-context
10.18653/v1/2024.findings-acl.127
@@ -7737,7 +7737,7 @@
Yanghua Xiao, Fudan University
2149-2176
Emotional intelligence in large language models (LLMs) is of great importance in Natural Language Processing. However, previous research has mainly focused on basic sentiment analysis tasks, such as emotion recognition, which is not enough to evaluate LLMs’ overall emotional intelligence. Therefore, this paper presents a novel framework named EmotionQueen for evaluating the emotional intelligence of LLMs. The framework includes four distinctive tasks: Key Event Recognition, Mixed Event Recognition, Implicit Emotional Recognition, and Intention Recognition. LLMs are requested to recognize important events or implicit emotions and generate empathetic responses. We also design two metrics to evaluate LLMs’ capabilities in recognition and response for emotion-related statements. Experiments yield significant conclusions about LLMs’ capabilities and limitations in emotional intelligence.
- 2024.findings-acl.128
+ 2024.findings-acl.128
chen-etal-2024-emotionqueen
10.18653/v1/2024.findings-acl.128
@@ -7754,7 +7754,7 @@
Tong Zhang, UIUC
2177-2197
Since the emergence of large language models, prompt learning has become a popular method for optimizing and customizing these models. Special prompts, such as Chain-of-Thought, have even revealed previously unknown reasoning capabilities within these models. However, the progress of discovering effective prompts has been slow, driving a desire for general prompt optimization methods. Unfortunately, few existing prompt learning methods satisfy the criteria of being truly “general”, i.e., automatic, discrete, black-box, gradient-free, and interpretable all at once. In this paper, we introduce metaheuristics, a branch of discrete non-convex optimization methods with over 100 options, as a promising approach to prompt learning. Within our paradigm, we test six typical methods: hill climbing, simulated annealing, genetic algorithms with/without crossover, tabu search, and harmony search, demonstrating their effectiveness in white-box and black-box prompt learning. Furthermore, we show that these methods can be used to discover more human-understandable prompts that were previously unknown in both reasoning and image generation tasks, opening the door to a cornucopia of possibilities in prompt optimization.
- 2024.findings-acl.129
+ 2024.findings-acl.129
pan-etal-2024-plum
10.18653/v1/2024.findings-acl.129
@@ -7769,7 +7769,7 @@
Yanghua Xiao, Fudan University
2198-2224
In the era of social media video platforms, popular “hot-comments” play a crucial role in attracting user impressions of short-form videos, making them vital for marketing and branding purposes. However, existing research predominantly focuses on generating descriptive comments or “danmaku” in English, offering immediate reactions to specific video moments. Addressing this gap, our study introduces HOTVCOM, the largest Chinese video hot-comment dataset, comprising 94k diverse videos and 137 million comments. We also present the ComHeat framework, which synergistically integrates visual, auditory, and textual data to generate influential hot-comments on the Chinese video dataset. Empirical evaluations highlight the effectiveness of our framework, demonstrating its excellence on both the newly constructed and existing datasets.
- 2024.findings-acl.130
+ 2024.findings-acl.130
chen-etal-2024-hotvcom
10.18653/v1/2024.findings-acl.130
@@ -7783,7 +7783,7 @@
Yanghua Xiao, Fudan University
2225-2238
The evaluation of the problem-solving capability of Large Language Models (LLMs) under incomplete information scenarios is increasingly important, encompassing capabilities such as questioning, knowledge search, error detection, and path planning. Current research mainly focuses on LLMs’ problem-solving capability in games such as “Twenty Questions”. However, these kinds of games do not require recognizing misleading cues, which is necessary in incomplete information scenarios. Moreover, existing games such as “Who is undercover” are highly subjective, making them challenging to evaluate. Therefore, in this paper, we introduce a novel game named BrainKing, based on “Who is undercover” and “Twenty Questions”, for evaluating LLM capabilities under incomplete information scenarios. It requires LLMs to identify target entities with limited yes-or-no questions and potential misleading answers. By setting up easy, medium, and hard difficulty modes, we comprehensively assess the performance of LLMs across various aspects. Our results reveal the capabilities and limitations of LLMs in BrainKing, providing significant insights into LLM problem-solving levels.
- 2024.findings-acl.131
+ 2024.findings-acl.131
chen-etal-2024-large
10.18653/v1/2024.findings-acl.131
@@ -7793,7 +7793,7 @@
Marek Rei, Imperial College London
2239-2258
Knowledge distillation optimises a smaller student model to behave similarly to a larger teacher model, retaining some of the performance benefits. While this method can improve results on in-distribution examples, it does not necessarily generalise to out-of-distribution (OOD) settings. We investigate two complementary methods for improving the robustness of the resulting student models on OOD domains. The first approach augments the distillation with generated unlabeled examples that match the target distribution. The second method upsamples data points among the training set that are similar to the target distribution. When applied on the task of natural language inference (NLI), our experiments on MNLI show that distillation with these modifications outperforms previous robustness solutions. We also find that these methods improve performance on OOD domains even beyond the target domain.
- 2024.findings-acl.132
+ 2024.findings-acl.132
stacey-rei-2024-distilling
10.18653/v1/2024.findings-acl.132
@@ -7807,7 +7807,7 @@
Jason Baldridge, Google
2259-2273
Similar to vision-and-language navigation (VLN) tasks that focus on bridging the gap between vision and language for embodied navigation, the new Rendezvous (RVS) task requires reasoning over allocentric spatial relationships using non-sequential navigation instructions and maps. However, performance substantially drops in new environments with no training data. Using open-source descriptions paired with coordinates (e.g., Wikipedia) provides training data but suffers from limited spatially-oriented text, resulting in low geolocation resolution. We propose a large-scale augmentation method for generating high-quality synthetic data for new environments using readily available geospatial data. Our method constructs a grounded knowledge-graph, capturing entity relationships. Sampled entities and relations (“shop north of school”) generate navigation instructions via (i) generating numerous templates using a context-free grammar (CFG) to embed specific entities and relations; and (ii) feeding the entities and relations into a large language model (LLM) for instruction generation. A comprehensive evaluation on RVS showed that our approach improves the 100-meter accuracy by 45.83% on unseen environments. Furthermore, we demonstrate that models trained with CFG-based augmentation achieve superior performance compared with those trained with LLM-based augmentation, in both unseen and seen environments. These findings suggest that explicitly structuring spatial information for text-based geospatial reasoning in previously unknown environments can unlock data-scarce scenarios.
- 2024.findings-acl.133
+ 2024.findings-acl.133
paz-argaman-etal-2024-unknown
10.18653/v1/2024.findings-acl.133
@@ -7822,7 +7822,7 @@
Reut Tsarfaty, Google and Bar-Ilan University, Technion
2274-2286
Despite compression being the cornerstone of BPE, the most common tokenization algorithm, its importance in the tokenization process is still unclear. In this paper, we argue for the theoretical importance of compression, which can be viewed as 0-gram language modeling where equal probability is assigned to all tokens. We also demonstrate the empirical importance of compression for the downstream success of pre-trained language models. We control the compression ability of several BPE tokenizers by varying the amount of documents available during their training: from 1 million documents to a character-based tokenizer equivalent to no training data at all. We then pre-train English language models based on those tokenizers and fine-tune them over several tasks. We show that there is a correlation between tokenizers’ compression and models’ downstream performance, suggesting that compression is a reliable intrinsic indicator of tokenization quality. These correlations are more pronounced for generation tasks (over classification) or for smaller models (over large ones). We replicated a representative part of our experiments on Turkish and found similar results, confirming that our results hold for languages with typological characteristics dissimilar to English. We conclude that building better compressing tokenizers is a fruitful avenue for further research and for improving overall model performance.
- 2024.findings-acl.134
+ 2024.findings-acl.134
goldman-etal-2024-unpacking
10.18653/v1/2024.findings-acl.134
@@ -7840,7 +7840,7 @@
Heuiseok Lim, Korea University
2287-2303
Byte Pair Encoding is an effective approach in machine translation across several languages. However, our analysis indicates that BPE is prone to over-segmentation in the morphologically rich language, Korean, which can erode word semantics and lead to semantic confusion during training. This semantic confusion, stemming from over-segmentation, ultimately contributes to a degradation of overall translation quality. To address this issue, we introduce Length-aware Subword Vocabulary Construction (LeVoC), a novel approach strategically incorporating longer words into the vocabulary. By utilizing an external monolingual Korean corpus, LeVoC extracts and integrates long words, effectively preserving morphological information and reducing semantic confusion. Our experiments demonstrate that LeVoC not only significantly outperforms BPE, but also can be applied to and surpass current state-of-the-art morpheme-aware subword tokenization methods. We provide evidence that the difficulty in translating sentences with long words in Korean is associated with morphological compositionality, and LeVoC’s ability to reduce semantic confusion during training leads to improved translation quality.
- 2024.findings-acl.135
+ 2024.findings-acl.135
lee-etal-2024-length
10.18653/v1/2024.findings-acl.135
@@ -7855,7 +7855,7 @@
Matan Eyal, Allen Institute for Artificial Intelligence
2304-2317
As instruction-tuned large language models (LLMs) gain global adoption, their ability to follow instructions in multiple languages becomes increasingly crucial. In this work, we investigate how multilinguality during instruction tuning of a multilingual LLM affects instruction-following across languages from the pre-training corpus. We first show that many languages transfer some instruction-following capabilities to other languages from even monolingual tuning. Furthermore, we find that only 40 multilingual examples integrated in an English tuning set substantially improve multilingual instruction-following, both in seen and unseen languages during tuning. In general, we observe that models tuned on multilingual mixtures exhibit comparable or superior performance in multiple languages compared to monolingually tuned models, despite training on 10x fewer examples in those languages. Finally, we find that diversifying the instruction tuning set with even just 2-4 languages significantly improves cross-lingual generalization. Our results suggest that building massively multilingual instruction-tuned models can be done with only a very small set of multilingual instruction-responses.
- 2024.findings-acl.136
+ 2024.findings-acl.136
shaham-etal-2024-multilingual
10.18653/v1/2024.findings-acl.136
@@ -7869,7 +7869,7 @@
Zheng Liu
2318-2335
In this paper, we introduce a new embedding model called M3-Embedding, which is distinguished for its versatility in Multi-Linguality, Multi-Functionality, and Multi-Granularity. It provides uniform support for the semantic retrieval of more than 100 working languages. It can simultaneously accomplish the three common retrieval functionalities: dense retrieval, multi-vector retrieval, and sparse retrieval. It is also capable of processing inputs of different granularities, spanning from short sentences to long documents of up to 8,192 tokens. The effective training of M3-Embedding presents a series of technical contributions. Notably, we propose a novel self-knowledge distillation approach, where the relevance scores from different retrieval functionalities can be integrated as the teacher signal to enhance the training quality. We also optimize the batching strategy, which enables a large batch size and high training throughput to improve the discriminativeness of the embeddings. M3-Embedding exhibits superior performance in our experiments, leading to new state-of-the-art results on multilingual, cross-lingual, and long-document retrieval benchmarks.
- 2024.findings-acl.137
+ 2024.findings-acl.137
chen-etal-2024-m3
10.18653/v1/2024.findings-acl.137
@@ -7887,7 +7887,7 @@
Xuanhua Shi, Huazhong University of Science and Technology
2336-2353
Large Language Models (LLMs) have shown remarkable progress in automated code generation. Yet, LLM-generated code may contain errors in API usage, classes, or data structures, or may be missing project-specific information. As much of this project-specific context cannot fit into the prompts of LLMs, we must find ways to allow the model to explore the project-level code context. We present CoCoGen, a new code generation approach that uses compiler feedback to improve the LLM-generated code. CoCoGen first leverages static analysis to identify mismatches between the generated code and the project’s context. It then iteratively aligns and fixes the identified errors using information extracted from the code repository. We integrate CoCoGen with two representative LLMs, i.e., GPT-3.5-Turbo and Code Llama (13B), and apply it to Python code generation. Experimental results show that CoCoGen significantly improves the vanilla LLMs by over 80% in generating code dependent on the project context and consistently outperforms the existing retrieval-based code generation baselines.
- 2024.findings-acl.138
+ 2024.findings-acl.138
bi-etal-2024-iterative
10.18653/v1/2024.findings-acl.138
@@ -7901,7 +7901,7 @@
Kelong Mao
2354-2365
Legal case retrieval plays an important role in promoting judicial justice and fairness. One of its greatest challenges is that the definition of relevance goes far beyond the common semantic relevance as in ad-hoc retrieval. In this paper, we reveal that the legal elements, which typically comprise key facts in a specialized legal context, can largely improve the relevance matching of legal case retrieval. To facilitate the use of legal elements, we construct a Chinese legal element dataset called LeCaRD-Elem based on the widely-used LeCaRD dataset, through a two-stage semi-automatic method with a minimized reliance on human labor. Meanwhile, we introduce two new models to enhance legal search using legal elements. The first, Elem4LCR-E, is a two-stage model that explicitly predicts legal elements from texts and then leverages them for improved ranking. Recognizing the potential benefits of more seamless integration, we further propose an end-to-end model called Elem4LCR-I, which internalizes the legal element knowledge into its model parameters using a tailored teacher-student training framework. Extensive experiments underscore the significant value of legal elements and demonstrate the superiority of our two proposed models in enhancing legal search over existing methods.
- 2024.findings-acl.139
+ 2024.findings-acl.139
deng-etal-2024-element
10.18653/v1/2024.findings-acl.139
@@ -7918,7 +7918,7 @@
Zhongyu Wei, Fudan University
2366-2389
The growth of social media, characterized by its multimodal nature, has led to the emergence of diverse phenomena and challenges, which calls for an effective approach to uniformly solve automated tasks. The powerful Large Vision Language Models make it possible to handle a variety of tasks simultaneously, but even with carefully designed prompting methods, the general domain models often fall short in aligning with the unique speaking style and context of social media tasks. In this paper, we introduce a Large Vision Language Model for Social Media Processing (SoMeLVLM), which is a cognitive framework equipped with five key capabilities including knowledge & comprehension, application, analysis, evaluation, and creation. SoMeLVLM is designed to understand and generate realistic social media behavior. We have developed a 654k multimodal social media instruction-tuning dataset to support our cognitive framework and fine-tune our model. Our experiments demonstrate that SoMeLVLM achieves state-of-the-art performance in multiple social media tasks. Further analysis shows its significant advantages over baselines in terms of cognitive abilities.
- 2024.findings-acl.140
+ 2024.findings-acl.140
zhang-etal-2024-somelvlm
10.18653/v1/2024.findings-acl.140
@@ -7933,7 +7933,7 @@
Heuiseok Lim, Korea University
2390-2415
The evolution of large language models (LLMs) has culminated in a multitask model paradigm where prompts drive the generation of user-specific outputs. However, this advancement has revealed a critical challenge: LLMs frequently produce outputs against socially acceptable commonsense standards in various scenarios. To address this gap in commonsense reasoning, we present KoCommonGEN v2, a fine-grained benchmark dataset focused on Korean commonsense reasoning. This dataset, enriched with human annotations, comprises multiple-choice questions across seven error categories. These categories include commonsense memorization, numerical commonsense, toxic speech, and more, which are vulnerable to undermining the reliability of LLMs’ commonsense reasoning capabilities. The empirical results show that LLMs struggle with Korean commonsense reasoning. With human accuracy benchmarked at approximately 85%, GPT-4’s performance lags at about 74%, and other LLMs demonstrate an average accuracy of around 42%. Our findings emphasize the need for targeted improvements in Korean commonsense reasoning within LLMs, paving the way for more socially and contextually sensitive AI models.
- 2024.findings-acl.141
+ 2024.findings-acl.141
seo-etal-2024-kocommongen
10.18653/v1/2024.findings-acl.141
@@ -7949,7 +7949,7 @@
Georgios Kollias, International Business Machines
2416-2430
Transformer-based Language Models have become ubiquitous in Natural Language Processing (NLP) due to their impressive performance on various tasks. However, expensive training as well as inference remains a significant impediment to their widespread applicability. While enforcing sparsity at various levels of the model architecture has found promise in addressing scaling and efficiency issues, there remains a limited understanding of how sparsity affects network topology. Inspired by brain neuronal networks, we explore sparsity approaches through the lens of network topology. Specifically, we exploit mechanisms seen in biological networks, such as preferential attachment and redundant synapse pruning, and show that principled, model-agnostic sparsity approaches are performant and efficient across diverse NLP tasks, spanning both classification (such as natural language inference) and generation (summarization, machine translation), even though optimizing performance is not our sole objective. NeuroPrune is competitive with (or sometimes superior to) baselines on performance and can be up to 10x faster in terms of training time for a given level of sparsity, simultaneously exhibiting measurable improvements in inference time in many cases.
- 2024.findings-acl.142
+ 2024.findings-acl.142
dhurandhar-etal-2024-neuroprune
10.18653/v1/2024.findings-acl.142
@@ -7963,7 +7963,7 @@
Karthikeyan Natesan Ramamurthy, International Business Machines
2431-2452
Evaluation and ranking of large language models (LLMs) have become an important problem with the proliferation of these models and their impact. Evaluation methods either require human responses, which are expensive to acquire, or use pairs of LLMs to evaluate each other, which can be unreliable. In this paper, we provide a novel perspective where, given a dataset of prompts (viz. questions, instructions, etc.) and a set of LLMs, we rank them without access to any ground truth or reference responses. Inspired by real life, where both an expert and a knowledgeable person can identify a novice, our main idea is to consider triplets of models, where each one of them evaluates the other two, correctly identifying the worst model in the triplet with high probability. We also analyze our idea and provide sufficient conditions for it to succeed. Applying this idea repeatedly, we propose two methods to rank LLMs. In experiments on different generative tasks (summarization, multiple-choice, and dialog), our methods reliably recover true rankings without reference data. This points to a viable low-resource mechanism for practical use.
- 2024.findings-acl.143
+ 2024.findings-acl.143
dhurandhar-etal-2024-ranking
10.18653/v1/2024.findings-acl.143
@@ -7979,7 +7979,7 @@
Zhengwei Tao
2453-2473
The utilization of large language models for medical dialogue generation has attracted considerable attention due to its potential to enhance response richness and coherence. While previous studies have made strides in optimizing model performance, there is a pressing need to bolster the model’s capacity for diagnostic logic to ensure patient safety. In response to this need, we propose an approach termed preference learning from process feedback (PLPF), which involves integrating the doctor’s diagnostic logic into LLMs. PLPF encompasses three key components: rule modeling, preference data generation, and preference alignment. These components collectively serve to train the model to adhere to the diagnostic process. Our experimental results, utilizing Standardized Patient Testing, demonstrate that PLPF enhances the diagnostic accuracy of the baseline model in medical conversations by 17.6%, surpassing the performance of traditional approaches. Moreover, PLPF exhibits effectiveness in both multi-round and single-round dialogue tasks, thereby highlighting its potential in improving medical dialogue generation. Our dataset is available at https://github.com/Chengfeng-Dou/SpTesting.
- 2024.findings-acl.144
+ 2024.findings-acl.144
dou-etal-2024-integrating
10.18653/v1/2024.findings-acl.144
@@ -7991,7 +7991,7 @@
Xingrun Xing
2474-2488
Pre-trained language models are continually fine-tuned to better support downstream applications. However, this operation may result in significant performance degradation on general tasks beyond the targeted domain. To overcome this problem, we propose LM-Cocktail, which enables the fine-tuned model to stay resilient on general tasks. Our method is conducted in the form of model merging, where the fine-tuned language model is merged with the pre-trained base model or peer models from other domains through a weighted average. Despite its simplicity, LM-Cocktail is surprisingly effective: the resulting model is able to achieve strong empirical performance across the whole scope of general tasks while preserving a superior capacity in its targeted domain.
- 2024.findings-acl.145
+ 2024.findings-acl.145
xiao-etal-2024-lm
10.18653/v1/2024.findings-acl.145
@@ -8003,7 +8003,7 @@
Tieyun Qian, Wuhan University
2489-2511
Large language models (LLMs) have achieved satisfactory performance in counterfactual generation. However, confined by the stochastic generation process of LLMs, there often are misalignments between LLMs and humans which hinder LLMs from handling complex tasks like relation extraction. As a result, LLMs may generate commonsense-violating counterfactuals like ‘eggs were produced by a box’. To bridge this gap, we propose to mimic episodic memory retrieval, the working mechanism of the human hippocampus, to align LLMs’ generation process with that of humans. In this way, LLMs can derive experience from their extensive memory, which is in line with the way humans gain commonsense. We then implement two central functions of the hippocampus, i.e., pattern separation and pattern completion, to retrieve the episodic memory from LLMs and generate commonsense counterfactuals for relation extraction. Experimental results demonstrate the improvements of our framework over existing methods in terms of the quality of counterfactuals.
- 2024.findings-acl.146
+ 2024.findings-acl.146
miao-etal-2024-episodic
10.18653/v1/2024.findings-acl.146
@@ -8038,7 +8038,7 @@
Saif Mohammad, National Research Council Canada
2512-2530
Exploring and quantifying semantic relatedness is central to representing language and holds significant implications across various NLP tasks. While earlier NLP research primarily focused on semantic similarity, often within the English language context, we instead investigate the broader phenomenon of semantic relatedness. In this paper, we present SemRel, a new semantic relatedness dataset collection annotated by native speakers across 13 languages: Afrikaans, Algerian Arabic, Amharic, English, Hausa, Hindi, Indonesian, Kinyarwanda, Marathi, Moroccan Arabic, Modern Standard Arabic, Spanish, and Telugu. These languages originate from five distinct language families and are predominantly spoken in Africa and Asia – regions characterised by a relatively limited availability of NLP resources. Each instance in the SemRel datasets is a sentence pair associated with a score that represents the degree of semantic textual relatedness between the two sentences. The scores are obtained using a comparative annotation framework. We describe the data collection and annotation processes, challenges when building the datasets, baseline experiments, and their impact and utility in NLP.
- 2024.findings-acl.147
+ 2024.findings-acl.147
ousidhoum-etal-2024-semrel2024
10.18653/v1/2024.findings-acl.147
@@ -8049,7 +8049,7 @@
Xiaojun Quan, SUN YAT-SEN UNIVERSITY
2531-2546
Chinese grammatical error correction (CGEC) faces serious overcorrection challenges when employing autoregressive generative models such as sequence-to-sequence (Seq2Seq) models and decoder-only large language models (LLMs). While previous methods aim to address overcorrection in Seq2Seq models, they are difficult to adapt to decoder-only LLMs. In this paper, we propose an alignment-enhanced corrector for the overcorrection problem that applies to both Seq2Seq models and decoder-only LLMs. Our method first trains a correction model to generate an initial correction of the source sentence. Then, we combine the source sentence with the initial correction and feed it through an alignment model for another round of correction, aiming to make the alignment model focus on potential overcorrections. Moreover, to enhance the model’s ability to identify nuances, we further explore the reverse alignment of the source sentence and the initial correction. Finally, we transfer the alignment knowledge from the two alignment models to the correction model, instructing it on how to avoid overcorrection. Experimental results on three CGEC datasets demonstrate the effectiveness of our approach in alleviating overcorrection and improving overall performance. Our code has been made publicly available.
- 2024.findings-acl.148
+ 2024.findings-acl.148
yang-quan-2024-alirector
10.18653/v1/2024.findings-acl.148
@@ -8061,7 +8061,7 @@
Aykut Koc, Bilkent University
2547-2556
The emergence of transformers has revolutionized natural language processing (NLP), as evidenced in various NLP tasks. While graph neural networks (GNNs) show recent promise in NLP, they are not standalone replacements for transformers. Rather, recent research explores combining transformers and GNNs. Existing GNN-based approaches rely on static graph construction methods requiring excessive text processing, and most of them are not scalable with increasing document and word counts. We address these limitations by proposing a novel dynamic graph construction method for text documents based on vector visibility graphs (VVGs) generated from transformer output. Then, we introduce the visibility pooler (VISPool), a scalable model architecture that seamlessly integrates VVG convolutional networks into transformer pipelines. We evaluate the proposed model on the General Language Understanding Evaluation (GLUE) benchmark datasets. VISPool outperforms the baselines with fewer trainable parameters, demonstrating the viability of the visibility-based graph construction method for enhancing transformers with GNNs.
- 2024.findings-acl.149
+ 2024.findings-acl.149
alikasifoglu-etal-2024-vispool
10.18653/v1/2024.findings-acl.149
@@ -8074,7 +8074,7 @@
Saif Mohammad, National Research Council Canada
2557-2574
Stories are rich in the emotions they exhibit in their narratives and evoke in the readers. The emotional journeys of the various characters within a story are central to their appeal. Computational analysis of the emotions of novels, however, has rarely examined the variation in the emotional trajectories of the different characters within them, instead considering the entire novel to represent a single story arc. In this work, we use character dialogue to distinguish between the emotion arcs of the narration and the various characters. We analyze the emotion arcs of the various characters in a dataset of English literary novels using the framework of Utterance Emotion Dynamics. Our findings show that the narration and the dialogue largely express disparate emotions through the course of a novel, and that the commonalities or differences in the emotional arcs of stories are more accurately captured by those associated with individual characters.
- 2024.findings-acl.150
+ 2024.findings-acl.150
vishnubhotla-etal-2024-emotion
10.18653/v1/2024.findings-acl.150
@@ -8084,7 +8084,7 @@
Denilson Barbosa, University of Alberta
2575-2587
Open-domain question answering (Open-QA) is a common task for evaluating large language models (LLMs). However, current Open-QA evaluations are criticized for the ambiguity in questions and the lack of semantic understanding in evaluators. Complex evaluators, powered by foundation models or LLMs and pertaining to semantic equivalence, still deviate from human judgments by a large margin. We propose to study the entailment relations of answers to identify more informative and more general system answers, offering an evaluation much closer to human judgment on both NaturalQuestions and TriviaQA while being learning-free. The entailment-based evaluation we propose allows the assignment of bonus or partial marks by quantifying the inference gap between answers, enabling a nuanced ranking of answer correctness that has higher AUC than current methods.
- 2024.findings-acl.151
+ 2024.findings-acl.151
yao-barbosa-2024-accurate
10.18653/v1/2024.findings-acl.151
@@ -8096,7 +8096,7 @@
Antonios Anastasopoulos, Athena Research Center and George Mason University
2588-2595
Multi-word expressions (MWEs) present unique challenges in natural language processing (NLP), particularly within the context of translation systems, due to their inherent scarcity, non-compositional nature, and other distinct lexical and morphosyntactic characteristics, issues that are exacerbated in low-resource settings. In this study, we elucidate and attempt to address these challenges by leveraging a substantial corpus of human-annotated Greek MWEs. To address the complexity of translating such phrases, we propose a novel method leveraging an available out-of-context lexicon. We assess the translation capabilities of current state-of-the-art systems on this task, employing both automated metrics and human evaluators. We find that by using our method when applicable, the performance of current systems can be significantly improved; however, these models are still unable to produce translations comparable to those of a human speaker.
- 2024.findings-acl.152
+ 2024.findings-acl.152
dimakis-etal-2024-dictionary
10.18653/v1/2024.findings-acl.152
@@ -8109,7 +8109,7 @@
Cheng-Lin Liu, Institute of Automation, Chinese Academy of Sciences
2596-2608
Geometry problem solving (GPS) is a challenging mathematical reasoning task requiring multi-modal understanding, fusion, and reasoning. Existing neural solvers take GPS as a vision-language task but are short in the representation of geometry diagrams that carry rich and complex layout information. In this paper, we propose a layout-aware neural solver named LANS, integrated with two new modules: multimodal layout-aware pre-trained language module (MLA-PLM) and layout-aware fusion attention (LA-FA). MLA-PLM adopts structural-semantic pre-training (SSP) to implement global relationship modeling, and point-match pre-training (PMP) to achieve alignment between visual points and textual points. LA-FA employs a layout-aware attention mask to realize point-guided cross-modal fusion for further boosting layout awareness of LANS. Extensive experiments on datasets Geometry3K and PGPS9K validate the effectiveness of the layout-aware modules and superior problem-solving performance of our LANS solver, over existing symbolic and neural solvers. We have made our code and data publicly available.
- 2024.findings-acl.153
+ 2024.findings-acl.153
li-etal-2024-lans
10.18653/v1/2024.findings-acl.153
@@ -8125,7 +8125,7 @@
Yulia Tsvetkov, Department of Computer Science, University of Washington
2609-2636
We propose Knowledge Crosswords, a geometric knowledge reasoning benchmark consisting of incomplete knowledge networks bounded by structured factual constraints, where LLMs are tasked with inferring the missing facts to meet all constraints. The novel setting of geometric knowledge reasoning necessitates new LM abilities beyond existing atomic/linear multi-hop QA, such as backtracking, verifying facts and constraints, reasoning with uncertainty, and more. Knowledge Crosswords contains 2,101 individual problems, covering diverse knowledge domains, and is further divided into three difficulty levels. We conduct extensive experiments to evaluate existing LLMs and approaches on Knowledge Crosswords. Results demonstrate that baseline approaches struggle with larger knowledge networks and semantically-equivalent entity distractors. In light of their limitations, we propose two new approaches, Staged Prompting and Verify-All, to augment LLMs’ abilities for error-aware backtracking and constraint verification. Our Verify-All significantly outperforms prior methods and is more robust towards problems in the hard subset. Further analysis shows that geometric knowledge reasoning poses new challenges to LLMs’ knowledge abilities, particularly in robustness towards varying option orders, complex structural constraints in knowledge networks, “none of the above” scenarios, and more.
- 2024.findings-acl.154
+ 2024.findings-acl.154
ding-etal-2024-knowledge
10.18653/v1/2024.findings-acl.154
@@ -8140,7 +8140,7 @@
Minnan Luo, Xi’an Jiaotong University
2637-2667
Large language models are limited by challenges in factuality and hallucination and hence cannot be directly employed off-the-shelf for judging the veracity of news articles, where factual accuracy is paramount. In this work, we propose DELL, which identifies three key stages in misinformation detection where LLMs could be incorporated as part of the pipeline: 1) LLMs could generate news reactions to represent diverse perspectives and simulate user-news interaction networks; 2) LLMs could generate explanations for proxy tasks (e.g., sentiment, stance) to enrich the contexts of news articles and produce experts specializing in various aspects of news understanding; 3) LLMs could merge task-specific experts and provide an overall prediction by incorporating the predictions and confidence scores of varying experts. Extensive experiments on seven datasets with three LLMs demonstrate that DELL outperforms state-of-the-art baselines by up to 16.8% in macro f1-score. Further analysis reveals that the generated reactions and explanations are greatly helpful in misinformation detection, while our proposed LLM-guided expert merging helps produce better-calibrated predictions.
- 2024.findings-acl.155
+ 2024.findings-acl.155
wan-etal-2024-dell
10.18653/v1/2024.findings-acl.155
@@ -8158,7 +8158,7 @@
Daniel Khashabi, Johns Hopkins University
2668-2680
As the influence of large language models (LLMs) spans across global communities, their safety challenges in multilingual settings become paramount for alignment research. This paper examines the variations in safety challenges faced by LLMs across different languages and discusses approaches to alleviating such concerns. By comparing how state-of-the-art LLMs respond to the same set of malicious prompts written in higher- vs. lower-resource languages, we observe that (1) LLMs tend to generate unsafe responses much more often when a malicious prompt is written in a lower-resource language, and (2) LLMs tend to generate more irrelevant responses to malicious prompts in lower-resource languages. To understand where the discrepancy can be attributed, we study the effect of instruction tuning with reinforcement learning from human feedback (RLHF) or supervised finetuning (SFT) on the HH-RLHF dataset. Surprisingly, while training with high-resource languages improves model alignment, training in lower-resource languages yields minimal improvement. This suggests that the bottleneck of cross-lingual alignment is rooted in the pretraining stage. Our findings highlight the challenges in cross-lingual LLM safety, and we hope they inform future research in this direction.
- 2024.findings-acl.156
+ 2024.findings-acl.156
shen-etal-2024-language
10.18653/v1/2024.findings-acl.156
@@ -8175,7 +8175,7 @@
Leonid Karlinsky, IBM Research AI
2681-2706
Recent works have demonstrated the effectiveness of self-alignment, in which a large language model is aligned to follow general instructions using instructional data generated from the model itself starting from a handful of human-written seeds. Instead of general alignment, in this work, we focus on self-alignment for expert domain specialization (e.g., biomedicine, finance). As a preliminary, we quantitatively show the marginal effect that generic instruction-following training has on downstream expert domains’ performance. To remedy this, we propose self-specialization - allowing for effective model specialization while achieving cross-task generalization by leveraging only a few labeled seeds. Self-specialization offers a data- and parameter-efficient way of “carving out” an expert model from a generalist pre-trained LLM. Exploring a variety of popular open large models as a base for specialization, our experimental results in both biomedical and financial domains show that our self-specialized models outperform their base models by a large margin, and even larger models that are generally instruction-tuned or that have been adapted to the target domain by other means.
- 2024.findings-acl.157
+ 2024.findings-acl.157
kang-etal-2024-self
10.18653/v1/2024.findings-acl.157
@@ -8191,7 +8191,7 @@
Yizhou Sun, University of California, Los Angeles
2707-2720
Taxonomy Expansion, which relies on modeling concepts and concept relations, can be formulated as a set representation learning task. The fuzzy set, a generalization of the set, incorporates uncertainty and measures the information within a semantic concept, making it suitable for concept modeling. Existing works usually model sets as vectors or geometric objects such as boxes, which are not closed under set operations. In this work, we propose a sound and efficient formulation of set representation learning based on its volume approximation as a fuzzy set. The resulting embedding framework, Fuzzy Set Embedding (FUSE), satisfies all set operations and compactly approximates the underlying fuzzy set, hence preserving information while being efficient to learn and relying on a minimal neural architecture. We empirically demonstrate the power of FUSE on the task of taxonomy expansion, where FUSE achieves remarkable improvements of up to 23% compared with existing baselines. Our work marks the first attempt to understand and efficiently compute the embeddings of fuzzy sets.
- 2024.findings-acl.158
+ 2024.findings-acl.158
xu-etal-2024-fuse
10.18653/v1/2024.findings-acl.158
@@ -8204,7 +8204,7 @@
Rajiv Jain, Adobe Systems
2721-2733
Rule-based reasoning, a fundamental type of legal reasoning, enables us to draw conclusions by accurately applying a rule to a set of facts. We explore causal language models as rule-based reasoners, specifically with respect to compositional rules - rules consisting of multiple elements which form a complex logical expression. Reasoning about compositional rules is challenging because it requires multiple reasoning steps, and attending to the logical relationships between elements. We introduce a new prompting method, Chain of Logic, which elicits rule-based reasoning through decomposition (solving elements as independent threads of logic), and recomposition (recombining these sub-answers to resolve the underlying logical expression). This method was inspired by the IRAC (Issue, Rule, Application, Conclusion) framework, a sequential reasoning approach used by lawyers. We evaluate chain of logic across eight rule-based reasoning tasks involving three distinct compositional rules from the LegalBench benchmark and demonstrate it consistently outperforms other prompting methods, including chain of thought and self-ask, using open-source and commercial language models.
- 2024.findings-acl.159
+ 2024.findings-acl.159
servantez-etal-2024-chain
10.18653/v1/2024.findings-acl.159
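To make the decompose-then-recompose idea above concrete, here is a minimal sketch of a Chain of Logic-style prompt builder. The template wording, the element numbering, and the `logic_expr` notation are illustrative assumptions, not the authors' exact prompt:

```python
def chain_of_logic_prompt(rule, elements, logic_expr, facts, question):
    """Build a decompose-then-recompose prompt for a compositional rule."""
    lines = [
        f"Rule: {rule}",
        f"Logical structure of the rule: {logic_expr}",  # e.g. "(1 AND 2) OR 3"
        f"Facts: {facts}",
        f"Question: {question}",
        "Step 1 (decomposition): answer each element as an independent thread of logic.",
    ]
    for i, element in enumerate(elements, start=1):
        lines.append(f"Element {i}: {element} Answer yes or no, with a brief reason.")
    lines.append(
        "Step 2 (recomposition): substitute the element answers into the "
        "logical structure and resolve it to reach the final conclusion."
    )
    return "\n".join(lines)

# hypothetical usage on a toy battery rule with structure (1 AND 2)
prompt = chain_of_logic_prompt(
    rule="A person is liable for battery if they intend to cause harmful "
         "contact and harmful contact results.",
    elements=["The defendant intended to cause harmful contact.",
              "Harmful contact resulted."],
    logic_expr="(1 AND 2)",
    facts="The defendant threw a rock that struck the plaintiff.",
    question="Is the defendant liable for battery?",
)
```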
@@ -8215,7 +8215,7 @@
Hung-yi Lee, National Taiwan University
2734-2751
Long-form generations from large language models (LLMs) contain a mix of factual and non-factual claims, making factuality evaluation difficult. Prior works evaluate the factuality of a long paragraph by decomposing it into multiple facts, verifying those facts independently, and aggregating the results. Such methods assume that combining factual claims forms a factual paragraph. This assumption can be violated: we show that strong open-source models like Llama-chat can generate paragraphs that contain verifiable facts, but the facts are combined into a non-factual paragraph due to entity ambiguity. We further reveal that existing factuality metrics, including FActScore and citation recall, cannot properly evaluate these non-factual paragraphs and overestimate their factuality. To address this, we introduce an enhanced metric, D-FActScore, specifically designed for content with ambiguous entities. We evaluate the D-FActScores of biographies of people generated by retrieval-augmented LLMs. We show that D-FActScore can better assess the factuality of paragraphs with entity ambiguity than FActScore. We also find that four widely used open-source LLMs tend to mix information about distinct entities to form non-factual paragraphs, making their D-FActScore lower than their FActScore by over 10%.
- 2024.findings-acl.160
+ 2024.findings-acl.160
chiang-lee-2024-merging
10.18653/v1/2024.findings-acl.160
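For illustration, the sketch below contrasts a FActScore-style aggregation with an entity-aware variant in the spirit of D-FActScore. This is one plausible reading of the abstract, not the paper's exact definition; the `supported` and `entity_id` fields assume each atomic fact has already been verified against a specific disambiguated entity:

```python
from collections import Counter

def fact_score(facts):
    """FActScore-style aggregation: fraction of supported atomic facts,
    regardless of which real-world entity each fact actually describes."""
    return sum(f["supported"] for f in facts) / len(facts)

def d_fact_score(facts):
    """Entity-aware variant (hypothetical): only supported facts attributable
    to a single dominant entity count, so a paragraph mixing two different
    people who share a name scores lower."""
    supported = [f for f in facts if f["supported"]]
    if not supported:
        return 0.0
    counts = Counter(f["entity_id"] for f in supported)
    _, dominant_count = counts.most_common(1)[0]
    return dominant_count / len(facts)

# toy example: 4 atomic facts, but they describe two different "John Smith"s
facts = [
    {"supported": True,  "entity_id": "john_smith_footballer"},
    {"supported": True,  "entity_id": "john_smith_footballer"},
    {"supported": True,  "entity_id": "john_smith_politician"},
    {"supported": False, "entity_id": "john_smith_politician"},
]
print(fact_score(facts))    # 0.75 -- overestimates factuality
print(d_fact_score(facts))  # 0.50 -- penalizes the entity mix-up
```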
@@ -8229,7 +8229,7 @@
Tal Linzen, New York University and Google
2752-2773
Do LMs infer the semantics of text from co-occurrence patterns in their training data? Merrill et al. (2022) argue that, in theory, sentence co-occurrence probabilities predicted by an optimal LM should reflect the entailment relationship of the constituent sentences, but it is unclear whether probabilities predicted by neural LMs encode entailment in this way because of strong assumptions made by Merrill et al. (namely, that humans always avoid redundancy). In this work, we investigate whether their theory can be used to decode entailment relations from neural LMs. We find that a test similar to theirs can decode entailment relations between natural sentences, well above random chance, though not perfectly, across many datasets and LMs. This suggests LMs implicitly model aspects of semantics to predict semantic effects on sentence co-occurrence patterns. However, we find the test that predicts entailment in practice works in the opposite direction to the theoretical test. We thus revisit the assumptions underlying the original test, finding its derivation did not adequately account for redundancy in human-written text. We argue that better accounting for redundancy related to explanations might explain the observed flipped test and, more generally, improve computational models of speakers in linguistics.
- 2024.findings-acl.161
+ 2024.findings-acl.161
merrill-etal-2024-learn
10.18653/v1/2024.findings-acl.161
@@ -8244,7 +8244,7 @@
Diyi Yang, Stanford University
2774-2788
Psychological inoculation, a strategy designed to build resistance against persuasive misinformation, has shown efficacy in curbing its spread and mitigating its adverse effects at early stages. Despite its effectiveness, the design and optimization of these inoculations typically demand substantial human and financial resources, primarily due to the need for repeated experimental trials. To address these challenges, this paper introduces Simulated Misinformation Susceptibility Tests (SMISTs), leveraging Large Language Models (LLMs) to simulate participant responses in misinformation studies. SMIST employs a life experience-driven simulation methodology, which accounts for various aspects of participants’ backgrounds, to mitigate common issues of caricatures and stereotypes in LLM simulations and enhance response diversity. Our extensive experimentation demonstrates that SMIST, utilizing GPT-4 as the backend model, yields results that align closely with those obtained from human-subject studies in misinformation susceptibility. This alignment suggests that LLMs can effectively serve as proxies in evaluating the impact of psychological inoculations. Moreover, SMIST offers the critical benefit of being applicable to emerging or anticipated misinformation scenarios without exposing human participants to potentially harmful content. This characteristic of SMIST not only preserves participant safety but also expands the scope of misinformation research to include more sensitive or speculative topics.
- 2024.findings-acl.162
+ 2024.findings-acl.162
ma-etal-2024-simulated
10.18653/v1/2024.findings-acl.162
@@ -8257,7 +8257,7 @@
Diyi Yang, Stanford University
2789-2805
As Natural Language Processing (NLP) systems become increasingly integrated into human social life, these technologies will need to increasingly rely on social intelligence. Although there are many valuable datasets that benchmark isolated dimensions of social intelligence, there does not yet exist any body of work to join these threads into a cohesive subfield in which researchers can quickly identify research gaps and future directions. Towards this goal, we build a Social AI Data Infrastructure, which consists of a comprehensive social AI taxonomy and a data library of 480 NLP datasets. Our infrastructure allows us to analyze existing dataset efforts, and also evaluate language models’ performance in different social intelligence aspects. Our analyses demonstrate its utility in enabling a thorough understanding of the current data landscape and providing a holistic perspective on potential directions for future dataset development. We show there is a need for multifaceted datasets, increased diversity in language and culture, more long-tailed social situations, and more interactive data in future social intelligence data efforts.
- 2024.findings-acl.163
+ 2024.findings-acl.163
li-etal-2024-social
10.18653/v1/2024.findings-acl.163
@@ -8270,7 +8270,7 @@
Hai Zhao, Shanghai Jiao Tong University
2806-2813
The prevalent approach for optimizing pre-trained language models in downstream tasks is fine-tuning. However, it is both time-consuming and memory-inefficient. In response, a more efficient method called Prefix Tuning, which inserts learnable vectors into each Transformer layer, has been proposed and proven effective. Recent investigations reveal that prefix tokens carry context-specific information, prompting the hypothesis that enhancing their specialization can improve model performance. To address this, we propose Selective Prefix Tuning (SPT), integrating a selective mechanism inspired by selective self-attention. Additionally, we introduce Selective Loss (SL) to encourage diversity in prefix tokens. Extensive experiments validate the effectiveness of SPT in sentence and token classification tasks. We contribute insight into understanding the role of the prefix in model adaptation.
- 2024.findings-acl.164
+ 2024.findings-acl.164
zhang-etal-2024-selective
10.18653/v1/2024.findings-acl.164
@@ -8281,7 +8281,7 @@
Soroush Vosoughi, Dartmouth College
2814-2827
The rapid proliferation of online content necessitates effective summarization methods, among which dynamic aspect-based summarization stands out. Unlike its traditional counterpart, which assumes a fixed set of known aspects, this approach adapts to the varied aspects of the input text. We introduce a novel multi-objective learning framework employing a Longformer-Encoder-Decoder for this task. The framework optimizes aspect number prediction, minimizes disparity between generated and reference summaries for each aspect, and maximizes dissimilarity across aspect-specific summaries. Extensive experiments show our method significantly outperforms baselines on three diverse datasets, largely due to the effective alignment of generated and reference aspect counts without sacrificing single-aspect summarization quality.
- 2024.findings-acl.165
+ 2024.findings-acl.165
guo-vosoughi-2024-modabs
10.18653/v1/2024.findings-acl.165
@@ -8292,7 +8292,7 @@
Suma Bhat, University of Illinois, Urbana Champaign
2828-2839
Non-compositional expressions are an integral part of natural language, and their meanings cannot be directly derived from the meanings of their component words. Recent work has shown how their processing remains a challenge for pre-trained language models. Here we consider the fact that prior knowledge of their component words is inadequate to infer their meaning as a whole, and that these expressions constitute a long-tailed process in language (based on their frequency in corpora and the continual manner in which new idiomatic expressions come into use). Against this backdrop, this paper studies the ability of recent pre-trained language models to generate non-compositional expressions in English and to learn them continually. Formulating this as a mask infilling task termed CLoNE, the study uncovers the combined challenges of non-compositionality and continual learning. Using a set of three diverse idiomatic expression datasets repurposed for this task, we benchmark different large pre-trained language models and different continual learning methods on the task of non-compositional expression generation. Our experiments on the CLoNE task show that large pre-trained language models are limited in their ability to generate non-compositional expressions, and that available continual learning methods are inadequate for our proposed CLoNE task, which calls for more effective methods for continual learning of non-compositionality. Our datasets and code will be released publicly upon acceptance.
- 2024.findings-acl.166
+ 2024.findings-acl.166
zhou-bhat-2024-non
10.18653/v1/2024.findings-acl.166
@@ -8310,7 +8310,7 @@
Shaoting Zhang, University of North Carolina at Charlotte
2840-2861
This paper surveys and organizes research on medical dialogue systems, an important yet challenging task. Although these systems have been surveyed in the medical community from an application perspective, a systematic review from a rigorous technical perspective has to date remained noticeably absent. As a result, an overview of the categories, methods, and evaluation of medical dialogue systems remains limited and underspecified, hindering further progress in this area. To fill this gap, we investigate an initial pool of 325 papers from well-known computer science and natural language processing conferences and journals, and provide an overview. Recently, large language models have shown strong capability on downstream tasks, which also reshapes the foundation of medical dialogue systems. Despite their alluring practical application value, current medical dialogue systems still face open problems. To this end, this paper lists the grand challenges of medical dialogue systems, especially those involving large language models.
- 2024.findings-acl.167
+ 2024.findings-acl.167
shi-etal-2024-medical
10.18653/v1/2024.findings-acl.167
@@ -8325,7 +8325,7 @@
Gholamreza Haffari, Monash University
2862-2883
Large language models (LLMs) have demonstrated strong reasoning abilities when prompted to generate chain-of-thought (CoT) explanations alongside answers. However, previous research on evaluating LLMs has solely focused on answer accuracy, neglecting the correctness of the generated CoT. In this paper, we delve deeper into the CoT reasoning capabilities of LLMs in multi-hop question answering by utilizing knowledge graphs (KGs). We propose a novel discriminative and generative CoT evaluation paradigm to assess LLMs’ knowledge of reasoning and the accuracy of the generated CoT. Through experiments conducted on 5 different families of LLMs across 2 multi-hop question-answering datasets, we find that LLMs possess sufficient knowledge to perform reasoning. However, there exists a significant disparity between answer accuracy and faithfulness of the CoT generated by LLMs, indicating that they often arrive at correct answers through incorrect reasoning.
- 2024.findings-acl.168
+ 2024.findings-acl.168
nguyen-etal-2024-direct
10.18653/v1/2024.findings-acl.168
@@ -8338,7 +8338,7 @@
Ai Ti Aw, I2R
2884-2896
Real-world news comments pose a significant challenge due to their noisy and ambiguous nature, which complicates their modeling for clustering and summarization tasks. Most previous research has predominantly focused on extractive summarization methods within specific constraints. This paper concentrates on Clustering and Abstractive Summarization of online news Comments (CASC). First, we introduce an enhanced fast clustering algorithm that maintains a dynamic similarity threshold to ensure the high density of each comment cluster being built. Moreover, we pioneer the exploration of tuning Large Language Models (LLMs) through a chain-of-thought strategy to generate summaries for each comment cluster. On the other hand, a notable challenge in CASC research is the scarcity of evaluation data. To address this problem, we design an annotation scheme and contribute a manual test suite tailored for CASC. Experimental results on the test suite demonstrate the effectiveness of our improvements to the baseline methods. In addition, the quantitative and qualitative analyses illustrate the adaptability of our approach to real-world news comment scenarios.
- 2024.findings-acl.169
+ 2024.findings-acl.169
zhang-etal-2024-comprehensive
10.18653/v1/2024.findings-acl.169
@@ -8353,7 +8353,7 @@
Zhumin Chen, Shandong University
2897-2917
Fine-tuning has been demonstrated to be an effective method to improve the domain performance of large language models (LLMs). However, LLMs might fit dataset biases and shortcuts for prediction, leading to poor generation performance. Previous works have shown that LLMs are prone to exhibit position bias, i.e., leveraging information positioned at the beginning or end of the input, or specific positional cues within it. Existing debiasing methods for LLMs require external bias knowledge or annotated non-biased samples, which are lacking for position debiasing and impractical to obtain in reality. In this work, we propose a self-supervised position debiasing (SOD) framework to mitigate position bias in LLMs. SOD leverages unsupervised responses from pre-trained LLMs for debiasing without relying on any external knowledge. To improve the quality of unsupervised responses, we propose an objective alignment (OAM) module to prune these responses. Experiments on eight datasets and five tasks show that SOD consistently outperforms existing methods in mitigating three types of position biases. Moreover, SOD achieves this while sacrificing only a small amount of performance on biased samples, making it both general and effective. To facilitate the reproducibility of the results, we share the code of all methods and datasets at https://github.com/LZKSKY/SOD.
- 2024.findings-acl.170
+ 2024.findings-acl.170
liu-etal-2024-self-supervised
10.18653/v1/2024.findings-acl.170
@@ -8366,7 +8366,7 @@
Dingqi Yang, University of Macau
2918-2929
Knowledge Graph (KG) embeddings are essential for link prediction over KGs. Compared to triplets, hyper-relational facts, consisting of a base triplet and an arbitrary number of key-value pairs, can better characterize real-world facts and have recently spurred various hyper-relational embedding techniques. Nevertheless, existing works seldom consider the ontology of KGs, which is beneficial to link prediction tasks. A few studies attempt to incorporate the ontology information, by either utilizing the ontology as constraints on entity representations or jointly learning from hyper-relational facts and the ontology. However, existing approaches mostly overlook the ontology hierarchy and suffer from the dominance of facts over ontology, resulting in suboptimal performance. Against this background, we propose a universal contrastive learning framework for hyper-relational KG embeddings (HyperCL), which is flexible enough to integrate different hyper-relational KG embedding methods and effectively boost their link prediction performance. HyperCL designs relation-aware Graph Attention Networks to capture the hierarchical ontology and a concept-aware contrastive loss to alleviate the dominance issue. We evaluate HyperCL on three real-world datasets in different link prediction tasks. Experimental results show that HyperCL consistently boosts the performance of state-of-the-art baselines, with an average improvement of 3.1-7.4% across the three datasets.
- 2024.findings-acl.171
+ 2024.findings-acl.171
lu-etal-2024-hypercl
10.18653/v1/2024.findings-acl.171
@@ -8379,7 +8379,7 @@
Minghua Xu, Huazhong University of Science and Technology
2930-2942
Multifaceted ideology detection (MID) aims to detect the ideological leanings of texts towards multiple facets. Previous studies on ideology detection mainly focus on one generic facet and ignore label semantics and explanatory descriptions of ideologies, which are a kind of instructive information and reveal the specific concepts of ideologies. In this paper, we develop a novel concept semantics-enhanced framework for the MID task. Specifically, we propose a bidirectional iterative concept flow (BICo) method to encode multifaceted ideologies. BICo enables the concepts to flow across levels of the schema tree and enriches concept representations with multi-granularity semantics. Furthermore, we explore concept attentive matching and concept-guided contrastive learning strategies to guide the model to capture ideology features with the learned concept semantics. Extensive experiments on the benchmark dataset show that our approach achieves state-of-the-art performance in MID, including in the cross-topic scenario.
- 2024.findings-acl.172
+ 2024.findings-acl.172
liu-etal-2024-encoding
10.18653/v1/2024.findings-acl.172
@@ -8389,7 +8389,7 @@
Zhenghua Li, Soochow University, China
2943-2956
Revealing the syntactic structure of sentences in Chinese poses significant challenges for word-level parsers due to the absence of clear word boundaries. To facilitate a transition from word-level to character-level Chinese dependency parsing, this paper proposes modeling latent internal structures within words. In this way, each word-level dependency tree is interpreted as a forest of character-level trees. A constrained Eisner algorithm is implemented to ensure the compatibility of character-level trees, guaranteeing a single root for intra-word structures and establishing inter-word dependencies between these roots. Experiments on Chinese treebanks demonstrate the superiority of our method over both the pipeline framework and previous joint models. A detailed analysis reveals that a coarse-to-fine parsing strategy empowers the model to predict more linguistically plausible intra-word structures.
- 2024.findings-acl.173
+ 2024.findings-acl.173
hou-li-2024-character
10.18653/v1/2024.findings-acl.173
@@ -8401,7 +8401,7 @@
Jingwei Cheng, Northeastern University, China
2957-2966
Zero-shot Relation Extraction (ZSRE) aims to predict unseen relations between entity pairs from input sentences. Existing prototype-based ZSRE methods encode relation descriptions into prototype embeddings and predict by measuring the similarity between sentence embeddings and prototype embeddings. However, these methods often overlook abundant side information of relations and suffer from a significant encoding gap between prototypes and sentences, limiting performance. To this end, we propose a framework named AlignRE, based on two Alignment methods for ZSRE. Specifically, we present a novel perspective centered on encoding schema alignment to enhance prototype-based ZSRE methods. We utilize well-designed prompt-tuning to bridge the encoding gap. To improve prototype quality, we explore and leverage multiple sources of side information and propose a prototype aggregation method based on semantic alignment to create comprehensive relation prototype representations. We conduct experiments on the FewRel and Wiki-ZSL datasets and consistently outperform state-of-the-art methods. Moreover, our method runs substantially faster and reduces the need for extensive manual labor in prototype construction. Code is available at https://github.com/lizehan1999/AlignRE.
- 2024.findings-acl.174
+ 2024.findings-acl.174
li-etal-2024-alignre
10.18653/v1/2024.findings-acl.174
@@ -8415,7 +8415,7 @@
Rui Yan, Renmin University of China
2967-2985
Supervised fine-tuning (SFT) on instruction-following corpora is a crucial approach toward the alignment of large language models (LLMs). However, the performance of LLMs on standard knowledge and reasoning benchmarks tends to deteriorate at the latter stage of the SFT process, echoing the phenomenon of alignment tax. Through a pilot study, we hypothesize that data bias is probably one cause of the phenomenon. To address the issue, we introduce a simple disperse-then-merge framework. Concretely, we disperse the instruction-following data into portions and train multiple sub-models using different data portions. Lastly, we merge the multiple models into a single one via model merging techniques. Despite its simplicity, our framework outperforms various sophisticated methods such as data curation and training regularization on a series of standard knowledge and reasoning benchmarks.
- 2024.findings-acl.175
+ 2024.findings-acl.175
fu-etal-2024-disperse
10.18653/v1/2024.findings-acl.175
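The framework above reduces to a short recipe. Below is a minimal PyTorch sketch assuming a user-supplied `finetune_fn` SFT loop (a hypothetical helper); uniform parameter averaging stands in for the merging step, though the paper's merging technique may be a weighted variant:

```python
import copy
import torch

def disperse_then_merge(base_model, sft_data, k, finetune_fn):
    """Disperse instruction data into k portions, fine-tune one sub-model per
    portion, then merge the sub-models by uniform parameter averaging."""
    portions = [sft_data[i::k] for i in range(k)]      # disperse
    state_dicts = []
    for portion in portions:
        sub_model = copy.deepcopy(base_model)
        finetune_fn(sub_model, portion)                # user-supplied SFT loop
        state_dicts.append(sub_model.state_dict())
    merged = {}                                        # merge
    for name, ref in state_dicts[0].items():
        if ref.is_floating_point():
            merged[name] = torch.stack([sd[name] for sd in state_dicts]).mean(dim=0)
        else:
            merged[name] = ref                         # leave integer buffers as-is
    base_model.load_state_dict(merged)
    return base_model
```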
@@ -8431,7 +8431,7 @@
Zhiqiang Zhang, Ant Group
2986-2999
To tackle the problem of domain-specific knowledge scarcity within large language models (LLMs), knowledge-graph retrieval-augmented methods have proven to be effective and efficient techniques for knowledge infusion. However, existing approaches face two primary challenges: knowledge mismatch between publicly available knowledge graphs and the specific domain of the task at hand, and poor information compliance of LLMs with knowledge graphs. In this paper, we leverage a small set of labeled samples and a large-scale corpus to efficiently construct domain-specific knowledge graphs with an LLM, addressing the issue of knowledge mismatch. Additionally, we propose a three-stage KG-LLM alignment strategy to enhance the LLM’s capability to utilize information from knowledge graphs. We conduct experiments in a limited-sample setting on two biomedical question-answering datasets, and the results demonstrate that our approach outperforms existing baselines.
- 2024.findings-acl.176
+ 2024.findings-acl.176
jiang-etal-2024-efficient
10.18653/v1/2024.findings-acl.176
@@ -8442,7 +8442,7 @@
Heuiseok Lim, Korea University
3000-3012
The advent of large language models has brought remarkable improvements to machine translation. However, machine translation is still vulnerable to critical meaning deviations, which may incur catastrophic issues in social or ethical contexts. In particular, existing critical error detection primarily focuses on identifying sentence-level errors, leaving the precise localization of such errors within the sentence unaddressed. In this paper, we introduce a new task, word-level critical error detection (WCED), to detect critical errors at a fine-grained level in machine translation sentences. The task aims to identify the parts of a machine translation that contain catastrophic meaning distortions. We hypothesize that the ability to determine errors at the sentence level will positively influence the detection of more granular errors. We propose a sentence-level error detection module to predict which words in a sentence have critical errors. Experimental results demonstrate that our method outperforms existing methodologies and LLMs in En-De, Zh-En, En-Ru, and En-Ko. Our method is helpful for determining the fine-grained location of errors. We hope that such studies will improve the capacity to address critical errors adeptly.
- 2024.findings-acl.177
+ 2024.findings-acl.177
jung-etal-2024-towards
10.18653/v1/2024.findings-acl.177
@@ -8458,7 +8458,7 @@
Bohan Zhuang, Zhejiang University
3013-3026
Large Language Models (LLMs), such as LLaMA and T5, have shown exceptional performance across various tasks through fine-tuning. Although low-rank adaptation (LoRA) has emerged to cheaply fine-tune these LLMs on downstream tasks, their deployment is still hindered by the vast model scale and computational costs. Post-training model pruning offers a way to compress LLMs. However, the current pruning methods designed for LLMs are not compatible with LoRA. This is due to their utilization of unstructured pruning on LLMs, impeding the merging of LoRA weights, or their dependence on the gradients of pre-trained weights to guide pruning, which can impose significant memory overhead. To this end, we propose LoRAPrune, a new framework that delivers an accurate structured pruned model in a highly memory-efficient manner. Specifically, we first design a LoRA-guided pruning criterion, which uses the weights and gradients of LoRA, rather than the gradients of pre-trained weights, for importance estimation. We subsequently integrate this criterion into an iterative pruning process, effectively removing redundant channels and heads. Extensive experimental results demonstrate the superior performance of our LoRAPrune over existing approaches on the LLaMA series models. At a 50% compression rate, LoRAPrune outperforms LLM-Pruner, achieving a reduction in perplexity of 4.81 on WikiText2 and 3.46 on PTB, while also decreasing memory usage by 52.6%. Besides, LoRAPrune also matches semi-structural pruning across multiple LLMs, proving its wide applicability. The code is available at https://github.com/aim-uofa/LoRAPrune.
- 2024.findings-acl.178
+ 2024.findings-acl.178
zhang-etal-2024-loraprune
10.18653/v1/2024.findings-acl.178
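One plausible reading of the LoRA-guided criterion is a first-order Taylor saliency computed entirely from LoRA tensors, so the gradient of the frozen weight is never materialized. The low-rank gradient surrogate `grad_B @ A + B @ grad_A` below is an assumption made for illustration, not the paper's exact formula:

```python
import torch

def lora_channel_importance(W, A, B, grad_A, grad_B, scale=1.0):
    """Per-output-channel importance estimated from LoRA tensors only.

    Shapes: W (out, in) frozen pretrained weight; A (r, in); B (out, r).
    Since the adapted weight is W + scale * B @ A, dL/dW is approximated here
    (an assumption) by grad_B @ A + B @ grad_A, avoiding any full-size
    gradient of the frozen weight."""
    W_eff = W + scale * (B @ A)           # merged weight LoRA would produce
    G_approx = grad_B @ A + B @ grad_A    # low-rank gradient surrogate
    saliency = (W_eff * G_approx).abs()   # first-order Taylor importance
    return saliency.sum(dim=1)            # aggregate per output channel

# toy shapes: out=8, in=16, rank r=2
W, A, B = torch.randn(8, 16), torch.randn(2, 16), torch.randn(8, 2)
gA, gB = torch.randn(2, 16), torch.randn(8, 2)
scores = lora_channel_importance(W, A, B, gA, gB)
prune_idx = torch.topk(scores, k=2, largest=False).indices  # channels to remove
```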
@@ -8471,7 +8471,7 @@
Xunliang Cai
3027-3043
The recent advancements in large language models (LLMs) have been extraordinary, yet the escalating inference costs associated with them present challenges in real-world applications. To address these challenges, we propose a novel approach called Early-exiting Speculative Decoding (EESD) with lossless acceleration. Specifically, EESD utilizes a segment of the LLM to generate draft tokens, incorporating Early-exiting structures after the first N layers. To enhance the quality of draft tokens, a self-distillation method is integrated. This early-exiting design not only reduces deployment and training costs but also significantly accelerates the token generation speed. Moreover, we introduce a novel sampling mechanism that leverages Thompson Sampling to regulate the generation processes, automatically determining the quantity of draft tokens in each round. The original LLM is then employed to validate these draft tokens through a single forward pass, and thus guarantees that the final output text maintains a distribution consistent with vanilla auto-regressive decoding. The experimental results on both 13B and 70B models demonstrate that our approach decodes tokens at a markedly accelerated rate compared to prior methods, showing the effectiveness of our approach.
- 2024.findings-acl.179
+ 2024.findings-acl.179
liu-etal-2024-speculative-decoding
10.18653/v1/2024.findings-acl.179
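The Thompson Sampling component lends itself to a compact sketch: keep a Beta posterior over the draft-token acceptance rate and sample it each round to decide how many tokens the early-exit layers should draft. The class below is a hypothetical controller; the paper's exact posterior and stopping rule may differ:

```python
import random

class DraftLengthController:
    """Beta-Bernoulli Thompson Sampling over the draft-token acceptance rate."""

    def __init__(self, max_draft=16):
        self.alpha = 1.0   # prior pseudo-count of accepted draft tokens
        self.beta = 1.0    # prior pseudo-count of rejected draft tokens
        self.max_draft = max_draft

    def propose_length(self):
        p = random.betavariate(self.alpha, self.beta)  # sampled acceptance prob.
        k = 1
        # keep drafting while the sampled Bernoulli process predicts acceptance
        while k < self.max_draft and random.random() < p:
            k += 1
        return k

    def update(self, num_accepted, num_rejected):
        self.alpha += num_accepted
        self.beta += num_rejected

# each round: draft k tokens with the early-exit layers, verify them with one
# full forward pass, then feed the accept/reject counts back into the posterior
controller = DraftLengthController()
k = controller.propose_length()
# accepted, rejected = verify_draft(draft_tokens[:k])   # hypothetical verifier
controller.update(num_accepted=k - 1, num_rejected=1)   # toy feedback
```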
@@ -8485,7 +8485,7 @@
Ji Zhang, Alibaba Group
3044-3052
For the grammatical error correction (GEC) task, there usually exist multiple correction ways for an erroneous input sentence, leading to multiple references. Observing the high proportion of multi-reference instances in Chinese GEC training data, we target a systematic study on how to better utilize multi-reference training data. We propose two new approaches and a simple two-stage training strategy. We compare them against previously proposed approaches, on two Chinese training datasets, i.e., Lang-8 for second language learner texts and FCGEC-Train for native speaker texts, and three test datasets. The experiments and analyses demonstrate the effectiveness of our proposed approaches and reveal interesting insights. Our code is available at https://github.com/ymliucs/MrGEC.
- 2024.findings-acl.180
+ 2024.findings-acl.180
liu-etal-2024-towards-better
10.18653/v1/2024.findings-acl.180
@@ -8500,7 +8500,7 @@
Jie Tang, Tsinghua University
3053-3077
Open large language models (LLMs) with great performance in various tasks have significantly advanced the development of LLMs. However, they are far inferior to commercial models such as ChatGPT and GPT-4 when acting as agents to tackle complex tasks in the real world. These agent tasks employ LLMs as the central controller responsible for planning, memorization, and tool utilization, necessitating both fine-grained prompting methods and robust LLMs to achieve satisfactory performance. Though many prompting methods have been proposed to complete particular agent tasks, there is a lack of research focusing on improving the agent capabilities of LLMs themselves without compromising their general abilities. In this work, we present AgentTuning, a simple and general method to enhance the agent abilities of LLMs while maintaining their general LLM capabilities. We construct AgentInstruct, a lightweight instruction-tuning dataset containing high-quality interaction trajectories. We employ a hybrid instruction-tuning strategy by combining AgentInstruct with open-source instructions from general domains. AgentTuning is used to instruction-tune the Llama 2 series, resulting in AgentLM. Our evaluations show that AgentTuning enables LLMs’ agent capabilities without compromising general abilities. The AgentLM-70B is comparable to GPT-3.5-turbo on unseen agent tasks, demonstrating generalized agent capabilities. We open source the AgentInstruct and AgentLM-7B, 13B, and 70B models at https://anonymous.4open.science/r/AgentTuning, serving as open and powerful alternatives to commercial LLMs for agent tasks.
- 2024.findings-acl.181
+ 2024.findings-acl.181
zeng-etal-2024-agenttuning
10.18653/v1/2024.findings-acl.181
@@ -8511,7 +8511,7 @@
Guodong Zhou, Soochow University, China
3078-3087
Recently, the use of pre-trained generation models for extracting sentiment elements has resulted in significant advancements on aspect-based sentiment analysis benchmarks. However, these approaches often overlook the importance of explicitly modeling the structure among sentiment elements. To address this limitation, we present a study that integrates general pre-trained sequence-to-sequence language models with a structure-aware transition-based approach. To this end, we propose a transition system for opinion tree generation, designed to better exploit pre-trained language models for structured fine-tuning. Our proposed transition system ensures the structural integrity of the generated opinion tree. By leveraging pre-trained generation models and simplifying the transition set, we are able to maximize the accuracy of opinion tree generation. Extensive experiments show that our model significantly advances the state-of-the-art performance on several benchmark datasets. In addition, the empirical studies also indicate that the proposed opinion tree generation with the transition system is more effective in capturing the sentiment structure than other generation models.
- 2024.findings-acl.182
+ 2024.findings-acl.182
ma-etal-2024-transition
10.18653/v1/2024.findings-acl.182
@@ -8525,7 +8525,7 @@
Anh Tuan Luu, Nanyang Technological University
3088-3105
Dynamic topic models track the evolution of topics in sequential documents, which has derived various applications like trend analysis. However, existing models suffer from repetitive topic and unassociated topic issues, failing to reveal the evolution and hindering further applications. To address these issues, we break the tradition of simply chaining topics in existing work and propose a novel neural Chain-Free Dynamic Topic Model. We introduce a new evolution-tracking contrastive learning method that builds the similarity relations among dynamic topics. This not only tracks topic evolution but also maintains topic diversity, mitigating the repetitive topic issue. To avoid unassociated topics, we further present an unassociated word exclusion method that consistently excludes unassociated words from discovered topics. Extensive experiments demonstrate our model significantly outperforms state-of-the-art baselines, tracking topic evolution with high-quality topics, showing better performance on downstream tasks, and remaining robust to the hyperparameter controlling evolution intensity.
- 2024.findings-acl.183
+ 2024.findings-acl.183
wu-etal-2024-modeling
10.18653/v1/2024.findings-acl.183
@@ -8542,7 +8542,7 @@
Timothy Baldwin, Mohamed bin Zayed University of Artificial Intelligence and The University of Melbourne
3106-3119
Many studies have demonstrated that large language models (LLMs) can produce harmful responses, exposing users to unexpected risks. Previous studies have proposed comprehensive taxonomies of LLM risks, as well as corresponding prompts that can be used to examine LLM safety. However, the focus has been almost exclusively on English. We aim to broaden LLM safety research by introducing a dataset for the safety evaluation of Chinese LLMs, and extending it to better identify false negative and false positive examples in terms of risky prompt rejections. We further present a set of fine-grained safety assessment criteria for each risk type, facilitating both manual annotation and automatic evaluation in terms of LLM response harmfulness. Our experiments over five LLMs show that region-specific risks are the prevalent risk type. Warning: this paper contains example data that may be offensive, harmful, or biased. Our data is available at https://github.com/Libr-AI/do-not-answer.
- 2024.findings-acl.184
+ 2024.findings-acl.184
wang-etal-2024-chinese
10.18653/v1/2024.findings-acl.184
@@ -8554,7 +8554,7 @@
Hiroki Sakaji, Hokkaido University
3120-3131
Recently, Large Language Models (LLMs) have attracted significant attention for their exceptional performance across a broad range of tasks, particularly in text analysis. However, the finance sector presents a distinct challenge due to its dependence on time-series data for complex forecasting tasks. In this study, we introduce a novel framework called LLMFactor, which employs Sequential Knowledge-Guided Prompting (SKGP) to identify factors that influence stock movements using LLMs. Unlike previous methods that relied on keyphrases or sentiment analysis, this approach focuses on extracting factors more directly related to stock market dynamics, providing clear explanations for complex temporal changes. Our framework directs the LLMs to create background knowledge through a fill-in-the-blank strategy and then discerns potential factors affecting stock prices from related news. Guided by background knowledge and identified factors, we leverage historical stock prices in textual format to predict stock movement. An extensive evaluation of the LLMFactor framework across four benchmark datasets from both the U.S. and Chinese stock markets demonstrates its superiority over existing state-of-the-art methods and its effectiveness in financial time-series forecasting.
- 2024.findings-acl.185
+ 2024.findings-acl.185
wang-etal-2024-llmfactor
10.18653/v1/2024.findings-acl.185
@@ -8565,7 +8565,7 @@
Aston Zhang, Meta
3132-3149
Autonomous graphical user interface (GUI) agents aim to facilitate task automation by interacting with the user interface without manual intervention. Recent studies have investigated eliciting the capabilities of large language models (LLMs) for effective engagement in diverse environments. To align with the input-output requirement of LLMs, most existing approaches are developed under a sandbox setting where they rely on external tools and application-specific APIs to parse the environment into textual elements and interpret the predicted actions. Consequently, those approaches often grapple with inference inefficiency and error propagation risks. To mitigate the challenges, we introduce Auto-GUI, a multimodal solution that directly interacts with the interface, bypassing the need for environment parsing or reliance on application-dependent APIs. Moreover, we propose a chain-of-action technique—leveraging a series of intermediate previous action histories and future action plans—to help the agent decide what action to execute. We evaluate our approach on a new device-control benchmark AITW with 30K unique instructions, spanning multi-step tasks such as application operation, web searching, and web shopping. Experimental results show that Auto-GUI achieves state-of-the-art performance with an action type prediction accuracy of 90% and an overall action success rate of 74%. Code is publicly available at https://github.com/cooelf/Auto-GUI.
- 2024.findings-acl.186
+ 2024.findings-acl.186
zhang-zhang-2024-look
10.18653/v1/2024.findings-acl.186
@@ -8581,7 +8581,7 @@
Hong Chen, Renmin University of China
3150-3170
Structured pruning is a widely used technique for reducing the size of pre-trained language models (PLMs), but current methods often overlook the potential of compressing the hidden dimension d in PLMs, a dimension critical to model size and efficiency. This paper introduces a novel structured pruning approach, Structured Pruning with PCA Projection (SP3), targeting the effective reduction of d by projecting features into a space defined by principal components before masking. Extensive experiments on benchmarks (GLUE and SQuAD) show that SP3 can reduce d by 70%, compress 94% of the BERT-base model, maintain over 96% accuracy, and outperform other methods that compress d by 6% in accuracy at the same compression ratio. SP3 has also proven effective with other models, including OPT and Llama. Our data and code are available at https://github.com/hyx1999/SP3.
- 2024.findings-acl.187
+ 2024.findings-acl.187
hu-etal-2024-sp3
10.18653/v1/2024.findings-acl.187
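The core projection step can be sketched in a few lines of PyTorch: compute the principal components of calibration activations, then project features (and the weights of layers that read them) into the top-k component space. This shows only the projection under the stated assumptions; SP3's masking and training procedure are not reproduced here:

```python
import torch

def pca_projection_matrix(acts, keep):
    """acts: (n, d) hidden states collected on calibration data.
    Returns P of shape (d, keep) spanning the top `keep` principal components."""
    X = acts - acts.mean(dim=0, keepdim=True)
    U, S, Vh = torch.linalg.svd(X, full_matrices=False)  # X = U @ diag(S) @ Vh
    return Vh[:keep].T                                   # rows of Vh are the PCs

d, keep = 768, 230                       # e.g., reduce d by roughly 70%
acts = torch.randn(1024, d)              # toy calibration activations
P = pca_projection_matrix(acts, keep)

h = torch.randn(1, d)                    # a hidden state
z = h @ P                                # compressed feature, shape (1, keep)
W = torch.randn(3072, d)                 # a linear layer reading h: y = h @ W.T
W_c = W @ P                              # compressed layer: y is approx z @ W_c.T
```

The last line works because h is approximately (h @ P) @ P.T when the top components capture most of the variance, so the layer's input dimension shrinks from d to keep with a bounded approximation error.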
@@ -8594,7 +8594,7 @@
Hyunju Lee, Gwangju Institute of Science and Technology
3171-3185
With the proliferation of digital communication, dialogue summarization has become increasingly important. However, it still faces a shortage of data. To address this issue, we developed a Generative Data Augmentation Strategy Leveraging External Data for Abstractive Dialogue Summarization (GENDEX), based on the hypothesis that texts containing people and their interpersonal interactions can potentially serve as summaries of corresponding dialogues. We filter short texts containing people and resolve coreferences for better contextual analysis. We then identify the semantic roles of words within the texts and filter them based on the patterns observed in dialogue summarization datasets. Using these texts, we generate synthetic dialogues through a controlled generation method. To better leverage the augmented data, we utilize noise-tolerant training to fine-tune the summarization model. The experimental results demonstrate the effectiveness of our proposed method, showing its robust performance, generalizability, and scalability. Moreover, performance improvements from GENDEX were observed regardless of the complexity of the dialogues. The code is available at https://github.com/DMCB-GIST/GENDEX.
- 2024.findings-acl.188
+ 2024.findings-acl.188
park-etal-2024-gendex
10.18653/v1/2024.findings-acl.188
@@ -8606,7 +8606,7 @@
Ron Meir, Technion
3186-3194
Artificial agents that learn to communicate in order to accomplish a given task acquire communication protocols that are typically opaque to a human. A large body of work has attempted to evaluate the emergent communication via various evaluation measures, with compositionality featuring as a prominent desired trait. However, current evaluation procedures do not directly expose the compositionality of the emergent communication. We propose a procedure to assess the compositionality of emergent communication by finding the best match between emerged words and natural language concepts. The best-match algorithm provides both a global score and a translation map from emergent words to natural language concepts. To the best of our knowledge, this is the first time that such a direct and interpretable mapping between emergent words and human concepts has been provided.
- 2024.findings-acl.189
+ 2024.findings-acl.189
carmeli-etal-2024-concept
10.18653/v1/2024.findings-acl.189
@@ -8617,7 +8617,7 @@
Apoorv Saxena, Adobe Systems
3195-3211
Document revision is a crucial aspect of the writing process, particularly in collaborative environments where multiple authors contribute simultaneously. However, current tools lack an efficient way to provide a comprehensive overview of changes between versions, leading to difficulties in understanding revisions. To address this, we propose a novel task of providing a thematic summary of the changes between document versions, organizing individual edits based on shared themes. We assess the capabilities of LLMs on this task and further introduce three strategies to tackle it: (i) representing the input of two documents along with the edits in the ‘diff’ format, (ii) a two-stage task decomposition with individual edit description generation as an intermediate task, and (iii) clustering-based chunking and subsequent merging techniques for handling longer documents. Our experiments demonstrate the effectiveness of our approach in improving the model’s capacity to handle this complex task. Additionally, we introduce ChangeSumm, a curated dataset comprising human-written thematic summaries for pairs of document versions, to facilitate evaluation and further research in this direction.
- 2024.findings-acl.190
+ 2024.findings-acl.190
t-y-s-s-etal-2024-tale
10.18653/v1/2024.findings-acl.190
@@ -8633,7 +8633,7 @@
Ying Sha
3212-3228
Aspect Sentiment Triplet Extraction (ASTE) aims to extract triplets of aspect terms, their associated sentiments, and opinion terms. Previous works based on different modeling paradigms have achieved promising results. However, these methods struggle to comprehensively explore the various specific relations between sentiment elements in multi-view linguistic features, which serve as prior indications that facilitate sentiment triplet extraction and must be aligned and aggregated to capture complementary higher-order interactions. In this paper, we propose Multi-view Linguistic Features Enhancement (MvLFE) to explore this prior indication effect in a “Refine, Align, and Aggregate” learning process. Specifically, we first introduce a relational graph attention network to encode the word-pair relations represented by each linguistic feature and refine them to pay more attention to aspect-opinion pairs. Next, we employ multi-view contrastive learning to align them at a fine-grained level in the contextual semantic space to maintain semantic consistency. Finally, we utilize multi-semantic cross attention to capture and aggregate the complementary higher-order interactions between diverse linguistic features to enhance the aspect-opinion relations. Experimental results on several benchmark datasets show the effectiveness and robustness of our model, which achieves state-of-the-art performance.
- 2024.findings-acl.191
+ 2024.findings-acl.191
su-etal-2024-refine
10.18653/v1/2024.findings-acl.191
@@ -8643,7 +8643,7 @@
Yue Zhang, Westlake University
3229-3236
Gender bias has been widely observed in NLP models, which has the potential to perpetuate harmful stereotypes and discrimination. In this paper, we construct a dataset GenderStance of 36k samples to measure gender bias in stance detection, determining whether models consistently predict the same stance for a particular gender group. We find that all models are gender-biased and prone to classify sentences that contain male nouns as Against and those with female nouns as Favor. Moreover, extensive experiments indicate that sources of gender bias stem from the fine-tuning data and the foundation model itself. We will publicly release our code and dataset.
- 2024.findings-acl.192
+ 2024.findings-acl.192
li-zhang-2024-pro
10.18653/v1/2024.findings-acl.192
@@ -8656,7 +8656,7 @@
Naoaki Okazaki, Tokyo Institute of Technology
3237-3245
Large Language Models (LLMs) are widely used to evaluate natural language generation tasks as automated metrics. However, the likelihood, a measure of an LLM’s plausibility for a sentence, can vary due to superficial differences in sentences, such as word order and sentence structure. It is therefore possible that there might be a likelihood bias if LLMs are used for evaluation: they might overrate sentences with higher likelihoods while underrating those with lower likelihoods. In this paper, we investigate the presence and impact of likelihood bias in LLM-based evaluators. We also propose a method to mitigate the likelihood bias. Our method utilizes highly biased instances as few-shot examples for in-context learning. Our experiments in evaluating the data-to-text and grammatical error correction tasks reveal that several LLMs we test display a likelihood bias. Furthermore, our proposed method successfully mitigates this bias, also improving evaluation performance (in terms of correlation of models with human scores) significantly.
- 2024.findings-acl.193
+ 2024.findings-acl.193
ohi-etal-2024-likelihood
10.18653/v1/2024.findings-acl.193
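The quantity at issue, the likelihood an evaluator LM assigns to a sentence, can be computed as follows with Hugging Face Transformers; gpt2 is only a stand-in for the evaluator model, and the mitigation itself (few-shot prompting with highly biased instances) is not shown:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # stand-in evaluator LM
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.eval()

def sentence_log_likelihood(text: str) -> float:
    """Total log-probability the LM assigns to `text`, the per-sentence
    likelihood whose sensitivity to word order drives the bias."""
    ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        out = model(ids, labels=ids)          # HF shifts labels internally
    num_predicted = ids.shape[1] - 1
    return -out.loss.item() * num_predicted   # loss is the mean token NLL

# two paraphrases: an evaluator leaning on likelihood may rank them apart
print(sentence_log_likelihood("The cat sat on the mat."))
print(sentence_log_likelihood("On the mat, the cat sat."))
```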
@@ -8672,7 +8672,7 @@
Hai Zhao, Shanghai Jiao Tong University
3246-3257
Benchmarks play a pivotal role in assessing the advancements of large language models (LLMs). While numerous benchmarks have been proposed to evaluate LLMs’ capabilities, there is a notable absence of a dedicated benchmark for assessing their musical abilities. To address this gap, we present ZIQI-Eval, a comprehensive and large-scale music benchmark specifically designed to evaluate the music-related capabilities of LLMs. ZIQI-Eval encompasses a wide range of questions, covering 10 major categories and 56 subcategories, resulting in over 14,000 meticulously curated data entries. Leveraging ZIQI-Eval, we conduct a comprehensive evaluation of 16 LLMs, analyzing their performance in the domain of music. Results indicate that all LLMs perform poorly on the ZIQI-Eval benchmark, suggesting significant room for improvement in their musical capabilities. With ZIQI-Eval, we aim to provide a standardized and robust evaluation framework that facilitates a comprehensive assessment of LLMs’ music-related abilities. The dataset is available at GitHub and HuggingFace.
- 2024.findings-acl.194
+ 2024.findings-acl.194
li-etal-2024-music
10.18653/v1/2024.findings-acl.194
@@ -8687,7 +8687,7 @@
Hai Zhao, Shanghai Jiao Tong University
3258-3270
Large Language Models (LLMs) have shown remarkable comprehension abilities but face challenges in GPU memory usage during inference, hindering their scalability for real-time applications like chatbots. To accelerate inference, we store computed keys and values (KV cache) in GPU memory. Existing methods study KV cache compression to reduce memory by pruning the pre-computed KV cache. However, they neglect the dependency between layers and the huge memory consumption of pre-computation. To explore these deficiencies, we find that the number of crucial keys and values that influence future generations decreases layer by layer, and that we can extract them via the consistency in attention weights. Based on these findings, we propose PyramidInfer, a method that compresses the KV cache by retaining crucial context layer by layer. PyramidInfer saves significant memory by computing fewer keys and values without sacrificing performance. Experimental results show PyramidInfer improves throughput 2.2x over Accelerate with over 54% GPU memory reduction in the KV cache.
- 2024.findings-acl.195
+ 2024.findings-acl.195
yang-etal-2024-pyramidinfer
10.18653/v1/2024.findings-acl.195
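A simplified, single-head sketch of the layer-wise retention idea: score each cached position by the attention it has received, keep a budget of top-scoring positions, and shrink that budget with depth. The 0.8 decay schedule is an illustrative assumption, not the paper's actual schedule:

```python
import torch

def retain_crucial_kv(keys, values, attn_weights, keep):
    """Keep only the `keep` context positions with the largest accumulated
    attention weight; everything else is dropped from the KV cache.

    keys/values: (seq_len, head_dim); attn_weights: (num_queries, seq_len)."""
    scores = attn_weights.sum(dim=0)                  # total attention received
    keep = min(keep, scores.numel())
    idx = torch.topk(scores, k=keep).indices.sort().values  # keep original order
    return keys[idx], values[idx]

seq_len, head_dim, num_layers = 512, 64, 24
keys, values = torch.randn(seq_len, head_dim), torch.randn(seq_len, head_dim)
attn = torch.rand(4, seq_len)                         # recent queries' attention

# a pyramid schedule: deeper layers retain fewer context positions
for layer in range(num_layers):
    budget = int(seq_len * (1.0 - 0.8 * layer / num_layers))
    k_l, v_l = retain_crucial_kv(keys, values, attn, budget)
```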
@@ -8702,7 +8702,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
3271-3290
Drama is a form of storytelling inspired by human creativity, proceeding with a predefined storyline, carrying emotions and thoughts. This paper introduces LLM-based interactive drama, which endows traditional drama with an unprecedented immersion, where a person is allowed to walk into it and interact with the characters and scenes. We define this new artistic genre by 6 essential elements—plot, character, thought, diction, spectacle and interaction—and study the entire pipeline to forge a backbone drama LLM to drive the playing process, which is challenged by limited drama resources, uncontrollable narrative development, and complicated instruction following. We propose Narrative Chain to offer finer control over the narrative progression during interaction with players; Auto-Drama to synthesize drama scripts given arbitrary stories; and Sparse Instruction Tuning to allow the model to follow sophisticated instructions. We manually craft 3 scripts, Detective Conan, Harry Potter, and Romeo and Juliet, and design a 5-dimension principle to evaluate the drama LLM comprehensively.
- 2024.findings-acl.196
+ 2024.findings-acl.196
wu-etal-2024-role
10.18653/v1/2024.findings-acl.196
@@ -8718,7 +8718,7 @@
Gunhee Kim, Seoul National University
3291-3325
While Large Language Models (LLMs) can serve as agents to simulate human behaviors (i.e., role-playing agents), we emphasize the importance of point-in-time role-playing. This situates characters at specific moments in the narrative progression for three main reasons: (i) enhancing users’ narrative immersion, (ii) avoiding spoilers, and (iii) fostering engagement in fandom role-playing. To accurately represent characters at specific time points, agents must avoid character hallucination, where they display knowledge that contradicts their characters’ identities and historical timelines. We introduce TimeChara, a new benchmark designed to evaluate point-in-time character hallucination in role-playing LLMs. Comprising 10,895 instances generated through an automated pipeline, this benchmark reveals significant hallucination issues in current state-of-the-art LLMs (e.g., GPT-4o). To counter this challenge, we propose Narrative-Experts, a method that decomposes the reasoning steps and utilizes narrative experts to reduce point-in-time character hallucinations effectively. Still, our findings with TimeChara highlight the ongoing challenges of point-in-time character hallucination, calling for further study.
- 2024.findings-acl.197
+ 2024.findings-acl.197
ahn-etal-2024-timechara
10.18653/v1/2024.findings-acl.197
@@ -8733,7 +8733,7 @@
Qi Liu, University of Hong Kong
3326-3342
VLMs (Vision-Language Models) extend the capabilities of LLMs (Large Language Models) to accept multimodal inputs. Since it has been verified that LLMs can be induced to generate harmful or inaccurate content through specific test cases (termed Red Teaming), how VLMs perform in similar scenarios, especially with their combination of textual and visual inputs, remains a question. To explore this problem, we present a novel red teaming dataset, RTVLM, which encompasses 12 subtasks (e.g., image misleading, multi-modal jailbreaking, face fairness, etc.) under 4 primary aspects (faithfulness, privacy, safety, fairness). Our RTVLM is the first red teaming dataset to benchmark current VLMs in terms of these 4 different aspects. Detailed analysis shows that 10 prominent open-sourced VLMs struggle with the red teaming to different degrees and show up to a 31% performance gap with GPT-4V. Additionally, we simply apply red teaming alignment to LLaVA-v1.5 with Supervised Fine-tuning (SFT) using RTVLM, and this bolsters the model’s performance by 10% on the RTVLM test set and 13% on MM-hallu, with no noticeable decline on MM-Bench, surpassing other LLaVA-based models of similar size trained with regular alignment data. This reveals that current open-sourced VLMs still lack red teaming alignment. Our code and datasets will be open-sourced.
- 2024.findings-acl.198
+ 2024.findings-acl.198
li-etal-2024-red
10.18653/v1/2024.findings-acl.198
@@ -8747,7 +8747,7 @@
Wei Peng, Huawei Technologies Ltd.
3343-3353
A Large Language Model (LLM) tends to generate inconsistent and sometimes contradictory outputs when presented with a prompt that has equivalent semantics but is expressed differently from the original prompt. To achieve semantic consistency of an LLM, one of the key approaches is to finetune the model with prompt-output pairs with semantically equivalent meanings. Despite its effectiveness, a data-driven finetuning method incurs substantial computation costs in data preparation and model optimization. In this regime, an LLM is treated as a “black box”, restricting our ability to gain deeper insights into its internal mechanism. In this paper, we are therefore motivated to enhance the semantic consistency of LLMs through a more interpretable method, namely model editing. We first identify the model components (i.e., attention heads) that have a key impact on the semantic consistency of an LLM. We subsequently inject biases into the output of these model components along the semantic-consistency activation direction. It is noteworthy that these modifications are cost-effective, without reliance on mass manipulations of the original model parameters. Through comprehensive experiments on the constructed NLU and open-source NLG datasets, our method demonstrates significant improvements in the semantic consistency and task performance of LLMs. Additionally, our method exhibits promising generalization capabilities by performing well on tasks beyond the primary tasks.
- 2024.findings-acl.199
+ 2024.findings-acl.199
yang-etal-2024-enhancing
10.18653/v1/2024.findings-acl.199
@@ -8761,7 +8761,7 @@
Honguk Woo
3354-3376
In embodied instruction-following (EIF), the integration of pretrained language models (LMs) as task planners emerges as a significant branch, where tasks are planned at the skill level by prompting LMs with pretrained skills and user instructions. However, grounding these pretrained skills in different domains remains challenging due to their intricate entanglement with domain-specific knowledge. To address this challenge, we present a semantic skill grounding (SemGro) framework that leverages the hierarchical nature of semantic skills. SemGro recognizes the broad spectrum of these skills, ranging from short-horizon, low-semantic skills that are universally applicable across domains to long-horizon, rich-semantic skills that are highly specialized and tailored for particular domains. The framework employs an iterative skill decomposition approach, starting from the higher levels of the semantic skill hierarchy and then moving downwards, so as to ground each planned skill to an executable level within the target domain. To do so, we use the reasoning capabilities of LMs for composing and decomposing semantic skills, as well as their multi-modal extension for assessing skill feasibility in the target domain. Our experiments in the VirtualHome benchmark show the efficacy of SemGro in 300 cross-domain EIF scenarios.
- 2024.findings-acl.200
+ 2024.findings-acl.200
shin-etal-2024-semantic
10.18653/v1/2024.findings-acl.200
@@ -8775,7 +8775,7 @@
Zhendong Mao, University of Science and Technology of China
3377-3394
Recently, tremendous strides have been made to align the generation of Large Language Models (LLMs) with human values to mitigate toxic or unhelpful content. Leveraging Reinforcement Learning from Human Feedback (RLHF) proves effective and is widely adopted by researchers. However, implementing RLHF is complex, and its sensitivity to hyperparameters renders achieving stable performance and scalability challenging. Furthermore, prevailing approaches to preference alignment primarily concentrate on pairwise comparisons, with limited exploration into multi-response scenarios, thereby overlooking the potential richness within the candidate pool. For the above reasons, we propose a new approach: Listwise Reward Enhancement for Preference Alignment (LIRE), a gradient-based reward optimization approach that incorporates the offline rewards of multiple responses into a streamlined listwise framework, thus eliminating the need for online sampling during training. LIRE is straightforward to implement, requiring minimal parameter tuning, and seamlessly aligns with the pairwise paradigm while naturally extending to multi-response scenarios. Moreover, we introduce a self-enhancement algorithm aimed at iteratively refining the reward during training. Our experiments demonstrate that LIRE consistently outperforms existing methods across several benchmarks on dialogue and summarization tasks, with good transferability to out-of-distribution data, assessed using proxy reward models and human annotators.
- 2024.findings-acl.201
+ 2024.findings-acl.201
zhu-etal-2024-lire
10.18653/v1/2024.findings-acl.201
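
The listwise objective sketched in the LIRE abstract above is easy to picture concretely. Below is a hedged PyTorch sketch of what a LIRE-style loss could look like: candidate responses scored offline are weighted by the policy's own softmax over candidates, so no online sampling is needed. The shapes, temperature, and normalization are illustrative assumptions, not the paper's exact formulation.

    import torch
    import torch.nn.functional as F

    def listwise_reward_loss(logprobs, rewards, tau=1.0):
        # logprobs: (batch, k) summed token log-probs of k candidate
        #           responses under the current policy model
        # rewards:  (batch, k) offline rewards for the same candidates
        # Softmax over candidates gives the policy's listwise distribution;
        # maximizing the expected offline reward pushes probability mass
        # toward high-reward responses without any online sampling.
        policy = F.softmax(logprobs / tau, dim=-1)
        expected_reward = (policy * rewards).sum(dim=-1)
        return -expected_reward.mean()
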
@@ -8790,7 +8790,7 @@
Gunhee Kim, Seoul National University
3395-3405
3D dense captioning is a task to localize objects in a 3D scene and generate descriptive sentences for each object. Recent approaches in 3D dense captioning have adopted transformer encoder-decoder frameworks from object detection to build an end-to-end pipeline without hand-crafted components. However, these approaches struggle with contradicting objectives, where a single query attention has to simultaneously view both tightly localized object regions and the contextual environment. To overcome this challenge, we introduce SIA (See-It-All), a transformer pipeline that engages in 3D dense captioning with a novel paradigm called late aggregation. SIA simultaneously decodes two sets of queries: a context query and an instance query. The instance query focuses on localization and object attribute descriptions, while the context query versatilely captures relationships of interest between multiple objects or with the global scene; the two are then aggregated afterwards (i.e., late aggregation) via simple distance-based measures. To further enhance the quality of contextualized caption generation, we design a novel aggregator to generate a fully informed caption based on the surrounding context, the global environment, and object instances. Extensive experiments on two of the most widely used 3D dense captioning datasets demonstrate that our proposed method achieves a significant improvement over prior methods.
- 2024.findings-acl.202
+ 2024.findings-acl.202
kim-etal-2024-see
10.18653/v1/2024.findings-acl.202
@@ -8802,7 +8802,7 @@
Iryna Gurevych, Mohamed bin Zayed University of Artificial Intelligence and Technical University of Darmstadt
3406-3432
Answering Questions over Knowledge Graphs (KGQA) is key to well-functioning autonomous language agents in various real-life applications. To improve the neural-symbolic reasoning capabilities of language agents powered by Large Language Models (LLMs) in KGQA, we propose the Decomposition-Alignment-Reasoning Agent (DARA) framework. DARA effectively parses questions into formal queries through a dual mechanism: high-level iterative task decomposition and low-level task grounding. Importantly, DARA can be efficiently trained with a small number of high-quality reasoning trajectories. Our experimental results demonstrate that DARA fine-tuned on LLMs (e.g. Llama-2-7B, Mistral) outperforms both in-context learning-based agents with GPT-4 and alternative fine-tuned agents, across different benchmarks, making such models more accessible for real-life applications. We also show that DARA attains performance comparable to state-of-the-art enumerating-and-ranking-based methods for KGQA.
- 2024.findings-acl.203
+ 2024.findings-acl.203
fang-etal-2024-dara
10.18653/v1/2024.findings-acl.203
@@ -8814,7 +8814,7 @@
Hai Zhao, Shanghai Jiao Tong University
3433-3446
The burgeoning size of Large Language Models (LLMs) has led to enhanced capabilities in generating responses, albeit at the expense of increased inference times and elevated resource demands. Existing methods of acceleration, predominantly hinged on knowledge distillation, generally necessitate fine-tuning of considerably large models, such as Llama-7B, posing a challenge for average users. Furthermore, present techniques for expediting inference and reducing costs operate independently. To address these issues, we introduce a novel and intuitive Guidance-based Knowledge Transfer (GKT) framework. This approach leverages a larger LLM as a “teacher” to create guidance prompts, paired with a smaller “student” model to finalize responses. Remarkably, GKT requires no fine-tuning and doesn’t necessitate the teacher and student models to have the same vocabulary, allowing for extensive batch generation to accelerate the process while ensuring user customization. GKT can be seamlessly integrated into cloud-edge collaboration architectures, and is versatile enough for plug-and-play application across various models. It excels in both efficiency and affordability, epitomizing a “cheap and cheerful” solution. GKT achieves a maximum accuracy improvement of 14.18%, along with a 10.72-times speed-up on GSM8K, and an accuracy improvement of 14.00% along with a 7.73-times speed-up on CSQA. When utilizing ChatGPT as the teacher model and Llama2-70B as the student model, we can achieve 95.00% of ChatGPT’s performance at 52% of the cost. The results highlight substantial enhancements in accuracy and processing speed on the GSM8K and CSQA datasets, surpassing the performance of using either the student or teacher models in isolation.
- 2024.findings-acl.204
+ 2024.findings-acl.204
yao-etal-2024-gkt
10.18653/v1/2024.findings-acl.204
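
The teacher-writes-a-prefix, student-finishes division of labour that GKT describes can be sketched in a few lines. This is a rough illustration, not the paper's implementation; the generate helpers and the max_new_tokens parameter are assumed wrappers around whatever inference API is in use, and the hand-off works across vocabularies because it happens in plain text.

    def guided_generation(question, teacher_generate, student_generate,
                          guidance_tokens=48):
        # the larger "teacher" model drafts a short guidance prefix ...
        guidance = teacher_generate(
            f"Q: {question}\nBegin a helpful step-by-step answer:",
            max_new_tokens=guidance_tokens,
        )
        # ... and the smaller "student" model completes the response.
        # No fine-tuning of either model is required.
        return student_generate(f"Q: {question}\n{guidance}")
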
@@ -8829,7 +8829,7 @@
Lilja Øvrelid, Dept. of Informatics, University of Oslo
3447-3460
Grounded language models use external sources of information, such as knowledge graphs, to meet some of the general challenges associated with pre-training. By extending previous work on compositional generalization in semantic parsing, we allow for a controlled evaluation of the degree to which these models learn and generalize from patterns in knowledge graphs. We develop a procedure for generating natural language questions paired with knowledge graphs that targets different aspects of compositionality and further avoids grounding the language models in information already encoded implicitly in their weights. We evaluate existing methods for combining language models with knowledge graphs and find them to struggle with generalization to sequences of unseen lengths and to novel combinations of seen base components. While our experimental results provide some insight into the expressive power of these models, we hope our work and released datasets motivate future research on how to better combine language models with structured knowledge representations.
- 2024.findings-acl.205
+ 2024.findings-acl.205
wold-etal-2024-compositional
10.18653/v1/2024.findings-acl.205
@@ -8844,7 +8844,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
3461-3475
Large Language Models (LLMs) have demonstrated impressive capabilities for generalizing in unseen tasks. In the Named Entity Recognition (NER) task, recent advancements have seen the remarkable improvement of LLMs in a broad range of entity domains via instruction tuning, by adopting an entity-centric schema. In this work, we explore the potential enhancement of the existing methods by incorporating negative instances into training. Our experiments reveal that negative instances contribute to remarkable improvements by (1) introducing contextual information, and (2) clearly delineating label boundaries. Furthermore, we introduce an efficient longest common subsequence (LCS) matching algorithm, which is tailored to transform unstructured predictions into structured entities. By integrating these components, we present GNER, a Generative NER system that shows improved zero-shot performance across unseen entity domains. Our comprehensive evaluation illustrates our system’s superiority, surpassing state-of-the-art (SoTA) methods by 9 F1 points in zero-shot evaluation.
- 2024.findings-acl.206
+ 2024.findings-acl.206
ding-etal-2024-rethinking
10.18653/v1/2024.findings-acl.206
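
The LCS matching step mentioned above is the most mechanical part of the pipeline. As a rough illustration (the vanilla dynamic program, not the authors' optimized variant), aligning the generated text back onto the input tokens lets predicted entity labels be attached to real input positions:

    def lcs_align(pred_tokens, input_tokens):
        # standard longest-common-subsequence DP table
        m, n = len(pred_tokens), len(input_tokens)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(m):
            for j in range(n):
                dp[i + 1][j + 1] = (dp[i][j] + 1
                                    if pred_tokens[i] == input_tokens[j]
                                    else max(dp[i][j + 1], dp[i + 1][j]))
        # backtrack to recover (prediction index, input index) pairs
        pairs, i, j = [], m, n
        while i > 0 and j > 0:
            if pred_tokens[i - 1] == input_tokens[j - 1]:
                pairs.append((i - 1, j - 1))
                i, j = i - 1, j - 1
            elif dp[i - 1][j] >= dp[i][j - 1]:
                i -= 1
            else:
                j -= 1
        return list(reversed(pairs))
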
@@ -8857,7 +8857,7 @@
Jun Zhao, Institute of Automation, Chinese Academy of Sciences
3476-3503
Knowledge editing aims to rectify inaccuracies in large language models (LLMs) without costly retraining for outdated or erroneous knowledge. However, current knowledge editing methods primarily focus on single editing, failing to meet the requirements for lifelong editing. This study reveals a performance degradation encountered by knowledge editing in lifelong editing, characterized by toxicity buildup and toxicity flash, with the primary cause identified as pattern mismatch. We introduce a knowledge editing approach named Wise-Layer Knowledge Editor (WilKE), which selects the editing layer based on the degree to which the editing knowledge matches patterns across different layers in language models. Experimental results demonstrate that, in lifelong editing, WilKE exhibits average improvements of 46.2% and 67.8% when editing GPT2-XL and GPT-J, respectively, relative to state-of-the-art knowledge editing methods.
- 2024.findings-acl.207
+ 2024.findings-acl.207
hu-etal-2024-wilke
10.18653/v1/2024.findings-acl.207
@@ -8869,7 +8869,7 @@
Guoqiang Xu
3504-3518
Though notable progress has been made, neural-based aspect-based sentiment analysis (ABSA) models are prone to learning spurious correlations from annotation biases, resulting in poor robustness under adversarial data transformations. Among the debiasing solutions, causal inference-based methods have attracted much research attention, and can be mainly categorized into causal intervention methods and counterfactual reasoning methods. However, most of the present debiasing methods focus on single-variable causal inference, which is not suitable for ABSA with two input variables (the target aspect and the review). In this paper, we propose a novel framework based on multi-variable causal inference for debiasing ABSA. In this framework, different types of biases are tackled based on different causal intervention methods. For the review branch, the bias is modeled as indirect confounding from context, where backdoor adjustment intervention is employed for debiasing. For the aspect branch, the bias is described as a direct correlation with labels, where counterfactual reasoning is adopted for debiasing. Extensive experiments demonstrate the effectiveness of the proposed method compared to various baselines on the two widely used real-world aspect robustness test sets.
- 2024.findings-acl.208
+ 2024.findings-acl.208
wu-etal-2024-diner
10.18653/v1/2024.findings-acl.208
@@ -8881,7 +8881,7 @@
Guoqiang Xu
3519-3532
Though Large Language Models (LLMs) have demonstrated powerful few-shot learning capabilities through prompting methods, supervised training is still necessary for complex reasoning tasks. Because of their extensive parameters and memory consumption, both Parameter-Efficient Fine-Tuning (PEFT) methods and Memory-Efficient Fine-Tuning methods have been proposed for LLMs. Nevertheless, the issue of large annotated data consumption, which Data-Efficient Fine-Tuning aims to address, remains unexplored. One obvious approach is to combine a PEFT method with active learning. However, experimental results show that such a combination is not trivial and yields inferior results. Probe experiments suggest that this observation might be explained by two main reasons: an uncertainty gap and poor model calibration. Therefore, in this paper, we propose a novel approach to effectively integrate uncertainty-based active learning and LoRA. Specifically, for the uncertainty gap, we introduce a dynamic uncertainty measurement that combines the uncertainty of the base model and the uncertainty of the full model during the iteration of active learning. For poor model calibration, we incorporate a regularization method during LoRA training to keep the model from being over-confident, and employ a Monte-Carlo dropout mechanism to enhance uncertainty estimation. Experimental results show that the proposed approach outperforms existing baseline models on three complex reasoning tasks.
- 2024.findings-acl.209
+ 2024.findings-acl.209
zhang-etal-2024-star
10.18653/v1/2024.findings-acl.209
@@ -8910,7 +8910,7 @@
Shujian Huang, Nanjing University
3546-3562
This study investigates how Large Language Models (LLMs) leverage source and reference data in the machine translation evaluation task, aiming to better understand the mechanisms behind their remarkable performance in this task. We design controlled experiments across various input modes and model types, and employ both coarse-grained and fine-grained prompts to discern the utility of source versus reference information. We find that reference information significantly enhances evaluation accuracy, while, surprisingly, source information is sometimes counterproductive, indicating LLMs’ inability to fully leverage their cross-lingual capability when evaluating translations. Further analysis of the fine-grained evaluation and fine-tuning experiments shows similar results. These findings also suggest a potential research direction: fully exploiting the cross-lingual capability of LLMs to achieve better performance in machine translation evaluation tasks.
- 2024.findings-acl.211
+ 2024.findings-acl.211
huang-etal-2024-lost
10.18653/v1/2024.findings-acl.211
@@ -8926,7 +8926,7 @@
Jason Weston, New York University and Facebook
3563-3578
Generation of plausible yet incorrect factual information, termed hallucination, is an unsolved issue in large language models. We study the ability of language models to deliberate on the responses they give in order to correct their mistakes. We develop the Chain-of-Verification (CoVe) method whereby the model first (i) drafts an initial response; then (ii) plans verification questions to fact-check its draft; (iii) answers those questions independently so the answers are not biased by other responses; and (iv) generates its final verified response. In experiments, we show CoVe decreases hallucinations across a variety of tasks, from list-based questions from Wikidata and closed-book MultiSpanQA to longform text generation.
- 2024.findings-acl.212
+ 2024.findings-acl.212
dhuliawala-etal-2024-chain
10.18653/v1/2024.findings-acl.212
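
The four CoVe steps map directly onto four LLM calls. A minimal sketch, assuming a generic generate(prompt) helper; the prompt wording is illustrative rather than the authors' exact templates:

    def chain_of_verification(question, generate):
        # (i) draft an initial response
        draft = generate(f"Q: {question}\nA:")
        # (ii) plan verification questions that fact-check the draft
        plan = generate(
            "Write short fact-checking questions for this answer, "
            f"one per line.\nQ: {question}\nA: {draft}"
        )
        checks = [c.strip() for c in plan.splitlines() if c.strip()]
        # (iii) answer each check independently, without showing the draft,
        # so the verification answers are not biased by the response
        answers = [(c, generate(f"Q: {c}\nA:")) for c in checks]
        # (iv) generate the final verified response
        evidence = "\n".join(f"- {c} -> {a}" for c, a in answers)
        return generate(
            f"Q: {question}\nDraft answer: {draft}\n"
            f"Verification results:\n{evidence}\n"
            "Rewrite the answer, correcting anything the verifications "
            "contradict.\nA:"
        )
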
@@ -8942,7 +8942,7 @@
Rui Wang, Shanghai Jiao Tong University
3579-3602
Bargaining is an important and unique part of negotiation between humans. As LLM-driven agents learn to negotiate and act like real humans, how to evaluate agents’ bargaining abilities remains an open problem. For the first time, we formally describe the bargaining task as an asymmetric incomplete-information game, defining the gains of the Buyer and Seller over multiple bargaining processes. This allows us to quantitatively assess an agent’s performance in the bargaining task. We collected a real product price dataset, AmazonHistoryPrice, and evaluated various LLM agents’ bargaining abilities. We find that playing the Buyer is much harder than playing the Seller, and that increasing model size cannot effectively improve the Buyer’s performance. To address this challenge, we propose a novel approach called OG-Narrator that integrates a deterministic Offer Generator to control the price range of the Buyer’s offers with an LLM Narrator to create natural language sentences for the generated offers. Experimental results show that OG-Narrator improves the Buyer’s deal rates from 26.67% to 88.88% and brings a tenfold increase in profits across all baselines, even for a model that has not been aligned.
- 2024.findings-acl.213
+ 2024.findings-acl.213
xia-etal-2024-measuring
10.18653/v1/2024.findings-acl.213
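
The key design choice in OG-Narrator, decoupling the numeric offer from its verbalization, fits in a few lines. The concession schedule below is an invented placeholder policy; only the split between a deterministic generator and an LLM narrator reflects the abstract:

    def og_narrator_offer(turn, list_price, narrate):
        # deterministic offer generator: start low, concede a fixed
        # fraction per turn (illustrative schedule, not the paper's)
        price = round(list_price * min(0.60 + 0.05 * turn, 0.95), 2)
        # the LLM only turns the fixed price into natural language,
        # so it can never bargain the number away
        text = narrate(
            f"You are the buyer. Politely offer exactly ${price}."
        )
        return price, text
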
@@ -8971,7 +8971,7 @@
Mengfei Yang, China Academy of Space Technology
3603-3614
How to evaluate the coding abilities of Large Language Models (LLMs) remains an open question. We find that existing benchmarks are poorly aligned with real-world code repositories and are insufficient to evaluate the coding abilities of LLMs. To address the knowledge gap, we propose a new benchmark named DevEval, which has three advances. (1) DevEval aligns with real-world repositories in multiple dimensions, e.g., code and dependency distributions. (2) DevEval is annotated by 13 developers and contains comprehensive annotations (e.g., requirements, original repositories, reference code, and reference dependencies). (3) DevEval comprises 1,825 testing samples from 115 repositories, covering 10 popular domains (e.g., Internet, Database). Based on DevEval, we propose repository-level code generation and evaluate 8 popular LLMs on DevEval (e.g., gpt-4, gpt-3.5, StarCoder 2, DeepSeek Coder, CodeLLaMa). Our experiments reveal these LLMs’ coding abilities in real-world code repositories. For example, the highest Pass@1 of gpt-4 is only 53.04% in our experiments. We also analyze LLMs’ failed cases and summarize their shortcomings. We hope DevEval can facilitate the development of LLMs in real code repositories. DevEval, prompts, and LLMs’ predictions have been released.
- 2024.findings-acl.214
+ 2024.findings-acl.214
li-etal-2024-deveval
10.18653/v1/2024.findings-acl.214
@@ -8985,7 +8985,7 @@
Xueqi Cheng, Chinese Academy of Sciences
3615-3625
Exploring the application of large language models (LLMs) to graph learning is an emerging endeavor. However, the vast amount of information inherent in large graphs poses significant challenges to graph learning with LLMs. This work focuses on the link prediction task and introduces **LPNL** (Link Prediction via Natural Language), a framework based on large language models designed for scalable link prediction on large-scale heterogeneous graphs. We design novel prompts for link prediction that articulate graph details in natural language. We propose a two-stage sampling pipeline to extract crucial information from the graphs, and a divide-and-conquer strategy to control the input tokens within predefined limits, addressing the challenge of overwhelming information. We fine-tune a T5 model based on our self-supervised learning designed for link prediction. Extensive experimental results demonstrate that LPNL outperforms multiple advanced baselines in link prediction tasks on large-scale graphs.
- 2024.findings-acl.215
+ 2024.findings-acl.215
bi-etal-2024-lpnl
10.18653/v1/2024.findings-acl.215
@@ -8999,7 +8999,7 @@
Holger Schwenk
3626-3635
Multilingual parallel data for speech-to-speech translation is scarce and expensive to create from scratch. This is all the more true for expressive speech translation, which aims at preserving not only the semantics, but also the overall prosody (e.g. style, emotion, rate-of-speech). Existing corpora contain speech utterances with the same meaning, yet the overall prosody is typically different, as human annotators are not tasked with reproducing these aspects, and crowd-sourced efforts do not specifically target this kind of alignment as a priority. In this paper, we propose a novel alignment algorithm, which automatically forms pairs of speech segments aligned not only in meaning, but also in expressivity. In order to validate our approach, we train an expressive multilingual speech-to-speech translation system on the automatically aligned data. Our experiments show that in comparison to semantic-only approaches, expressively aligned data yields large improvements in source expressivity preservation (e.g. a 43% uplift in speech rate preservation on average), while still maintaining content translation quality. In some scenarios, results also indicate that this alignment algorithm can outperform standard, semantic-focused approaches even on content translation quality.
- 2024.findings-acl.216
+ 2024.findings-acl.216
heffernan-etal-2024-aligning
10.18653/v1/2024.findings-acl.216
@@ -9017,7 +9017,7 @@
Anh Tuan Luu, Nanyang Technological University
3636-3657
Humans use multiple senses to comprehend the environment. Vision and language are two of the most vital senses since they allow us to easily communicate our thoughts and perceive the world around us. There has been a lot of interest in creating video-language understanding systems with human-like senses since a video-language pair can mimic both our linguistic medium and visual environment with temporal dynamics. In this survey, we review the key tasks of these systems and highlight the associated challenges. Based on the challenges, we summarize their methods from the model architecture, model training, and data perspectives. We also conduct a performance comparison among the methods, and discuss promising directions for future research.
- 2024.findings-acl.217
+ 2024.findings-acl.217
nguyen-etal-2024-video
10.18653/v1/2024.findings-acl.217
@@ -9031,7 +9031,7 @@
Enhong Chen, University of Science and Technology of China
3658-3669
Since the release of ChatGPT, generative models have achieved tremendous success and become the de facto approach for various NLP tasks. However, their application in the field of input methods remains under-explored. Many neural network approaches have been applied to the construction of Chinese input method engines (IMEs). Previous research often assumed that the input pinyin was correct and focused on the Pinyin-to-character (P2C) task, which significantly falls short of meeting users’ demands. Moreover, previous research could not leverage user feedback to optimize the model or provide personalized results. In this study, we propose a novel Generative Input paradigm named GeneInput. It uses prompts to handle all input scenarios and other intelligent auxiliary input functions, optimizing the model with user feedback. The results demonstrate that we achieve state-of-the-art performance for the first time on the Full-mode Key-sequence to Characters task. GeneInput also includes RLHF-IME, a novel RLHF application framework for input methods, which eliminates the need for manual ranking annotations and whose performance surpasses GPT-4. Relevant resources have been open-sourced.
- 2024.findings-acl.218
+ 2024.findings-acl.218
ding-etal-2024-generative
10.18653/v1/2024.findings-acl.218
@@ -9043,10 +9043,10 @@
Bo Wang, School of Computer Science & Technology, Beijing Institute of Technology
Yuyue Zhao
Yong Liao, University of Science and Technology of China and China Academy of Electronics and Information Technology
- Pengyuan Zhou, Aarhus University
+ Peng Zhou, Aarhus University
3670-3685
Retrieval-Augmented Generation (RAG) is an effective solution to supplement necessary knowledge to large language models (LLMs). Targeting its bottleneck of retriever performance, a “generate-then-read” pipeline has been proposed to replace the retrieval stage with generation from the LLM itself. Although promising, this research direction is underexplored and still cannot work in scenarios where source knowledge is given. In this paper, we formalize a general “A + B” framework with varying combinations of foundation models and types for systematic investigation. We explore the efficacy of the base and chat versions of LLMs and find their different functionalities suitable for generator A and reader B, respectively. Their combinations consistently outperform single models, especially in complex scenarios. Furthermore, we extend the application of the “A + B” framework to scenarios involving source documents through continuous learning, enabling the direct integration of external knowledge into LLMs. This approach not only facilitates effective acquisition of new knowledge but also addresses the challenges of safety and helpfulness post-adaptation. The paper underscores the versatility of the “A + B” framework, demonstrating its potential to enhance the practical application of LLMs across various domains.
- 2024.findings-acl.219
+ 2024.findings-acl.219
tang-etal-2024-b
10.18653/v1/2024.findings-acl.219
@@ -9058,7 +9058,7 @@
Nghi Bui
3686-3704
Code Large Language Models (CodeLLMs) have ushered in a new era of code generation advancements. However, selecting the best code solution from all possible CodeLLM outputs remains a challenge. Previous methods often overlooked the intricate functional similarities and interactions between solution clusters. We introduce SRank, a novel reranking strategy for selecting the best solutions from code generation, focusing on modeling the relationships between clusters of solutions. By quantifying the functional overlap between solution clusters, our approach provides a better ranking strategy for code solutions. Empirical results show that our method achieves remarkable results on the pass@1 score. For instance, on the Human-Eval benchmark, we achieve 69.66% in pass@1 with Codex002, 75.31% with WizardCoder, 53.99% with StarCoder, and 60.55% with CodeGen, surpassing state-of-the-art code generation reranking methods such as CodeT and Coder-Reviewer on the same CodeLLMs by a significant margin (approximately 6.1% improvement on average). Even in scenarios with a limited number of sampled solutions and test cases, our approach demonstrates robustness and superiority, marking a new benchmark in code generation reranking. Our implementation can be found at https://github.com/FSoft-AI4Code/SRank-CodeRanker.
- 2024.findings-acl.220
+ 2024.findings-acl.220
to-etal-2024-functional
10.18653/v1/2024.findings-acl.220
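
Execution-based reranking of this kind is straightforward to prototype. The sketch below clusters sampled solutions by their behaviour on shared test inputs and scores each cluster by size weighted by agreement with the others; it follows the spirit of the abstract, not SRank's exact scoring formula, and run() is an assumed sandboxed executor:

    from collections import defaultdict

    def rerank(solutions, test_inputs, run):
        # cluster candidate programs by their output signature
        clusters = defaultdict(list)
        for code in solutions:
            sig = tuple(run(code, x) for x in test_inputs)
            clusters[sig].append(code)

        def overlap(a, b):
            # fraction of test inputs on which two clusters agree
            return sum(x == y for x, y in zip(a, b)) / len(test_inputs)

        def score(sig):
            others = (s for s in clusters if s != sig)
            agreement = sum(overlap(sig, s) * len(clusters[s]) for s in others)
            return len(clusters[sig]) * (1.0 + agreement)

        best = max(clusters, key=score)
        return clusters[best][0]
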
@@ -9075,7 +9075,7 @@
Xiaolong Li
3705-3716
Human preference alignment is essential to improve the interaction quality of large language models (LLMs). Existing alignment methods depend on manually annotated preference data to guide the LLM optimization directions. However, continuously updating LLMs for alignment raises a distribution gap between model-generated samples and human-annotated responses, hindering training effectiveness. To mitigate this issue, previous methods require additional preference annotation on newly generated samples to adapt to the shifted distribution, which consumes a large amount of annotation resources. Targeting more efficient human preference optimization, we propose an Adversarial Preference Optimization (APO) framework, in which the LLM and the reward model update alternately via a min-max game. Through adversarial training, the reward model can adapt to the shifted generation distribution of the LLM without any additional annotation. With comprehensive experiments, we find the proposed adversarial training framework further enhances existing alignment baselines in terms of LLM helpfulness and harmlessness. The code is at https://github.com/Linear95/APO.
- 2024.findings-acl.221
+ 2024.findings-acl.221
cheng-etal-2024-adversarial
10.18653/v1/2024.findings-acl.221
@@ -9090,7 +9090,7 @@
Xiangjie Kong
3717-3726
Aspect sentiment quad prediction (ASQP) has garnered significant attention in aspect-based sentiment analysis (ABSA). Current ASQP research primarily relies on pre-trained generative language models to produce templated sequences, often complemented by grid-based auxiliary methods. Despite these efforts, the persistent challenge of generation instability remains unresolved and the effectiveness of grid methods remains underexplored in current studies. To this end, we introduce Grid Noise Diffusion Pinpoint Network (GDP), a T5-based generative model aiming to tackle the issue of generation instability. The model consists of three novel modules, including Diffusion Vague Learning (DVL) to facilitate effective model learning and enhance overall robustness; Consistency Likelihood Learning (CLL) to discern the characteristics and commonalities of sentiment elements and thus reduce the impact of distributed noise; and GDP-FOR, a novel generation template, to enable models to generate outputs in a more natural way. Extensive experiments on four datasets demonstrate the remarkable effectiveness of our approach in addressing ASQP tasks.
- 2024.findings-acl.222
+ 2024.findings-acl.222
zhu-etal-2024-pinpointing
10.18653/v1/2024.findings-acl.222
@@ -9105,7 +9105,7 @@
Bhiksha Raj, Carnegie Mellon University and Mohamed bin Zayed University of Artificial Intelligence
3727-3741
Recently, neural networks have shown impressive progress across diverse fields, with speech processing being no exception. However, recent breakthroughs in this area require extensive offline training using large datasets and tremendous computing resources. Unfortunately, these models struggle to retain their previously acquired knowledge when learning new tasks continually. In this paper, we investigate the problem of learning sequence-to-sequence models for spoken language understanding in a class-incremental learning (CIL) setting and we propose COCONUT, a CIL method that relies on the combination of experience replay and contrastive learning. Through a modified version of the standard supervised contrastive loss, COCONUT preserves the learned representations by pulling closer samples from the same class and pushing away the others. Moreover, we leverage a multimodal contrastive loss that helps the model learn more discriminative representations of the new data by aligning audio and text features. We also investigate different contrastive designs to combine the strengths of the contrastive loss with teacher-student architectures used for distillation. Experiments on two established SLU datasets reveal the effectiveness of our proposed approach and significant improvements over the baselines. We also show that COCONUT can be combined with methods that operate on the decoder side of the model, resulting in further metrics improvements.
- 2024.findings-acl.223
+ 2024.findings-acl.223
cappellazzo-etal-2024-continual
10.18653/v1/2024.findings-acl.223
@@ -9117,7 +9117,7 @@
Siqiang Luo, Nanyang Technological University
3742-3759
Knowledge Graph (KG) inductive reasoning, which aims to infer missing facts from new KGs that are not seen during training, has been widely adopted in various applications. One critical challenge of KG inductive reasoning is handling low-resource scenarios with scarcity in both textual and structural aspects. In this paper, we attempt to address this challenge with Large Language Models (LLMs). Particularly, we utilize state-of-the-art LLMs to generate a graph-structural prompt to enhance pre-trained Graph Neural Networks (GNNs), which brings new methodological insights into KG inductive reasoning methods, as well as high generalizability in practice. On the methodological side, we introduce a novel pretraining and prompting framework, ProLINK, designed for low-resource inductive reasoning across arbitrary KGs without requiring additional training. On the practical side, we experimentally evaluate our approach on 36 low-resource KG datasets and find that ProLINK outperforms previous methods in three-shot, one-shot, and zero-shot reasoning tasks, exhibiting average performance improvements of 20%, 45%, and 147%, respectively. Furthermore, ProLINK demonstrates strong robustness across various LLM prompts as well as in full-shot scenarios.
- 2024.findings-acl.224
+ 2024.findings-acl.224
wang-etal-2024-llm
10.18653/v1/2024.findings-acl.224
@@ -9130,7 +9130,7 @@
Yusuke Miyao, The University of Tokyo
3760-3772
Unsupervised constituency parsing focuses on identifying word sequences that form a syntactic unit (i.e., constituents) in target sentences. Linguists identify the constituent by evaluating a set of Predicate-Argument Structure (PAS) equivalent sentences where we find the constituent appears more frequently than non-constituents (i.e., the constituent corresponds to a frequent word sequence within the sentence set). However, such frequency information is unavailable in previous parsing methods that identify the constituent by observing sentences with diverse PAS. In this study, we empirically show that constituents correspond to frequent word sequences in the PAS-equivalent sentence set. We propose a frequency-based parser, span-overlap, that (1) computes the span-overlap score as the word sequence’s frequency in the PAS-equivalent sentence set and (2) identifies the constituent structure by finding a constituent tree with the maximum span-overlap score. The parser achieves state-of-the-art level parsing accuracy, outperforming existing unsupervised parsers in eight out of ten languages. Additionally, we discover a multilingual phenomenon: participant-denoting constituents tend to have higher span-overlap scores than equal-length event-denoting constituents, meaning that the former tend to appear more frequently in the PAS-equivalent sentence set than the latter. The phenomenon indicates a statistical difference between the two constituent types, laying the foundation for future labeled unsupervised parsing research.
- 2024.findings-acl.225
+ 2024.findings-acl.225
chen-etal-2024-unsupervised
10.18653/v1/2024.findings-acl.225
@@ -9144,7 +9144,7 @@
Ying Wang, Jilin University
3773-3786
The human-like social bias of pre-trained language models (PLMs) on downstream tasks has attracted increasing attention. Potential flaws in the training data are the main factor causing unfairness in PLMs. Existing data-centric debiasing strategies mainly leverage explicit bias words (defined as sensitive attribute words specific to demographic groups) for counterfactual data augmentation to balance the training data. However, they lack consideration of implicit bias words potentially associated with explicit bias words in complex distribution data, which indirectly harms the fairness of PLMs. To this end, we propose a **Data**-Centric **Debias**ing method (named Data-Debias), which uses an explainability method to search for implicit bias words to assist in debiasing PLMs. Specifically, we compute the feature attributions of all tokens using the Integrated Gradients method, and then treat the tokens that have a large impact on the model’s decision as implicit bias words. To make the search results more precise, we iteratively train a biased model to amplify the bias with each iteration. Finally, we use the implicit bias words searched in the last iteration to assist in debiasing PLMs. Extensive experimental results from debiasing multiple PLMs on three different classification tasks demonstrate that Data-Debias achieves state-of-the-art debiasing performance and strong generalization while maintaining predictive abilities.
- 2024.findings-acl.226
+ 2024.findings-acl.226
li-etal-2024-data
10.18653/v1/2024.findings-acl.226
@@ -9157,7 +9157,7 @@
Ramakanth Kavuluru, University of Kentucky
3787-3797
Relation extraction (RE) is a well-known NLP application often treated as a sentence- or document-level task. However, a handful of recent efforts explore it across documents, i.e., in the cross-document setting (CrossDocRE). This is distinct from the single-document case because different documents often focus on disparate themes, while text within a document tends to have a single goal. Current CrossDocRE efforts do not consider domain knowledge, which is often assumed to be known to the reader when documents are authored. Here, we propose a novel approach, KXDocRE, that embeds domain knowledge of entities with input text for cross-document RE. Our proposed framework has three main benefits over baselines: 1) it incorporates domain knowledge of entities along with documents’ text; 2) it offers interpretability by producing explanatory text for predicted relations between entities; 3) it improves performance over prior methods. Code and models are available at https://github.com/kracr/cross-doc-relation-extraction.
- 2024.findings-acl.227
+ 2024.findings-acl.227
jain-etal-2024-knowledge
10.18653/v1/2024.findings-acl.227
@@ -9168,7 +9168,7 @@
Yun-Nung Chen, Department of Computer Science and Information Engineering, National Taiwan University
3798-3812
Recent research in dialogue systems focuses on two main categories: task-oriented (TOD) and open-domain (chit-chat) dialogues. TOD systems help users complete specific tasks, while open-domain systems aim to create engaging conversations. However, user intents often emerge during interactions. A recent study introduced SalesBot, simulating dialogues that transition from chit-chat to task-oriented scenarios to train sales agents. Unfortunately, the initial data lacked smooth transitions and coherent long dialogues, resulting in unnatural interactions. This paper presents SalesBot 2.0, an improved dataset leveraging commonsense knowledge from large language models (LLMs) through strategic prompting. Additionally, we introduce SalesAgent, a novel model trained on salesperson interactions using chain-of-thought (CoT) reasoning. This model excels in transitioning topics, understanding user intents, and selecting appropriate strategies. Experiments with diverse user simulations validate our method’s effectiveness in controlling dialogue strategies in LLMs. SalesBot 2.0 enhances coherence and reduces aggression, improving model learning for sales-customer interactions.
- 2024.findings-acl.228
+ 2024.findings-acl.228
chang-chen-2024-injecting
10.18653/v1/2024.findings-acl.228
@@ -9184,7 +9184,7 @@
Xiaojie Wang, Beijing University of Posts and Telecommunications
3813-3828
Although large language models (LLMs) show remarkable capabilities and generalizability across various tasks, they are criticized for their lack of expertise. One promising solution is to combine knowledge graphs (KGs) with LLMs, and recent studies focus on integrating KGs into LLMs through prompt-based methods. However, these approaches fail to use the structural information of the KGs, suffer from the problem of knowledge conflict, and over-rely on super LLMs. To address these challenges, we propose KG-Adapter, a parameter-level KG integration method based on parameter-efficient fine-tuning (PEFT). Specifically, we introduce a novel adapter structure designed for decoder-only LLMs, which can encode KGs from both node-centered and relation-centered perspectives, and then perform joint reasoning with LLMs to generate responses end-to-end. Experiments with diverse models on four datasets for two different tasks all demonstrate significant improvements. With only 28M parameters trained, we make a 7B-parameter LLM outperform the previous full-parameter fine-tuned state-of-the-art method and perform comparably to prompt-based ChatGPT methods.
- 2024.findings-acl.229
+ 2024.findings-acl.229
tian-etal-2024-kg
10.18653/v1/2024.findings-acl.229
@@ -9203,7 +9203,7 @@
Kun Gai
3829-3852
Although chain-of-thought (CoT) prompting combined with language models has achieved encouraging results on complex reasoning tasks, the naive greedy decoding used in CoT prompting usually causes repetitiveness and local optimality. To address this shortcoming, ensemble optimization tries to obtain multiple reasoning paths and assemble them into a final answer. However, current ensemble-optimization methods either simply employ rule-based post-processing such as self-consistency, or train an additional model based on several task-related human annotations to select the best one among multiple reasoning paths, and thus fail to generalize to realistic settings where the type of input questions or the answer format of reasoning paths is unknown. To avoid their limitations, we propose Self-Agreement, a generalizable ensemble-optimization method applicable in almost all scenarios, whether the type of input questions and the answer format of reasoning paths are known or unknown. Self-Agreement first samples from the language model’s decoder to generate a diverse set of reasoning paths, and subsequently prompts the language model one more time to determine the optimal answer by selecting the most agreed-upon answer among the sampled reasoning paths. Self-Agreement simultaneously achieves remarkable performance on six public reasoning benchmarks and superior generalization capabilities.
- 2024.findings-acl.230
+ 2024.findings-acl.230
lin-etal-2024-just
10.18653/v1/2024.findings-acl.230
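
Because selection is delegated back to the model, Self-Agreement needs no answer parsing at all, which is what makes it format-agnostic. A hedged sketch, assuming a sample() helper that returns n completions and a generate() helper that returns one:

    def self_agreement(question, sample, generate, n=10):
        # phase 1: diverse reasoning paths via temperature sampling
        paths = sample(f"Q: {question}\nLet's think step by step.",
                       n=n, temperature=0.7)
        # phase 2: one more call to pick the most agreed-upon answer;
        # no assumptions about question type or answer format are needed
        listing = "\n".join(f"[{i + 1}] {p}" for i, p in enumerate(paths))
        return generate(
            f"Q: {question}\nCandidate solutions:\n{listing}\n"
            "Which final answer do most candidates agree on? "
            "Reply with that answer only."
        )
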
@@ -9217,7 +9217,7 @@
Dan Roth, University of Pennsylvania
3853-3878
Large Language Models (LLMs) excel in natural language understanding, but their capability for complex mathematical reasoning over a hybrid of structured tables and unstructured text remains uncertain. This study explores LLMs’ mathematical reasoning on four financial tabular question-answering datasets: TATQA, FinQA, ConvFinQA, and Multihiertt. Through extensive experiments with various models and prompting techniques, we assess how LLMs adapt to complex tables and mathematical tasks. We focus on sensitivity to table complexity and performance variations with an increasing number of arithmetic reasoning steps. The results provide insights into LLMs’ capabilities and limitations in handling complex mathematical scenarios for semi-structured tables. Ultimately, we introduce a novel prompting technique, EEDP, tailored to semi-structured documents, matching or outperforming baseline performance while providing a nuanced understanding of LLMs’ abilities.
- 2024.findings-acl.231
+ 2024.findings-acl.231
srivastava-etal-2024-evaluating
10.18653/v1/2024.findings-acl.231
@@ -9232,7 +9232,7 @@
Ruifeng Xu, Harbin Institute of Technology
3879-3890
Large language models (LLMs) have achieved promising results in sentiment analysis through the in-context learning (ICL) paradigm. However, their ability to distinguish subtle sentiments still remains a challenge. Inspired by the human ability to adjust understanding via feedback, this paper enhances ICL by incorporating prior predictions and feedback, aiming to rectify sentiment misinterpretation of LLMs. Specifically, the proposed framework consists of three steps: (1) acquiring prior predictions of LLMs, (2) devising predictive feedback based on correctness, and (3) leveraging a feedback-driven prompt to refine sentiment understanding. Experimental results across nine sentiment analysis datasets demonstrate the superiority of our framework over conventional ICL methods, with an average F1 improvement of 5.95%.
- 2024.findings-acl.232
+ 2024.findings-acl.232
xu-etal-2024-improving
10.18653/v1/2024.findings-acl.232
@@ -9246,7 +9246,7 @@
Ji-Rong Wen, Renmin University of China
3891-3902
Finding interpretable factors for stock returns is the most vital issue in the empirical asset pricing domain. As data-driven methods, existing factor mining models can be categorized into symbol-based and neural-based models. Symbol-based models are interpretable but inefficient, while neural-based approaches are efficient but lack interpretability. Hence, mining interpretable factors effectively presents a significant challenge. Inspired by the success of Large Language Models (LLMs) in various tasks, we propose a FActor Mining Agent (FAMA) model that enables LLMs to integrate the strengths of both neural and symbolic models for factor mining. In this paper, FAMA consists of two main components: Cross-Sample Selection (CSS) and Chain-of-Experience (CoE). CSS addresses the homogeneity challenges in LLMs during factor mining by assimilating diverse factors as in-context samples, whereas CoE enables LLMs to leverage past successful mining experiences, expediting the mining of effective factors. Experimental evaluations on real-world stock market data demonstrate the effectiveness of our approach by surpassing the SOTA RankIC by 0.006 and RankICIR by 0.105 in predicting S&P 500 returns. Furthermore, the investment simulation shows that our model can achieve superior performance with an annualized return of 38.4% and a Sharpe ratio of 667.2%.
- 2024.findings-acl.233
+ 2024.findings-acl.233
li-etal-2024-large-language
10.18653/v1/2024.findings-acl.233
@@ -9260,7 +9260,7 @@
Kang Liu, Institute of Automation, Chinese Academy of Sciences
3903-3922
Large language models (LLMs) internalize enormous parametric knowledge during pre-training. Concurrently, realistic applications necessitate external contextual knowledge to aid models on the underlying tasks. This raises a crucial dilemma known as knowledge conflicts, where the contextual knowledge clashes with the parametric knowledge. However, existing decoding works specialized in resolving knowledge conflicts could inadvertently deteriorate performance in the absence of conflicts. In this paper, we propose an adaptive decoding method, termed contextual information-entropy constraint decoding (COIECD), to discern whether knowledge conflicts occur and resolve them. It can improve the model’s faithfulness to conflicting context while simultaneously maintaining high performance on non-conflicting context. Our experiments show that COIECD exhibits strong performance and robustness over knowledge conflicts in realistic datasets.
- 2024.findings-acl.234
+ 2024.findings-acl.234
yuan-etal-2024-discerning
10.18653/v1/2024.findings-acl.234
@@ -9276,7 +9276,7 @@
Jing Shao, Shanghai AI Laboratory
3923-3954
In the rapidly evolving landscape of Large Language Models (LLMs), ensuring robust safety measures is paramount. To meet this crucial need, we propose SALAD-Bench, a safety benchmark specifically designed for evaluating LLMs as well as attack and defense methods. Distinguished by its breadth, SALAD-Bench transcends conventional benchmarks through its large scale, rich diversity, intricate taxonomy spanning three levels, and versatile functionalities. SALAD-Bench is crafted with a meticulous array of questions, from standard queries to complex ones enriched with attack and defense modifications and multiple-choice options. To effectively manage the inherent complexity, we introduce an innovative evaluator: the LLM-based MD-Judge for QA pairs, with a particular focus on attack-enhanced queries, ensuring seamless and reliable evaluation. The above components extend SALAD-Bench from standard LLM safety evaluation to the evaluation of both LLM attack and defense methods, ensuring joint-purpose utility. Our extensive experiments shed light on the resilience of LLMs against emerging threats and the efficacy of contemporary defense tactics. Data and the evaluator are released at https://github.com/OpenSafetyLab/SALAD-BENCH
- 2024.findings-acl.235
+ 2024.findings-acl.235
li-etal-2024-salad
10.18653/v1/2024.findings-acl.235
@@ -9290,7 +9290,7 @@
Vladimir Araujo, KU Leuven
3955-3986
Advancing representation learning in specialized fields like medicine remains challenging due to the scarcity of expert annotations for text and images. To tackle this issue, we present a novel two-stage framework designed to extract high-quality factual statements from free-text radiology reports in order to improve the representations of text encoders and, consequently, their performance on various downstream tasks. In the first stage, we propose a Fact Extractor that leverages large language models (LLMs) to identify factual statements from well-curated domain-specific datasets. In the second stage, we introduce a Fact Encoder (CXRFE) based on a BERT model fine-tuned with objective functions designed to improve its representations using the extracted factual data. Our framework also includes a new embedding-based metric (CXRFEScore) for evaluating chest X-ray text generation systems, leveraging both stages of our approach. Extensive evaluations show that our fact extractor and encoder outperform current state-of-the-art methods in tasks such as sentence ranking, natural language inference, and label extraction from radiology reports. Additionally, our metric proves to be more robust and effective than existing metrics commonly used in the radiology report generation literature. The code of this project is available at https://github.com/PabloMessina/CXR-Fact-Encoder.
- 2024.findings-acl.236
+ 2024.findings-acl.236
messina-etal-2024-extracting
10.18653/v1/2024.findings-acl.236
@@ -9304,7 +9304,7 @@
Hinrich Schuetze
3987-4001
Large Language Models (LLMs) exhibit strong In-Context Learning (ICL) capabilities when prompts with demonstrations are used. However, fine-tuning still remains crucial to further enhance their adaptability. Prompt-based fine-tuning proves to be an effective fine-tuning method in low-data scenarios, but high demands on computing resources limit its practicality. We address this issue by introducing a prompt-based parameter-efficient fine-tuning (PEFT) approach, GNNavi. GNNavi leverages insights into ICL’s information flow dynamics, which indicate that label words act in prompts as anchors for information propagation. GNNavi employs a Graph Neural Network (GNN) layer to precisely guide the aggregation and distribution of information flow during the processing of prompts, by hardwiring the desired information flow into the GNN. Our experiments on text classification tasks with GPT-2 and Llama2 show that GNNavi surpasses standard prompt-based fine-tuning methods in few-shot settings by updating just 0.2% to 0.5% of parameters. We compare GNNavi with prevalent PEFT approaches, such as prefix tuning, LoRA, and Adapter, in terms of performance and efficiency. Our analysis reveals that GNNavi enhances information flow and ensures a clear aggregation process.
- 2024.findings-acl.237
+ 2024.findings-acl.237
yuan-etal-2024-gnnavi
10.18653/v1/2024.findings-acl.237
@@ -9319,7 +9319,7 @@
Stefan Winkler, National University of Singapore
4002-4042
There is active research on adapting Large Language Models (LLMs) to perform a variety of tasks in high-stakes domains such as healthcare. Despite their popularity, there is a lack of understanding of the extent and contributing factors that allow LLMs to recall relevant knowledge and combine it with presented information in the clinical and biomedical domain: a fundamental prerequisite for success on downstream tasks. Addressing this gap, we use Multiple Choice and Abstractive Question Answering to conduct a large-scale empirical study on 22 datasets in three generalist and three specialist biomedical sub-domains. Our multifaceted analysis of the performance of 15 LLMs, further broken down by sub-domain, source of knowledge, and model architecture, uncovers success factors such as instruction tuning that lead to improved recall and comprehension. We further show that while recently proposed domain-adapted models may lack adequate knowledge, directly fine-tuning on our collected medical knowledge datasets shows encouraging results, even generalising to unseen specialist sub-domains. We complement the quantitative results with a skill-oriented manual error analysis, which reveals a significant gap between the models’ capabilities to simply recall necessary knowledge and to integrate it with the presented context. To foster research and collaboration in this field, we share M-QALM, our resources, standardised methodology, and evaluation results, with the research community to facilitate further advancements in clinical knowledge representation learning within language models.
- 2024.findings-acl.238
+ 2024.findings-acl.238
subramanian-etal-2024-qalm
10.18653/v1/2024.findings-acl.238
@@ -9330,7 +9330,7 @@
Frank Keller, University of Edinburgh
4043-4050
Movie screenplay summarization is challenging, as it requires an understanding of long input contexts and various elements unique to movies. Large language models have shown significant advancements in document summarization, but they often struggle with processing long input contexts. Furthermore, while television transcripts have received attention in recent studies, movie screenplay summarization remains underexplored. To stimulate research in this area, we present a new dataset, MovieSum, for abstractive summarization of movie screenplays. This dataset comprises 2200 movie screenplays accompanied by their Wikipedia plot summaries. We manually formatted the movie screenplays to represent their structural elements. Compared to existing datasets, MovieSum possesses several distinctive features: 1) It includes movie screenplays which are longer than scripts of TV episodes. 2) It is twice the size of previous movie screenplay datasets. 3) It provides metadata with IMDb IDs to facilitate access to additional external knowledge. We also show the results of recently released large language models applied to summarization on our dataset to provide a detailed baseline.
- 2024.findings-acl.239
+ 2024.findings-acl.239
saxena-keller-2024-moviesum
10.18653/v1/2024.findings-acl.239
@@ -9350,7 +9350,7 @@
Pablo Cesar, Delft University of Technology and Centrum Wiskunde & Informatica (CWI)
4051-4066
Autonomous artificial intelligence (AI) agents have emerged as promising protocols for automatically understanding language-based environments, particularly with the exponential development of large language models (LLMs). However, a fine-grained, comprehensive understanding of multimodal environments remains under-explored. This work designs an autonomous workflow tailored for integrating AI agents seamlessly into extended reality (XR) applications for fine-grained training. We present a demonstration of a multimodal fine-grained training assistant for LEGO brick assembly in a pilot XR environment. Specifically, we design a cerebral language agent that integrates an LLM with memory, planning, and interaction with XR tools, and a vision-language agent, enabling agents to decide their actions based on past experiences. Furthermore, we introduce LEGO-MRTA, a multimodal fine-grained assembly dialogue dataset synthesized automatically in the workflow and served by a commercial LLM. This dataset comprises multimodal instruction manuals, conversations, XR responses, and vision question answering. Finally, we benchmark several prevailing open-source LLMs, assessing their performance with and without fine-tuning on the proposed dataset. We anticipate that the broader impact of this workflow will advance the development of smarter assistants for seamless user interaction in XR environments, fostering research in both the AI and HCI communities.
- 2024.findings-acl.240
+ 2024.findings-acl.240
pei-etal-2024-autonomous
10.18653/v1/2024.findings-acl.240
@@ -9362,7 +9362,7 @@
Diyi Yang, Stanford University
4067-4081
English NLP systems have empirically worse performance for dialects other than Standard American English (SAmE). However, how these discrepancies impact use of language technology by speakers of non-SAmE global Englishes is not well understood. We focus on reducing this gap for South Asian Englishes (SAsE), a macro-group of regional varieties with cumulatively more speakers than SAmE, by surveying SAsE speakers about their interactions with language technology and compare their responses to a control survey of SAmE speakers. SAsE speakers are more likely to recall failures with language technology and more likely to reference specific issues with written language technology than their SAmE counterparts. Furthermore, SAsE speakers indicate that they modify both their lexicon and syntax to make technology work better, but that lexical issues are perceived as the most salient challenge. We then assess whether these issues are pervasive in more recently developed Large Language Models (LLMs), introducing two benchmarks for broader SAsE Lexical and Indian English Syntactic understanding and evaluating 11 families of LLMs on them.
- 2024.findings-acl.241
+ 2024.findings-acl.241
holt-etal-2024-perceptions
10.18653/v1/2024.findings-acl.241
@@ -9375,7 +9375,7 @@
Christian Bartelt, Universität Mannheim
4082-4102
Transformers demonstrate impressive performance on a range of reasoning benchmarks. To evaluate the degree to which these abilities are a result of actual reasoning, existing work has focused on developing sophisticated benchmarks for behavioral studies. However, these studies do not provide insights into the internal mechanisms driving the observed capabilities. To improve our understanding of the internal mechanisms of transformers, we present a comprehensive mechanistic analysis of a transformer trained on a synthetic reasoning task. We identify a set of interpretable mechanisms the model uses to solve the task, and validate our findings using correlational and causal evidence. Our results suggest that the model implements a depth-bounded recurrent mechanism that operates in parallel and stores intermediate results in selected token positions. We anticipate that the motifs we identified in our synthetic setting can provide valuable insights into the broader operating principles of transformers and thus provide a basis for understanding more complex models.
- 2024.findings-acl.242
+ 2024.findings-acl.242
brinkmann-etal-2024-mechanistic
10.18653/v1/2024.findings-acl.242
@@ -9391,7 +9391,7 @@
Tingwen Liu, Institute of Information Engineering, Chinese Academy of Sciences
4103-4117
Multimodal entity linking (MEL) aims to link ambiguous mentions in multimodal contexts to entities in a multimodal knowledge graph. A pivotal challenge is to fully leverage multi-element correlations between mentions and entities to bridge the modality gap and enable fine-grained semantic matching. Existing methods attempt several local correlative mechanisms, relying heavily on the automatically learned attention weights, which may over-concentrate on partial correlations. To mitigate this issue, we formulate the correlation assignment problem as an optimal transport (OT) problem, and propose a novel MEL framework, namely OT-MEL, with OT-guided correlation assignment. Thereby, we exploit the correlation between multimodal features to enhance multimodal fusion, and the correlation between mentions and entities to enhance fine-grained matching. To accelerate model prediction, we further leverage knowledge distillation to transfer OT assignment knowledge to the attention mechanism. Experimental results show that our model significantly outperforms previous state-of-the-art baselines and confirm the effectiveness of the OT-guided correlation assignment.
- 2024.findings-acl.243
+ 2024.findings-acl.243
zhang-etal-2024-optimal
10.18653/v1/2024.findings-acl.243
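The OT formulation above is concrete enough to sketch. Below is a minimal, hypothetical illustration of entropy-regularized optimal transport via Sinkhorn iterations, the standard solver for such soft correlation-assignment problems; it is not OT-MEL's released code, and the toy mention/entity features are invented for the example.

```python
import numpy as np

def sinkhorn(cost, reg=0.1, n_iters=100):
    """Entropy-regularized optimal transport (Sinkhorn iterations).
    Returns a soft assignment (transport plan) between row and column elements."""
    m, n = cost.shape
    K = np.exp(-cost / reg)          # Gibbs kernel
    a = np.full(m, 1.0 / m)          # uniform mass over mention elements
    b = np.full(n, 1.0 / n)          # uniform mass over entity elements
    u, v = np.ones(m), np.ones(n)
    for _ in range(n_iters):
        u = a / (K @ v)              # rescale rows to match marginal a
        v = b / (K.T @ u)            # rescale columns to match marginal b
    return u[:, None] * K * v[None, :]

# toy features: 3 mention elements vs. 4 entity elements (hypothetical)
rng = np.random.default_rng(0)
mention = rng.normal(size=(3, 8))
entity = rng.normal(size=(4, 8))
mn = np.linalg.norm(mention, axis=1, keepdims=True)
en = np.linalg.norm(entity, axis=1, keepdims=True).T
cost = 1.0 - (mention @ entity.T) / (mn * en)   # cosine distance
plan = sinkhorn(cost)
print(plan.round(3), plan.sum().round(3))        # total mass sums to 1
```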
@@ -9403,7 +9403,7 @@
Ryan Cotterell, Swiss Federal Institute of Technology
4118-4135
Recent work by Hewitt et al. (2020) provides an interpretation of the empirical success of recurrent neural networks (RNNs) as language models (LMs). It shows that RNNs can efficiently represent bounded hierarchical structures that are prevalent in human language. This suggests that RNNs’ success might be linked to their ability to model hierarchy. However, a closer inspection of Hewitt et al.’s (2020) construction shows that it is not inherently limited to hierarchical structures. This poses a natural question: What other classes of LMs can RNNs efficiently represent? To this end, we generalize Hewitt et al.’s (2020) construction and show that RNNs can efficiently represent a larger class of LMs than previously claimed—specifically, those that can be represented by a pushdown automaton with a bounded stack and a specific stack update function. Altogether, the efficiency of representing this diverse class of LMs with RNN LMs suggests novel interpretations of their inductive bias.
- 2024.findings-acl.244
+ 2024.findings-acl.244
svete-etal-2024-efficiently
10.18653/v1/2024.findings-acl.244
@@ -9416,7 +9416,7 @@
Simone Ponzetto, University of Mannheim
4136-4155
In this survey, we provide a systematic review of recent work on modelling morality in text, an area of research that has garnered increasing attention in recent years. Our survey is motivated by the importance of modelling decisions on the created resources, the models trained on these resources and the analyses that result from the models’ predictions. We review work at the interface of NLP, Computational Social Science and Psychology and give an overview of the different goals and research questions addressed in the papers, their underlying theoretical backgrounds and the methods that have been applied to pursue these goals. We then identify and discuss challenges and research gaps, such as the lack of a theoretical framework underlying the operationalisation of morality in text, the low IAA reported for many of the resulting human-annotated resources and the lack of validation of newly proposed resources and analyses.
- 2024.findings-acl.245
+ 2024.findings-acl.245
reinig-etal-2024-survey
10.18653/v1/2024.findings-acl.245
@@ -9433,7 +9433,7 @@
Heng Huang, University of Maryland, College Park
4156-4172
Data selection in instruction tuning emerges as a pivotal process for acquiring high-quality data and training instruction-following large language models (LLMs), but it is still a new and unexplored research area for vision-language models (VLMs). Existing data selection approaches on LLMs either rely on single unreliable scores, or use downstream tasks for selection, which is time-consuming and can lead to potential over-fitting on the chosen evaluation datasets. To address this challenge, we introduce a novel dataset selection method, Self-Filter, that utilizes the VLM itself as a filter. This approach is inspired by the observation that VLMs benefit from training with the most challenging instructions. Self-Filter operates in two stages. In the first stage, we devise a scoring network to evaluate the difficulty of training instructions, which is co-trained with the VLM. In the second stage, we use the trained score net to measure the difficulty of each instruction, select the most challenging samples, and penalize similar samples to encourage diversity. Comprehensive experiments on LLaVA and MiniGPT-4 show that Self-Filter can reach better results compared to full data settings with merely about 15% of the samples, and can achieve superior performance against competitive baselines.
- 2024.findings-acl.246
+ 2024.findings-acl.246
chen-etal-2024-vision
10.18653/v1/2024.findings-acl.246
@@ -9452,7 +9452,7 @@
Maosong Sun
4173-4198
Large Language Models (LLMs) have demonstrated exceptional coding capability. However, as another critical component of programming proficiency, the debugging capability of LLMs remains relatively unexplored. Previous evaluations of LLMs’ debugging ability are significantly limited by the risk of data leakage, the scale of the dataset, and the variety of tested bugs. To overcome these deficiencies, we introduce ‘DebugBench’, an LLM debugging benchmark consisting of 4,253 instances. It covers four major bug categories and 18 minor types in C++, Java, and Python. To construct DebugBench, we collect code snippets from the LeetCode community, implant bugs into source data with GPT-4, and assure rigorous quality checks. We evaluate two commercial and four open-source models in a zero-shot scenario. We find that (1) while closed-source models exhibit inferior debugging performance compared to humans, open-source models achieve relatively lower pass rate scores; (2) the complexity of debugging notably fluctuates depending on the bug category; (3) incorporating runtime feedback has a clear impact on debugging performance, but it is not always helpful. As an extension, we also compare LLM debugging and code generation, revealing a strong correlation between them for closed-source models. These findings will benefit the development of LLMs in debugging.
- 2024.findings-acl.247
+ 2024.findings-acl.247
tian-etal-2024-debugbench
10.18653/v1/2024.findings-acl.247
@@ -9464,7 +9464,7 @@
Hao Xu, Jilin University
4199-4210
The objective of the Causal Emotion Entailment (CEE) task is to identify the causes of the target emotional utterances in a given conversation. Most existing studies have focused on a fine-tuning paradigm based on a pretrained model, e.g., the BERT model. However, there are gaps between the pretrained task and the CEE task. Although a pretrained model enhances contextual comprehension to some extent, it cannot acquire specific knowledge that is relevant to the CEE task. In addition, in a typical CEE task, there are peculiarities in the distribution of the positions with different emotion types of emotion utterances and cause utterances in conversations. Existing methods employ a fixed-size window to capture the relationship between neighboring conversations; however, these methods ignore the specific semantic associations between emotions and cause utterances. To address these issues, we propose the Position-oriented Prompt-tuning (POP-CEE) model to solve the CEE task in an end-to-end manner. Specifically, we can model the CEE task by designing prompts with multiple unified goals and by exploring the positional relationship between emotion and cause utterances using a position constraint module. Experimental results demonstrate that the proposed POP-CEE model achieves state-of-the-art performance on a benchmark dataset. Our code and data can be found at: https://github.com/Zh0uzh/POP-CEE.
- 2024.findings-acl.248
+ 2024.findings-acl.248
zhou-etal-2024-pop
10.18653/v1/2024.findings-acl.248
@@ -9475,7 +9475,7 @@
Zhang Huaping, Beijing Institute of Technology
4211-4218
Context length expansion of transformer models is considered a key challenge, especially when handling context beyond the training length during the inference stage. In this paper, we propose Generalized extrapolatioN scalE (GeNE), a set of parameterized extrapolation functions applied to each layer and attention head to adaptively adjust their extrapolation scales. Experimental results show that GeNE provides a significant improvement on long-context language modeling. By randomly scaling the extrapolation ratio during finetuning, GeNE achieves stable extrapolation on 64k contexts by training on 16k-length text. Further, the instruction-following Llama2 model based on GeNE achieved competitive results compared with other open-source models of the same parameter scale.
- 2024.findings-acl.249
+ 2024.findings-acl.249
li-huaping-2024-context
10.18653/v1/2024.findings-acl.249
@@ -9487,7 +9487,7 @@
Luciana Benotti, Universidad Nacional de Córdoba
4219-4229
Recently, large multi-modal models (LMMs) have emerged with the capacity to perform vision tasks such as captioning and visual question answering (VQA) with unprecedented accuracy. Applications such as helping the blind or visually impaired have a critical need for precise answers. It is especially important for models to be well calibrated and able to quantify their uncertainty in order to selectively decide when to answer and when to abstain or ask for clarifications. We perform the first in-depth analysis of calibration methods and metrics for VQA with in-context learning LMMs. Studying VQA on two answerability benchmarks, we show that the likelihood score of visually grounded models is better calibrated than in their text-only counterparts for in-context learning, where sampling-based methods are generally superior, but no clear winner arises. We propose Avg BLEU, a calibration score combining the benefits of both sampling and likelihood methods across modalities.
- 2024.findings-acl.250
+ 2024.findings-acl.250
eisenschlos-etal-2024-selectively
10.18653/v1/2024.findings-acl.250
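For readers unfamiliar with the calibration metrics such studies build on, here is a hedged sketch of expected calibration error (ECE), a standard measure of how well likelihood scores track answer correctness. The confidence/correctness arrays are hypothetical stand-ins, and this is not the paper's Avg BLEU score.

```python
import numpy as np

def expected_calibration_error(confidences, correct, n_bins=10):
    """Standard ECE: bin predictions by confidence, then average
    |bin accuracy - bin confidence| weighted by bin size.
    `confidences` are likelihood scores in (0, 1]; `correct` are 0/1 flags."""
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            gap = abs(correct[mask].mean() - confidences[mask].mean())
            ece += mask.mean() * gap
    return ece

# toy usage: four answers with hypothetical confidences and correctness
print(expected_calibration_error([0.9, 0.8, 0.6, 0.3], [1, 1, 0, 0]))
```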
@@ -9503,7 +9503,7 @@
Zhou Zhao, Zhejiang University
4230-4242
Speech-to-SQL (S2SQL) aims to convert spoken questions into SQL queries given relational databases, which has traditionally been implemented in a cascaded manner while facing the following challenges: 1) model training is faced with the major issue of data scarcity, where limited parallel data is available; and 2) the systems should be robust enough to handle diverse out-of-domain speech samples that differ from the source data. In this work, we propose the direct generalizable speech-to-SQL parsing model Wav2SQL which avoids error compounding across cascaded systems. Specifically, 1) to accelerate speech-driven SQL parsing research in the community, we release a large-scale and multi-accent dataset, MASpider; 2) leveraging recent progress in large-scale pre-training, we show that it alleviates the data scarcity issue and allows for direct speech-to-SQL parsing; and 3) we include the speech re-programming and gradient reversal classifier techniques to reduce acoustic variance and learn style-agnostic representations, improving generalization to unseen out-of-domain custom data. Experimental results demonstrate that Wav2SQL avoids error compounding and achieves state-of-the-art results with an accuracy improvement of up to 4.7% over the baseline.
- 2024.findings-acl.251
+ 2024.findings-acl.251
liu-etal-2024-wav2sql
10.18653/v1/2024.findings-acl.251
@@ -9525,7 +9525,7 @@
Bo Zheng, Alibaba Group
4243-4253
Training Large Language Models (LLMs) to process extensive context lengths incurs prohibitive computational costs. Prevailing techniques for extending context capabilities in LLMs typically require not only additional training procedures but also access to datasets with long context (e.g., sequences of 32K tokens), presupposing substantial GPU expenditures. To address the aforementioned issues, we introduce a novel solution named Efficient and Extreme length extension for Large Language Models (E2-LLM). E2-LLM entails a singular training process over considerably short sequences (e.g., 4K tokens), which greatly mitigates the cost of continual-pretraining or fine-tuning. Within the training phase, we incorporate a dual augmentation strategy with Rotary Position Embeddings (RoPE) that adjusts the scale and position indices across distinct training samples. E2-LLM is meticulously designed to enhance the model’s robustness to diverse relative positions. The experimental results on multiple benchmark datasets demonstrate the superior performance of E2-LLM on demanding tasks of processing long contexts.
- 2024.findings-acl.252
+ 2024.findings-acl.252
liu-etal-2024-e2
10.18653/v1/2024.findings-acl.252
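The "scale and position indices" adjustment named in the abstract can be illustrated with a standard rotary-embedding sketch. The snippet below shows position-interpolation-style RoPE index scaling in plain NumPy; the function names and the choice of scale are assumptions for illustration, not the E2-LLM implementation.

```python
import numpy as np

def rope_angles(positions, dim, base=10000.0, scale=1.0):
    """RoPE angles with a scaled position index. Dividing positions by
    `scale` (e.g. 4.0 to stretch a 4K-trained model toward 16K contexts)
    is the interpolation-style adjustment that E2-LLM randomizes across
    training samples (a sketch, not the released code)."""
    inv_freq = 1.0 / (base ** (np.arange(0, dim, 2) / dim))   # (dim/2,)
    return np.outer(positions / scale, inv_freq)              # (seq, dim/2)

def apply_rope(x, scale=1.0):
    """Rotate consecutive feature pairs of x (seq, dim) by their angles."""
    seq, dim = x.shape
    theta = rope_angles(np.arange(seq), dim, scale=scale)
    cos, sin = np.cos(theta), np.sin(theta)
    x1, x2 = x[:, 0::2], x[:, 1::2]
    out = np.empty_like(x)
    out[:, 0::2] = x1 * cos - x2 * sin
    out[:, 1::2] = x1 * sin + x2 * cos
    return out

q = np.random.default_rng(0).normal(size=(8, 16))  # toy query states
print(apply_rope(q, scale=4.0).shape)
```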
@@ -9536,7 +9536,7 @@
Adina Williams, FAIR (Meta Platforms Inc.)
4254-4274
People tend to use language to mention surprising properties of events: for example, when a banana is blue, we are more likely to mention color than when it is yellow. This fact is taken to suggest that yellowness is somehow a typical feature of bananas, and blueness is exceptional. Similar to how a yellow color is typical of bananas, there may also be genders that are typical of occupations. In this work, we explore this question using information-theoretic techniques coupled with corpus statistics. In two distinct large corpora, we do not find strong evidence that occupations and gender display the same patterns of mentioning as do bananas and color. Instead, we find that gender mentioning is correlated with femaleness of occupation in particular, suggesting perhaps that woman-dominated occupations are seen as somehow “more gendered” than male-dominated ones, and thereby encourage more gender mentioning overall.
- 2024.findings-acl.253
+ 2024.findings-acl.253
ju-etal-2024-female
10.18653/v1/2024.findings-acl.253
@@ -9557,7 +9557,7 @@
Qi Zhang
4275-4295
Large Language Models (LLMs) have shown potential in reasoning over structured environments, e.g., knowledge graphs and tables. Such tasks typically require multi-hop reasoning, i.e., matching natural language utterances with instances in the environment. Previous works adopt LLMs to incrementally build a reasoning path, where LLMs either invoke tools or pick up items by interacting with the environment step by step. We propose Reasoning-Path-Editing (Readi), a novel framework where LLMs can efficiently and faithfully reason over structured environments. In Readi, LLMs initially generate a reasoning path given a query, and edit the path only when necessary. We instantiate the path on structured environments and provide feedback to edit the path if anything goes wrong. Experimental results on three KGQA and two TableQA datasets show the effectiveness of Readi, significantly surpassing previous LLM-based methods (by 9.1% Hit@1 on WebQSP, 12.4% on MQA-3H and 9.5% on WTQ), comparable with state-of-the-art fine-tuned methods (67% on CWQ and 74.7% on WebQSP) and substantially boosting the vanilla LLMs (by 14.9% on CWQ). Our code will be available on https://aka.ms/readi.
- 2024.findings-acl.254
+ 2024.findings-acl.254
cheng-etal-2024-call
10.18653/v1/2024.findings-acl.254
@@ -9572,7 +9572,7 @@
Arnab Bhattacharya, IIT Kanpur
4296-4315
In the era of Large Language Models (LLMs), predicting judicial outcomes poses significant challenges due to the complexity of legal proceedings and the scarcity of expert-annotated datasets. Addressing this, we introduce Prediction with Explanation (PredEx), the largest expert-annotated dataset for legal judgment prediction and explanation in the Indian context, featuring over 15,000 annotations. This groundbreaking corpus significantly enhances the training and evaluation of AI models in legal analysis, with innovations including the application of instruction tuning to LLMs. This method has markedly improved the predictive accuracy and explanatory depth of these models for legal judgments. We employed various transformer-based models, tailored for both general and Indian legal contexts. Through rigorous lexical, semantic, and expert assessments, our models effectively leverage PredEx to provide precise predictions and meaningful explanations, establishing it as a valuable benchmark for both the legal profession and the NLP community.
- 2024.findings-acl.255
+ 2024.findings-acl.255
nigam-etal-2024-legal
10.18653/v1/2024.findings-acl.255
@@ -9585,7 +9585,7 @@
Muhan Zhang, Peking University
4316-4335
Knowledge graph reasoning is an important problem for knowledge graphs. In this paper, we propose a novel and principled framework called RulE (which stands for Rule Embedding) to effectively leverage logical rules to enhance KG reasoning. Unlike knowledge graph embedding methods, RulE learns rule embeddings from existing triplets and first-order rules by jointly representing entities, relations and logical rules in a unified embedding space. Based on the learned rule embeddings, a confidence score can be calculated for each rule, reflecting its consistency with the observed triplets. This allows us to perform logical rule inference in a soft way, thus alleviating the brittleness of logic. On the other hand, RulE injects prior logical rule information into the embedding space, enriching and regularizing the entity/relation embeddings. This also makes KGE alone perform better. RulE is conceptually simple and empirically effective. We conduct extensive experiments to verify each component of RulE. Results on multiple benchmarks reveal that our model outperforms the majority of existing embedding-based and rule-based approaches.
- 2024.findings-acl.256
+ 2024.findings-acl.256
tang-etal-2024-rule
10.18653/v1/2024.findings-acl.256
@@ -9597,7 +9597,7 @@
Tianyi Zhou, University of Maryland, College Park
4336-4347
Large language models (LLMs), despite their breakthroughs on many challenging benchmark tasks, prefer to generate verbose responses and lack the controllability of output complexity, which is usually preferred by human users in practice. In this paper, we study how to precisely control multiple linguistic complexities of LLM output by finetuning using off-the-shelf data. To this end, we propose multi-control tuning (MCTune), which includes multiple linguistic complexity values of ground-truth responses as controls in the input for instruction tuning. We finetune LLaMA2-7B on the Alpaca-GPT4 and WizardLM datasets. Evaluations on widely used benchmarks demonstrate that our method not only improves LLMs’ multi-complexity controllability substantially but also retains or even enhances the quality of the responses as a side benefit.
- 2024.findings-acl.257
+ 2024.findings-acl.257
nguyen-etal-2024-multi
10.18653/v1/2024.findings-acl.257
@@ -9610,7 +9610,7 @@
Jingbo Shang, University of California, San Diego
4348-4362
Controlling the attribute intensity of text generation is crucial across scenarios (e.g., writing conciseness, chatting emotion, and explanation clarity). The remarkable capabilities of large language models (LLMs) have revolutionized text generation, prompting us to explore such smooth control of LLM generation. Specifically, we propose metrics to assess the range, calibration, and consistency of the generated text’s attribute intensity in response to varying control values, as well as its relevance to the intended context. To quantify the attribute intensity and context relevance, we leverage an Elo rating system and GPT4, respectively, both renowned for their robust alignment with human judgment. We look into two viable training-free methods for achieving smooth control of LLMs: (1) Prompting with semantic shifters, and (2) Modifying internal model representations. The evaluations of these two methods are conducted on 5 different attributes with various models.
- 2024.findings-acl.258
+ 2024.findings-acl.258
zhou-etal-2024-evaluating
10.18653/v1/2024.findings-acl.258
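Since the paper quantifies attribute intensity with an Elo rating system, a minimal sketch of the standard Elo update may help; the K-factor, initial ratings, and judged outcomes below are hypothetical.

```python
def elo_update(r_a, r_b, a_wins, k=32.0):
    """One standard Elo update. `a_wins` is 1.0 if text A was judged to show
    the stronger attribute intensity, 0.0 if B was, and 0.5 for a tie."""
    expected_a = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
    r_a_new = r_a + k * (a_wins - expected_a)
    r_b_new = r_b + k * ((1.0 - a_wins) - (1.0 - expected_a))
    return r_a_new, r_b_new

# rate two generations through repeated pairwise judgments (hypothetical):
ratings = {"gen_low": 1000.0, "gen_high": 1000.0}
for _ in range(10):  # ten comparisons, all preferring gen_high
    ratings["gen_high"], ratings["gen_low"] = elo_update(
        ratings["gen_high"], ratings["gen_low"], a_wins=1.0
    )
print(ratings)  # gen_high's rating rises, gen_low's falls symmetrically
```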
@@ -9632,7 +9632,7 @@
Qun Liu, Huawei Noah’s Ark Lab
4363-4400
The recent trend of using Large Language Models (LLMs) as tool agents in real-world applications underscores the necessity for comprehensive evaluations of their capabilities, particularly in complex scenarios involving planning, creating, and using tools. However, existing benchmarks typically focus on simple synthesized queries that do not reflect real-world complexity, thereby offering limited perspectives in evaluating tool utilization. To address this issue, we present UltraTool, a novel benchmark designed to improve and evaluate LLMs’ ability in tool utilization within real-world scenarios. UltraTool focuses on the entire process of using tools - from planning and creating to applying them in complex tasks. It emphasizes real-world complexities, demanding accurate, multi-step planning for effective problem-solving. A key feature of UltraTool is its independent evaluation of planning with natural language, which happens before tool usage and simplifies the task solving by mapping out the intermediate steps. Thus, unlike previous work, it eliminates the restriction of a pre-defined toolset. Through extensive experiments on various LLMs, we offer novel insights into the evaluation of capabilities of LLMs in tool utilization, thereby contributing a fresh perspective to this rapidly evolving field. The benchmark is publicly available at https://github.com/JoeYing1019/UltraTool.
- 2024.findings-acl.259
+ 2024.findings-acl.259
huang-etal-2024-planning-creation
10.18653/v1/2024.findings-acl.259
@@ -9644,7 +9644,7 @@
Chris Kedzie, Rasa Technologies, Inc.
4401-4420
We design probes trained on the internal representations of a transformer language model to predict its hallucinatory behavior on three grounded generation tasks. To train the probes, we annotate for span-level hallucination on both sampled (organic) and manually edited (synthetic) reference outputs. Our probes are narrowly trained and we find that they are sensitive to their training domain: they generalize poorly from one task to another or from synthetic to organic hallucinations. However, on in-domain data, they can reliably detect hallucinations at many transformer layers, achieving 95% of their peak performance as early as layer 4. Here, probing proves accurate for evaluating hallucination, outperforming several contemporary baselines and even surpassing an expert human annotator in response-level detection F1. Similarly, on span-level labeling, probes are on par or better than the expert annotator on two out of three generation tasks. Overall, we find that probing is a feasible and efficient alternative to language model hallucination evaluation when model states are available.
- 2024.findings-acl.260
+ 2024.findings-acl.260
ch-wang-etal-2024-androids
10.18653/v1/2024.findings-acl.260
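A minimal version of the probing setup described above can be sketched with a linear classifier over hidden states. The snippet uses random arrays as stand-ins for real transformer activations and span-level hallucination labels, so it shows the mechanics only, not the paper's probes or results.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# hypothetical data: hidden states from one layer (e.g. layer 4), one vector
# per response span, labeled 1 if the span was annotated as hallucinated
rng = np.random.default_rng(0)
hidden = rng.normal(size=(500, 768))       # stand-in for real model states
labels = rng.integers(0, 2, size=500)      # stand-in for span annotations

X_tr, X_te, y_tr, y_te = train_test_split(hidden, labels, random_state=0)
probe = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print("probe accuracy:", probe.score(X_te, y_te))  # ~0.5 on random data
```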
@@ -9658,7 +9658,7 @@
Yue Yu, National University of Defense Technology and PengCheng Lab
4421-4434
Prior research endeavors on ensembling Large Language Models (LLMs) achieved great success by employing an individual language model (LM) ranker before text generation. However, the use of an individual LM ranker faces two primary challenges: (1) The time-intensive nature of the ranking process, stemming from the comparisons between models; (2) The issue of error propagation arising from the separate ranking and generation models within the framework. In order to overcome these challenges, we propose a novel ensemble framework, namely Unified Ranking and Generation (URG). URG represents an end-to-end framework that jointly ranks the outputs of LLMs and generates fine-grained fusion results, via utilizing a dedicated cross-attention-based module and noise mitigation training against irrelevant information stemming from bad ranking results. Through extensive experimentation and evaluation, we demonstrate the efficiency and effectiveness of our framework in both the ranking and generation tasks. With the close coordination of the ranking and generation modules, our end-to-end framework achieves state-of-the-art (SOTA) performance on these tasks, and exhibits substantial enhancements over any of the ensembled models.
- 2024.findings-acl.261
+ 2024.findings-acl.261
lv-etal-2024-urg
10.18653/v1/2024.findings-acl.261
@@ -9674,7 +9674,7 @@
Ivan Bulyko, Amazon
4435-4446
Retrieval is a widely adopted approach for improving language models by leveraging external information. As the field moves towards multi-modal large language models, it is important to extend the pure text-based methods to incorporate other modalities in retrieval as well, for applications across the wide spectrum of machine learning tasks and data types. In this work, we propose multi-modal retrieval with two approaches: kNN-LM and cross-attention techniques. We demonstrate the effectiveness of our retrieval approaches empirically by applying them to automatic speech recognition tasks with access to external information. Under this setting, we show that speech-based multi-modal retrieval outperforms text-based retrieval, and yields up to improvement in word error rate over the multi-modal language model baseline. Furthermore, we achieve state-of-the-art recognition results on the Spoken-Squad question answering dataset.
- 2024.findings-acl.262
+ 2024.findings-acl.262
gourav-etal-2024-multi
10.18653/v1/2024.findings-acl.262
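One of the two retrieval approaches named above, kNN-LM, has a well-known interpolation rule: mix the base LM's next-token distribution with a distribution induced by retrieved neighbors. A hedged sketch follows; the datastore distances and neighbor tokens are invented for the example, not taken from the paper's speech datastore.

```python
import numpy as np

def knn_lm_interpolate(p_lm, knn_dists, knn_next_ids, vocab_size,
                       lam=0.25, temp=1.0):
    """kNN-LM mixture: p = lam * p_knn + (1 - lam) * p_lm.
    `knn_dists`/`knn_next_ids` would come from a datastore of
    (context key, next token) pairs; here they are hypothetical."""
    logits = -np.asarray(knn_dists, dtype=float) / temp
    weights = np.exp(logits - logits.max())   # softmax over neighbors
    weights /= weights.sum()
    p_knn = np.zeros(vocab_size)
    for w, tok in zip(weights, knn_next_ids):
        p_knn[tok] += w                        # mass on neighbor next-tokens
    return lam * p_knn + (1.0 - lam) * np.asarray(p_lm, dtype=float)

p_lm = np.full(10, 0.1)  # toy uniform base-LM distribution over 10 tokens
mix = knn_lm_interpolate(p_lm, knn_dists=[0.5, 1.0],
                         knn_next_ids=[3, 7], vocab_size=10)
print(mix.round(3), mix.sum().round(3))  # still sums to 1
```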
@@ -9689,7 +9689,7 @@
Fei Wu, Zhejiang University
4447-4462
Low-Rank Adaptation (LoRA) provides an effective yet efficient solution for fine-tuning large language models (LLMs). The modular and plug-and-play nature of LoRA enables the integration of diverse domain-specific LoRAs to enhance the capabilities of LLMs. Previous research on exploiting multiple LoRAs either focuses on specific isolated downstream tasks or fixes the selection of LoRAs during training. However, in real-world scenarios, LLMs receive diverse prompts covering different tasks, and the pool of candidate LoRAs is often dynamically updated. To bridge this gap, we propose LoraRetriever, a retrieve-then-compose framework that adaptively retrieves and composes multiple LoRAs according to the input prompts. LoraRetriever contains three main components: firstly, identifying and retrieving LoRAs relevant to the given input; secondly, formulating strategies for effectively integrating the retrieved LoRAs; and thirdly, developing efficient batch inference to accommodate heterogeneous requests. Experimental results indicate that LoraRetriever consistently outperforms the baselines, highlighting its practical effectiveness and versatility. Our code is available at https://github.com/StyxXuan/LoraRetriever.
- 2024.findings-acl.263
+ 2024.findings-acl.263
zhao-etal-2024-loraretriever
10.18653/v1/2024.findings-acl.263
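The retrieve-then-compose idea can be sketched in a few lines: rank candidate LoRAs by similarity to the input prompt, then combine the selected adapters. The snippet below uses simple cosine retrieval and delta-weight averaging as one plausible composition strategy; all names, embeddings, and shapes are hypothetical, not the LoraRetriever code.

```python
import numpy as np

def retrieve_loras(prompt_emb, lora_embs, top_k=3):
    """Rank candidate LoRAs by cosine similarity between the prompt
    embedding and per-LoRA task embeddings (both hypothetical)."""
    p = prompt_emb / np.linalg.norm(prompt_emb)
    sims = {name: float(emb @ p / np.linalg.norm(emb))
            for name, emb in lora_embs.items()}
    return sorted(sims, key=sims.get, reverse=True)[:top_k]

def compose_loras(lora_weights, selected):
    """One simple composition strategy: average the delta weights (B @ A)
    of the retrieved LoRAs before applying them to the frozen base layer."""
    deltas = [lora_weights[n]["B"] @ lora_weights[n]["A"] for n in selected]
    return sum(deltas) / len(deltas)

rng = np.random.default_rng(0)
lora_embs = {f"lora_{i}": rng.normal(size=32) for i in range(5)}
lora_weights = {name: {"A": rng.normal(size=(4, 16)),
                       "B": rng.normal(size=(16, 4))}
                for name in lora_embs}
picked = retrieve_loras(rng.normal(size=32), lora_embs)
print(picked, compose_loras(lora_weights, picked).shape)  # (16, 16) delta
```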
@@ -9703,7 +9703,7 @@
Liang Zhao, Emory University
4463-4475
The deployment and application of Large Language Models (LLMs) is hindered by their memory inefficiency, computational demands, and the high costs of API inferences. Traditional distillation methods, which transfer the capabilities of LLMs to smaller models, often fail to determine whether the knowledge has been sufficiently transferred, potentially resulting in high costs or incomplete distillation. In this paper, we propose an Explanation-Guided LLMs Active Distillation (ELAD) framework that employs an active learning strategy to optimize the balance between annotation costs and model performance. To improve the efficiency of sample selection, we introduce an explanation-guided sample selection method that identifies samples that challenge its reasoning by exploiting uncertainties in reasoning explanation steps. Additionally, we present a customized LLM-annotated explanation revision technique where the teacher model detects and corrects flaws in the student model’s reasoning. Our experiments across various reasoning datasets demonstrate that our framework significantly enhances the efficiency of LLM knowledge distillation.
- 2024.findings-acl.264
+ 2024.findings-acl.264
zhang-etal-2024-elad
10.18653/v1/2024.findings-acl.264
@@ -9716,7 +9716,7 @@
Anne Lauscher, Universität Hamburg
4476-4494
Large language models (LLMs) need to serve everyone, including a global majority of non-English speakers. However, most LLMs today, and open LLMs in particular, are often intended for use in just English (e.g. Llama2, Mistral) or a small handful of high-resource languages (e.g. Mixtral, Qwen). Recent research shows that, despite limits in their intended use, people prompt LLMs in many different languages. Therefore, in this paper, we investigate the basic multilingual capabilities of state-of-the-art open LLMs beyond their intended use. For this purpose, we introduce MultiQ, a new silver standard benchmark for basic open-ended question answering with 27.4k test questions across a typologically diverse set of 137 languages. With MultiQ, we evaluate language fidelity, i.e. whether models respond in the prompted language, and question answering accuracy. All LLMs we test respond faithfully and/or accurately for at least some languages beyond their intended use. Most models are more accurate when they respond faithfully. However, differences across models are large, and there is a long tail of languages where models are neither accurate nor faithful. We explore differences in tokenization as a potential explanation for our findings, identifying possible correlations that warrant further investigation.
- 2024.findings-acl.265
+ 2024.findings-acl.265
holtermann-etal-2024-evaluating
10.18653/v1/2024.findings-acl.265
@@ -9728,7 +9728,7 @@
Marten Schijndel, Cornell University
4495-4504
Pretrained language model (PLM) hidden states are frequently employed as contextual word embeddings (CWE): high-dimensional representations that encode semantic information given linguistic context. Across many areas of computational linguistics research, similarity between CWEs is interpreted as semantic similarity. However, it remains unclear exactly what information is encoded in PLM hidden states. We investigate this practice by probing PLM representations using minimal orthographic noise. We expect that if CWEs primarily encode semantic information, a single character swap in the input word will not drastically affect the resulting representation, given sufficient linguistic context. Surprisingly, we find that CWEs generated by popular PLMs are highly sensitive to noise in input data, and that this sensitivity is related to subword tokenization: the fewer tokens used to represent a word at input, the more sensitive its corresponding CWE. This suggests that CWEs capture information unrelated to word-level meaning and can be manipulated through trivial modifications of input data. We conclude that these PLM-derived CWEs may not be reliable semantic proxies, and that caution is warranted when interpreting representational similarity.
- 2024.findings-acl.266
+ 2024.findings-acl.266
matthews-etal-2024-semantics
10.18653/v1/2024.findings-acl.266
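The probing recipe above (perturb a word with minimal orthographic noise, then compare embeddings) is easy to sketch. The snippet swaps two adjacent characters and measures cosine similarity; the `embed` function is a deterministic stand-in for a real PLM encoder, an assumption made so the example runs without model downloads.

```python
import numpy as np

def swap_one_char(word, i):
    """Minimal orthographic noise: swap two adjacent characters."""
    chars = list(word)
    chars[i], chars[i + 1] = chars[i + 1], chars[i]
    return "".join(chars)

def cosine(u, v):
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))

# `embed` stands in for a PLM returning the contextual embedding of the
# target word in a fixed sentence; a hashed bag-of-bytes plus a random
# projection just makes the sketch runnable (hypothetical, not a PLM).
rng = np.random.default_rng(0)
proj = rng.normal(size=(256, 64))

def embed(word):
    feats = np.zeros(256)
    for i, byte in enumerate(word.encode("utf-8")):
        feats[(byte + 7 * i) % 256] += 1.0
    return feats @ proj

clean, noisy = "banana", swap_one_char("banana", 2)
print(noisy, cosine(embed(clean), embed(noisy)))
```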
@@ -9748,7 +9748,7 @@
Jiliang Tang, Michigan State University
4505-4524
Retrieval-augmented generation (RAG) is a powerful technique to facilitate language model generation with proprietary and private data, where data privacy is a pivotal concern. Whereas extensive research has demonstrated the privacy risks of large language models (LLMs), the RAG technique could potentially reshape the inherent behaviors of LLM generation, posing new privacy issues that are currently under-explored. To this end, we conduct extensive empirical studies with novel attack methods, which demonstrate the vulnerability of RAG systems to leaking the private retrieval database. Despite the new risks brought by RAG on the retrieval data, we further discover that RAG can be used to mitigate the old risks, i.e., the leakage of the LLMs’ training data. In general, we reveal many new insights in this paper for privacy protection of retrieval-augmented LLMs, which could benefit both LLM and RAG system builders.
- 2024.findings-acl.267
+ 2024.findings-acl.267
zeng-etal-2024-good
10.18653/v1/2024.findings-acl.267
@@ -9764,7 +9764,7 @@
Hae Park, Amazon and Massachusetts Institute of Technology
4525-4536
Modeling empathy is a complex endeavor that is rooted in interpersonal and experiential dimensions of human interaction, and remains an open problem within AI. Existing empathy datasets fall short in capturing the richness of empathy responses, often being confined to in-lab or acted scenarios, lacking longitudinal data, and missing self-reported labels. We introduce a new multimodal dataset for empathy during personal experience sharing: the EmpathicStories++ dataset containing 53 hours of video, audio, and text data of 41 participants sharing vulnerable experiences and reading empathically resonant stories with an AI agent. EmpathicStories++ is the first longitudinal dataset on empathy, collected over a month-long deployment of social robots in participants’ homes, as participants engage in natural, empathic storytelling interactions with AI agents. We then introduce a novel task of predicting individuals’ empathy toward others’ stories based on their personal experiences, evaluated in two contexts: participants’ own personal shared story context and their reflections on stories they read. We benchmark this task using state-of-the-art models to pave the way for future improvements in contextualized and longitudinal empathy modeling. Our work provides a valuable resource for further research in developing empathetic AI systems and understanding the intricacies of human empathy within genuine, real-world settings.
- 2024.findings-acl.268
+ 2024.findings-acl.268
shen-etal-2024-empathicstories
10.18653/v1/2024.findings-acl.268
@@ -9777,7 +9777,7 @@
Reut Tsarfaty, Google and Bar-Ilan University, Technion
4537-4550
Syntactic parsing remains a critical tool for relation extraction and information extraction, especially in resource-scarce languages where LLMs are lacking. Yet in morphologically rich languages (MRLs), where parsers need to identify multiple lexical units in each token, existing systems suffer in latency and setup complexity. Some use a pipeline to peel away the layers: first segmentation, then morphology tagging, and then syntax parsing; however, errors in earlier layers are then propagated forward. Others use a joint architecture to evaluate all permutations at once; while this improves accuracy, it is notoriously slow. In contrast, and taking Hebrew as a test case, we present a new “flipped pipeline”: decisions are made directly on the whole-token units by expert classifiers, each one dedicated to one specific task. The classifier predictions are independent of one another, and only at the end do we synthesize their predictions. This blazingly fast approach requires only a single huggingface call, without the need for recourse to lexicons or linguistic resources. When trained on the same training set used in previous studies, our model achieves near-SOTA performance on a wide array of Hebrew NLP tasks. Furthermore, when trained on a newly enlarged training corpus, our model achieves a new SOTA for Hebrew POS tagging and dependency parsing. We release this new SOTA model to the community. Because our architecture does not rely on any language-specific resources, it can serve as a model to develop similar parsers for other MRLs.
- 2024.findings-acl.269
+ 2024.findings-acl.269
shmidman-etal-2024-mrl
10.18653/v1/2024.findings-acl.269
@@ -9789,7 +9789,7 @@
Mennatallah El-Assady, Department of Computer Science, ETHZ - ETH Zurich
4551-4566
To harness the power of large language models in safety-critical domains, we need to ensure the explainability of their predictions. However, despite the significant attention to model interpretability, there remains an unexplored domain in explaining sequence-to-sequence tasks using methods tailored for textual data. This paper introduces *SyntaxShap*, a local, model-agnostic explainability method for text generation that takes into consideration the syntax in the text data. The presented work extends Shapley values to account for parsing-based syntactic dependencies. Taking a game-theoretic approach, SyntaxShap only considers coalitions constrained by the dependency tree. We adopt a model-based evaluation to compare SyntaxShap and its weighted form to state-of-the-art explainability methods adapted to text generation tasks, using diverse metrics including faithfulness, coherency, and semantic alignment of the explanations to the model. We show that our syntax-aware method produces more faithful and coherent explanations for predictions by autoregressive models. Confronted with the misalignment of human and AI model reasoning, this paper also highlights the need for cautious evaluation strategies in explainable AI.
- 2024.findings-acl.270
+ 2024.findings-acl.270
amara-etal-2024-syntaxshap
10.18653/v1/2024.findings-acl.270
@@ -9803,7 +9803,7 @@
Shomir Wilson, Pennsylvania State University
4567-4574
Privacy policies are crucial for informing users about data practices, yet their length and complexity often deter users from reading them. In this paper, we propose an automated approach to identify and visualize data practices within privacy policies at different levels of detail. Leveraging crowd-sourced annotations from the ToS;DR platform, we experiment with various methods to match policy excerpts with predefined data practice descriptions. We further conduct a case study to evaluate our approach on a real-world policy, demonstrating its effectiveness in simplifying complex policies. Experiments show that our approach accurately matches data practice descriptions with policy excerpts, facilitating the presentation of simplified privacy information to users.
- 2024.findings-acl.271
+ 2024.findings-acl.271
srinath-etal-2024-automated
10.18653/v1/2024.findings-acl.271
@@ -9817,7 +9817,7 @@
Hao Yang, Visa Research
4575-4589
Knowledge Graph Embedding (KGE) is a powerful technique for predicting missing links in Knowledge Graphs (KGs) by learning the entities and relations. Hyperbolic space has emerged as a promising embedding space for KGs due to its ability to represent hierarchical data. Nevertheless, most existing hyperbolic KGE methods rely on tangent approximation and are not fully hyperbolic, resulting in distortions and inaccuracies. To overcome this limitation, we propose LorentzKG, a fully hyperbolic KGE method that represents entities as points in the Lorentz model and represents relations as the intrinsic transformation—the Lorentz transformations between entities. We demonstrate that the Lorentz transformation, which can be decomposed into Lorentz rotation/reflection and Lorentz boost, captures various types of relations including hierarchical structures. Experimental results show that our LorentzKG achieves state-of-the-art performance.
- 2024.findings-acl.272
+ 2024.findings-acl.272
fan-etal-2024-enhancing
10.18653/v1/2024.findings-acl.272
@@ -9829,7 +9829,7 @@
Bryan Plummer, Boston University
4590-4611
Mobile app user interfaces (UIs) are rich with action, text, structure, and image content that can be utilized to learn generic UI representations for tasks like automating user commands, summarizing content, and evaluating the accessibility of user interfaces. Prior work has learned strong visual representations with local or global captioning losses, but fails to retain both granularities. To combat this, we propose Textual Foresight, a novel pretraining objective for learning UI screen representations. Textual Foresight generates global text descriptions of future UI states given a current UI and local action taken. Our approach requires joint reasoning over elements and entire screens, resulting in improved UI features: on generation tasks, UI agents trained with Textual Foresight outperform state-of-the-art by 2% with 28x fewer images. We train with our newly constructed mobile app dataset, OpenApp, which results in the first public dataset for app UI representation learning. OpenApp enables new baselines, and we find Textual Foresight improves average task performance over them by 5.7% while having access to 2x less data.
- 2024.findings-acl.273
+ 2024.findings-acl.273
burns-etal-2024-tell
10.18653/v1/2024.findings-acl.273
@@ -9842,7 +9842,7 @@
Youcheng Sun, The University of Manchester
4612-4628
The proliferation of Conversational AI agents (CAAs) has emphasised the need to distinguish between human and machine-generated texts, with implications spanning digital forensics and cybersecurity. While prior research primarily focussed on distinguishing human from machine-generated text, our study takes a more refined approach by analysing different CAAs. We construct linguistic profiles for five CAAs, aiming to identify Uniquely Identifiable Linguistic Patterns (UILPs) for each model using authorship attribution techniques. Authorship attribution (AA) is the task of identifying the author of an unknown text from a pool of known authors. Our research seeks to answer crucial questions about the existence of UILPs in CAAs, the linguistic overlap between various text types generated by these models, and the feasibility of Authorship Attribution (AA) for CAAs based on UILPs. Promisingly, we are able to attribute CAAs based on their original texts with a weighted F1-score of 96.94%. Further, we are able to attribute CAAs according to their writing style (as specified by prompts), yielding a weighted F1-score of 95.84%, which sets the baseline for this task. By employing principal component analysis (PCA), we identify the top 100 most informative linguistic features for each CAA, achieving a weighted F1-score ranging from 86.04% to 97.93%, and an overall weighted F1-score of 93.86%.
- 2024.findings-acl.274
+ 2024.findings-acl.274
zahid-etal-2024-probing
10.18653/v1/2024.findings-acl.274
@@ -9853,7 +9853,7 @@
Fred Morstatter, University of Southern California and USC/ISI
4629-4651
Large Language Models (LLMs) are regularly being used to label data across many domains and for myriad tasks. By simply asking the LLM for an answer, or “prompting,” practitioners are able to use LLMs to quickly get a response for an arbitrary task. This prompting is done through a series of decisions by the practitioner, from simple wording of the prompt, to requesting the output in a certain data format, to jailbreaking in the case of prompts that address more sensitive topics. In this work, we ask: do variations in the way a prompt is constructed change the ultimate decision of the LLM? We answer this using a series of prompt variations across a variety of text classification tasks. We find that even the smallest of perturbations, such as adding a space at the end of a prompt, can cause the LLM to change its answer. Further, we find that requesting responses in XML and commonly used jailbreaks can have cataclysmic effects on the data labeled by LLMs.
- 2024.findings-acl.275
+ 2024.findings-acl.275
salinas-morstatter-2024-butterfly
10.18653/v1/2024.findings-acl.275
@@ -9866,7 +9866,7 @@
Wenpeng Yin, Pennsylvania State University
4652-4665
In recent years, few-shot and zero-shot learning, which learn to predict labels with limited annotated instances, have garnered significant attention. Traditional approaches often treat frequent-shot (freq-shot; labels with abundant instances), few-shot, and zero-shot learning as distinct challenges, optimizing systems for just one of these scenarios. Yet, in real-world settings, label occurrences vary greatly. Some of them might appear thousands of times, while others might only appear sporadically or not at all. For practical deployment, it is crucial that a system can adapt to any label occurrence. We introduce a novel classification challenge: **X-shot**, reflecting a real-world context where freq-shot, few-shot, and zero-shot labels co-occur without predefined limits. Here, **X** can span from 0 to positive infinity. The crux of **X-shot** centers on open-domain generalization and devising a system versatile enough to manage various label scenarios. To solve **X-shot**, we propose **BinBin** (**B**inary **IN**ference **B**ased on **IN**struction following) that leverages the Indirect Supervision from a large collection of NLP tasks via instruction following, bolstered by Weak Supervision provided by large language models. **BinBin** surpasses previous state-of-the-art techniques on three benchmark datasets across multiple domains. To our knowledge, this is the first work addressing **X-shot** learning, where **X** remains variable.
- 2024.findings-acl.276
+ 2024.findings-acl.276
xu-etal-2024-x
10.18653/v1/2024.findings-acl.276
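The binary-inference reformulation that BinBin builds on can be sketched as a data transformation: each candidate label becomes one Yes/No instruction-following instance. The field names and prompt wording below are hypothetical, not the paper's templates.

```python
def to_binary_instances(text, candidate_labels, gold_labels):
    """Recast an X-shot classification example as binary
    instruction-following queries, one per candidate label.
    (The real system adds richer task instructions and definitions.)"""
    instances = []
    for label in candidate_labels:
        instances.append({
            "instruction": f'Does the label "{label}" apply to the '
                           "following text? Answer Yes or No.",
            "input": text,
            "output": "Yes" if label in gold_labels else "No",
        })
    return instances

# toy usage: one sentiment example expands into three binary instances
for ex in to_binary_instances(
    "The movie was a delightful surprise.",
    candidate_labels=["positive", "negative", "neutral"],
    gold_labels=["positive"],
):
    print(ex["instruction"], "->", ex["output"])
```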
@@ -9881,7 +9881,7 @@
Ashton Anderson, Department of Computer Science, University of Toronto
4666-4682
Among the many tasks that Large Language Models (LLMs) have revolutionized is text classification. Current text classification paradigms, however, rely solely on the output of the final layer in the LLM, with the rich information contained in internal neurons largely untapped. In this study, we present SPIN: a model-agnostic framework that sparsifies and integrates internal neurons of intermediate layers of LLMs for text classification. Specifically, SPIN sparsifies internal neurons by linear probing-based salient neuron selection layer by layer, avoiding noise from unrelated neurons and ensuring efficiency. The cross-layer salient neurons are then integrated to serve as multi-layered features for the classification head. Extensive experimental results show our proposed SPIN significantly improves text classification accuracy, efficiency, and interpretability.
- 2024.findings-acl.277
+ 2024.findings-acl.277
jiao-etal-2024-spin
10.18653/v1/2024.findings-acl.277
@@ -9893,7 +9893,7 @@
Shohei Hidaka, Japan Advanced Institute of Science and Technology, Tokyo Institute of Technology
4683-4700
This study addresses the interpretability of word representations through an investigation of a count-based co-occurrence matrix. Employing the mathematical methodology of Formal Concept Analysis, we reveal an underlying structure that is amenable to human interpretation. Furthermore, we unveil the emergence of hierarchical and geometrical structures within word vectors as consequences of word usage. Our experiments on the PPMI matrix demonstrate that the formal concepts that we identified align with interpretable categories, as shown in the category completion task.
- 2024.findings-acl.278
+ 2024.findings-acl.278
maeda-etal-2024-decomposing
10.18653/v1/2024.findings-acl.278
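Since the experiments operate on a PPMI matrix, a short reference implementation of positive pointwise mutual information may be useful. The toy count matrix is invented, and this sketch does not include the paper's Formal Concept Analysis step.

```python
import numpy as np

def ppmi(counts):
    """Positive pointwise mutual information of a word-context count matrix:
    PPMI[i, j] = max(0, log(p(i, j) / (p(i) * p(j))))."""
    counts = np.asarray(counts, dtype=float)
    p_ij = counts / counts.sum()
    p_i = p_ij.sum(axis=1, keepdims=True)      # word marginals
    p_j = p_ij.sum(axis=0, keepdims=True)      # context marginals
    with np.errstate(divide="ignore"):
        pmi = np.log(p_ij / (p_i * p_j))       # log(0) -> -inf at zero counts
    return np.maximum(pmi, 0.0)                # clamp negatives and -inf to 0

counts = np.array([[10, 0, 2],
                   [ 0, 5, 1],
                   [ 3, 1, 8]])                # toy co-occurrence counts
print(ppmi(counts).round(2))
```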
@@ -9905,7 +9905,7 @@
Slobodan Vucetic, Temple University
4701-4714
Radiology reports are highly technical documents aimed primarily at doctor-doctor communication. There has been an increasing interest in sharing those reports with patients, necessitating the provision of patient-friendly simplifications of the original reports. This study explores the suitability of large language models in automatically generating those simplifications. We examine the usefulness of chain-of-thought and self-correction prompting mechanisms in this domain. We also propose a new evaluation protocol that employs radiologists and laypeople, where radiologists verify the factual correctness of simplifications, and laypeople assess simplicity and comprehension. Our experimental results demonstrate the effectiveness of self-correction prompting in producing high-quality simplifications. Our findings illuminate the preferences of radiologists and laypeople regarding text simplification, informing future research on this topic.
- 2024.findings-acl.279
+ 2024.findings-acl.279
yang-etal-2024-two
10.18653/v1/2024.findings-acl.279
@@ -9915,7 +9915,7 @@
Yu Zhang, Harbin Institute of Technology
4715-4729
In the field of education, for better assessment of students’ abilities, generated questions often need to meet experts’ requirements, indicating the need for controllable question generation (CQG). However, current CQG methods mainly focus on difficulty control, neglecting the control of question content and assessed abilities, which are also crucial in educational QG. In this paper, we propose an LLM-guided method PFQS (for Planning First, Question Second), which utilizes Llama 2 to generate an answer plan and then generates questions based on it. The plan not only includes candidate answers but also integrates LLM’s understanding and multiple requirements, which make question generation simple and controllable. We evaluate our approach on the FairytaleQA dataset, a well-structured QA dataset derived from child-friendly storybooks. In the dataset, the attribute label represents content control, while the local_or_sum and ex_or_im labels denote difficulty control. Experimental results demonstrate that our approach outperforms previous state-of-the-art results and achieves better consistency with requirements compared to prompt-based methods. Further application of our method to Llama 2 and Mistral also leads to improved requirement consistency in a zero-shot setting.
- 2024.findings-acl.280
+ 2024.findings-acl.280
li-zhang-2024-planning
10.18653/v1/2024.findings-acl.280
@@ -9931,7 +9931,7 @@
Tianyu Du, Zhejiang University
4730-4749
Large language models (LLMs) demonstrate exceptional performance in numerous tasks but still heavily rely on knowledge stored in their parameters. Moreover, updating this knowledge incurs high training costs. Retrieval-augmented generation (RAG) methods address this issue by integrating external knowledge. The model can answer questions it couldn’t previously by retrieving knowledge relevant to the query. This approach improves performance in certain scenarios for specific tasks. However, if irrelevant texts are retrieved, it may impair model performance. In this paper, we propose Retrieval Augmented Iterative Self-Feedback (RA-ISF), a framework that iteratively decomposes tasks and processes them in three submodules to enhance the model’s problem-solving capabilities. Experiments show that our method outperforms existing benchmarks, performing well on models like GPT-3.5 and Llama2, significantly enhancing factual reasoning capabilities and reducing hallucinations.
- 2024.findings-acl.281
+ 2024.findings-acl.281
liu-etal-2024-ra
10.18653/v1/2024.findings-acl.281
@@ -9942,7 +9942,7 @@
Ekapol Chuangsuwanich, Chulalongkorn University
4750-4762
Large Language Models (LLMs) often struggle with hallucinations and outdated information. To address this, Information Retrieval (IR) systems can be employed to augment LLMs with up-to-date knowledge. However, existing IR techniques contain deficiencies, posing a performance bottleneck. Given the extensive array of IR systems, combining diverse approaches presents a viable strategy. Nevertheless, prior attempts have yielded restricted efficacy. In this work, we propose an approach that leverages learning-to-rank techniques to combine heterogeneous IR systems. We demonstrate the method on two Retrieval Question Answering (ReQA) tasks. Our empirical findings exhibit a significant performance enhancement, outperforming previous approaches and achieving state-of-the-art results on ReQA SQuAD.
- 2024.findings-acl.282
+ 2024.findings-acl.282
khamnuansin-etal-2024-mrrank
10.18653/v1/2024.findings-acl.282
@@ -9956,7 +9956,7 @@
Zhendong Mao, University of Science and Technology of China
4763-4776
Complex KBQA leverages the knowledge base (KB) to answer complex natural questions involving complicated semantics like multi-hop reasoning. Existing methods involve a question decomposition process, i.e., breaking a complex question into several simpler sub-questions, to assist in obtaining logical forms for querying the KB. However, the existing question decomposition process derives all sub-questions directly from the original question, resulting in limitations when one sub-question relies on the answer from a previous one. In this work, we propose Chain-of-Question, a progressive question decomposition approach to address complex KBQA challenges. First, inspired by chain-of-thought, we design a prompt to guide LLM to sequentially decompose multiple semantically clear sub-questions and provide corresponding reference answers, where each step of the decomposition relies on the previous results. Next, we utilize the decomposition result to select relevant patterns (relation-entity pairs) as accurate and faithful auxiliary information for the following logical form generation. Finally, we jointly perform logical form generation and answer prediction, utilizing the predicted answer to supplement non-executable logical forms. Experimental results demonstrate that our method achieves state-of-the-art performance on multiple datasets.
- 2024.findings-acl.283
+ 2024.findings-acl.283
yixing-etal-2024-chain
10.18653/v1/2024.findings-acl.283
@@ -9968,7 +9968,7 @@
Xuejie Zhang, Yunnan University
4777-4788
Aspect-based sentiment analysis (ABSA) identifies sentiment information related to specific aspects and provides deeper market insights to businesses and organizations. With the emergence of large language models (LMs), recent studies have proposed using fixed examples for instruction tuning to reformulate ABSA as a generation task. However, the performance is sensitive to the selection of in-context examples; several retrieval methods are based on surface similarity and are independent of the LM generative objective. This study proposes an instruction learning method with retrieval-based example ranking for ABSA tasks. For each target sample, an LM was applied as a scorer to estimate the likelihood of the output given the input and a candidate example as the prompt, and training examples were labeled as positive or negative by ranking the scores. An alternating training schema is proposed to train both the retriever and LM. Instructional prompts can be constructed using high-quality examples. The LM is used for both scoring and inference, improving the generation efficiency without incurring additional computational costs or training difficulties. Extensive experiments on three ABSA subtasks verified the effectiveness of the proposed method, demonstrating its superiority over various strong baseline models. Code and data are released at https://github.com/zgMin/IT-RER-ABSA.
- 2024.findings-acl.284
+ 2024.findings-acl.284
zheng-etal-2024-instruction
10.18653/v1/2024.findings-acl.284
@@ -9980,7 +9980,7 @@
Xuanjing Huang, Fudan University
4789-4809
Social media has emerged as a cornerstone of social movements, wielding significant influence in driving societal change. Simulating the response of the public and forecasting the potential impact have become increasingly important. However, existing methods for simulating such phenomena encounter challenges concerning their efficacy and efficiency in capturing the behaviors of social movement participants. In this paper, we introduce a hybrid framework for social media user simulation, wherein users are categorized into two types. Core users are driven by Large Language Models, while numerous ordinary users are modeled by deductive agent-based models. We further construct a Twitter-like environment to replicate their response dynamics following trigger events. Subsequently, we develop a multi-faceted benchmark SoMoSiMu-Bench for evaluation and conduct comprehensive experiments across real-world datasets. Experimental results demonstrate the effectiveness and flexibility of our method.
- 2024.findings-acl.285
+ 2024.findings-acl.285
mou-etal-2024-unveiling
10.18653/v1/2024.findings-acl.285
@@ -9993,7 +9993,7 @@
Takuya Ohko, IBM Research - Tokyo, International Business Machines
4810-4817
This paper exploits a sentiment extractor supported by syntactic and lexical resources to enhance multilingual sentiment classification solved through the generative approach, without retraining LLMs. By adding external information of words and phrases that have positive/negative polarities, the multilingual sentiment classification error was reduced by up to 33 points, and the combination of two approaches performed best especially in high-performing pairs of LLMs and languages.
- 2024.findings-acl.286
+ 2024.findings-acl.286
kanayama-etal-2024-incorporating
10.18653/v1/2024.findings-acl.286
@@ -10005,7 +10005,7 @@
Chang Xu, University of Sydney
4818-4832
Relational concepts are indeed foundational to the structure of knowledge representation, as they facilitate the association between various entity concepts, allowing us to express and comprehend complex world knowledge. By expressing relational concepts in natural language prompts, people can effortlessly interact with large language models (LLMs) and recall desired factual knowledge. However, the process of knowledge recall lacks interpretability, and representations of relational concepts within LLMs remain unknown to us. In this paper, we identify hidden states that can express entity and relational concepts through causal mediation analysis in fact recall processes. Our finding reveals that at the last token position of the input prompt, there are hidden states that solely express the causal effects of relational concepts. Based on this finding, we assume that these hidden states can be treated as relational representations and we can successfully extract them from LLMs. The experimental results demonstrate high credibility of the relational representations: they can be flexibly transplanted into other fact recall processes, and can also be used as robust entity connectors. Moreover, we also show that the relational representations exhibit significant potential for controllable fact recall through relation rewriting.
- 2024.findings-acl.287
+ 2024.findings-acl.287
wang-etal-2024-locating
10.18653/v1/2024.findings-acl.287
@@ -10019,7 +10019,7 @@
Weinan Zhang, Harbin Institute of Technology
4833-4850
Although Retrieval-Augmented Large Language Models (RALMs) demonstrate their superiority in terms of factuality, they do not consistently outperform the original retrieval-free Language Models (LMs). Our experiments reveal that this example-level performance inconsistency exists not only between retrieval-augmented and retrieval-free LM but also among different retrievers. To understand this phenomenon, we investigate the degeneration behavior of RALMs and theoretically decompose it into four categories. Further analysis based on our decomposition reveals that the innate difference in knowledge sources and the unpredictable degeneration of the reader model contribute most to the inconsistency. Drawing from our analysis, we introduce Ensemble of Retrievers (EoR), a trainable framework that can adaptively retrieve from different knowledge sources and effectively decrease unpredictable reader errors. Our experiments on Open Domain Question Answering show that EoR substantially improves performance over the RALM with a single retriever by considerably reducing inconsistent behaviors.
- 2024.findings-acl.288
+ 2024.findings-acl.288
li-etal-2024-unraveling
10.18653/v1/2024.findings-acl.288
@@ -10030,7 +10030,7 @@
Erik Cambria, Nanyang Technological University
4851-4863
The success of state-of-the-art Natural Language Processing (NLP) systems heavily depends on deep neural networks, which excel in various tasks through strong data fitting and latent feature modeling abilities. However, certain challenges linked to deep neural networks and supervised deep learning deserve consideration, e.g., extensive computing resources, knowledge forgetting, etc. Previous research attempted to tackle these challenges individually through unrelated techniques. Yet they do not instigate fundamental shifts in the learning paradigm. In this work, we propose a novel neurosymbolic method for sentiment analysis to tackle these issues. We also propose a novel sentiment-pragmatic knowledge base that places emphasis on human subjectivity within varying domain annotations. We conducted extensive experiments to show that our neurosymbolic framework for sentiment analysis stands out for its lightweight nature, robustness across domains and languages, efficient few-shot training, and rapid convergence.
- 2024.findings-acl.289
+ 2024.findings-acl.289
zhang-etal-2024-senticvec
10.18653/v1/2024.findings-acl.289
@@ -10047,7 +10047,7 @@
Jing Shao, Shanghai AI Laboratory
4864-4888
Ensuring the trustworthiness of large language models (LLMs) is crucial. Most studies concentrate on fully pre-trained LLMs to better understand and improve LLMs’ trustworthiness. In this paper, to reveal the untapped potential of pre-training, we pioneer the exploration of LLMs’ trustworthiness during this period, focusing on five key dimensions: reliability, privacy, toxicity, fairness, and robustness. To begin with, we apply linear probing to LLMs. The high probing accuracy suggests that LLMs in early pre-training can already distinguish concepts in each trustworthiness dimension. Therefore, to further uncover the hidden possibilities of pre-training, we extract steering vectors from an LLM’s pre-training checkpoints to enhance the LLM’s trustworthiness. Finally, inspired by the theoretical result that mutual information estimation is bounded by linear probing accuracy, we also probe LLMs with mutual information to investigate the dynamics of trustworthiness during pre-training. We are the first to observe a similar two-phase phenomenon: fitting and compression. This research provides an initial exploration of trustworthiness modeling during LLM pre-training, seeking to unveil new insights and spur further developments in the field.
- 2024.findings-acl.290
+ 2024.findings-acl.290
qian-etal-2024-towards
10.18653/v1/2024.findings-acl.290
@@ -10061,7 +10061,7 @@
Chang Zhou
4889-4901
In this paper, we begin by illustrating that, when presented with a query, Large Language Models (LLMs) capable of providing accurate responses tend to exhibit a more uniform probability distribution compared to their less proficient counterparts. Building upon this observation, we introduce a novel self-assessment criterion termed ProbDiff for evaluating the performance of diverse LLMs. This method eliminates the need for training an additional evaluation model or relying on external proprietary models such as GPT-4 as a judge. Instead, it solely relies on the LLMs under evaluation to compute the probability discrepancy between the original response generation and its revised versions. For two LLMs on the same query, a higher discrepancy suggests a relatively weaker ability. We discover that ProbDiff yields comparable results to mainstream GPT-4-based evaluations in various scenarios including NLG tasks like translation and summarization, as well as LLM evaluation benchmarks such as AlignBench, MT-Bench, and AlpacaEval, across LLMs of different sizes.
- 2024.findings-acl.291
+ 2024.findings-acl.291
xia-etal-2024-language
10.18653/v1/2024.findings-acl.291
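The ProbDiff entry above turns on a single computable quantity: the log-probability a model assigns to its own response versus a revised variant of it. Below is a minimal sketch of that computation, assuming a Hugging Face causal LM; the helper `response_logprob` and the toy prompts are ours, not the paper's code, and BPE boundary effects between query and response are ignored.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def response_logprob(model, tokenizer, query: str, response: str) -> float:
    """Mean log-probability the model assigns to `response` given `query`."""
    ids = tokenizer(query + response, return_tensors="pt")["input_ids"]
    n_query = tokenizer(query, return_tensors="pt")["input_ids"].shape[1]
    with torch.no_grad():
        logits = model(ids).logits
    logprobs = torch.log_softmax(logits[0, :-1], dim=-1)  # predictions for tokens 1..T-1
    targets = ids[0, 1:]
    per_token = logprobs[torch.arange(targets.shape[0]), targets]
    return per_token[n_query - 1:].mean().item()  # keep only the response positions

# ProbDiff-style self-assessment: a large drop from the original response to a
# revised variant is read as evidence of weaker ability on this query.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
query = "Q: What is 2+2? A:"
gap = response_logprob(model, tokenizer, query, " 4") - response_logprob(
    model, tokenizer, query, " five"
)
print(f"probability discrepancy: {gap:.3f}")
```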
@@ -10075,7 +10075,7 @@
Fangyuan Zhang
4902-4922
Deep neural networks exhibit vulnerability to word-level adversarial attacks in natural language processing. Most of these attack methods adopt synonymous substitutions to perturb original samples for crafting adversarial examples while attempting to maintain semantic consistency with the originals. Some of them claim that they could achieve over 90% attack success rate, thereby raising serious safety concerns. However, our investigation reveals that many purportedly successful adversarial examples are actually invalid due to significant changes in semantic meanings compared to their originals. Even when equipped with semantic constraints such as BERTScore, existing attack methods can generate up to 87.9% invalid adversarial examples. Building on this insight, we first curate a 13K dataset for adversarial validity evaluation with the help of GPT-4. Then, an open-source large language model is fine-tuned to offer an interpretable validity score for assessing the semantic consistency between original and adversarial examples. Finally, this validity score can serve as a guide for existing adversarial attack methods to generate valid adversarial examples. Comprehensive experiments demonstrate the effectiveness of our method in evaluating and refining the quality of adversarial examples.
- 2024.findings-acl.292
+ 2024.findings-acl.292
zhou-etal-2024-evaluating-validity
10.18653/v1/2024.findings-acl.292
@@ -10094,7 +10094,7 @@
Yuki Mitsufuji, Sony AI, Sony Group Corporation, Tokyo Institute of Technology, Tokyo Institute of Technology and Sony Group Corporation
4923-4940
Contrastive cross-modal models such as CLIP and CLAP aid various vision-language (VL) and audio-language (AL) tasks. However, there has been limited investigation of and improvement in their language encoder – the central component of encoding natural language descriptions of image/audio into vector representations. We extensively evaluate how unsupervised and supervised sentence embedding training affect language encoder quality and cross-modal task performance. In VL pretraining, we found that sentence embedding training enhances language encoder quality and aids in cross-modal tasks, improving contrastive VL models such as CyCLIP. Sentence embedding training benefits AL tasks when the amount of training data is large. We analyze the representation spaces to understand the strengths of sentence embedding training, and find that it improves text-space uniformity, at the cost of decreased cross-modal alignment.
- 2024.findings-acl.293
+ 2024.findings-acl.293
zhao-etal-2024-language
10.18653/v1/2024.findings-acl.293
@@ -10107,7 +10107,7 @@
He He, New York University
4941-4957
Language agents that interact with the world on their own have great potential for automating digital tasks. While large language model (LLM) agents have made progress in understanding and executing tasks such as textual games and webpage control, many real-world tasks also require collaboration with humans or other LLMs in equal roles, which involves intent understanding, task coordination, and communication. To test LLM’s ability to collaborate, we design a blocks-world environment, where two agents, each having unique goals and skills, build a target structure together. To complete the goals, they can act in the world and communicate in natural language. Under this environment, we design increasingly challenging settings to evaluate different collaboration perspectives, from independent to more complex, dependent tasks. We further adopt chain-of-thought prompts that include intermediate reasoning steps to model the partner’s state and identify and correct execution errors. Both human-machine and machine-machine experiments show that LLM agents have strong grounding capacities, and our approach significantly improves the evaluation metric.
- 2024.findings-acl.294
+ 2024.findings-acl.294
wu-etal-2024-co
10.18653/v1/2024.findings-acl.294
@@ -10122,7 +10122,7 @@
Longyue Wang
4958-4976
Large language models (LLMs) predominantly employ decoder-only transformer architectures, necessitating the retention of keys/values information for historical tokens to provide contextual information and avoid redundant computation. However, the substantial size and parameter volume of these LLMs require massive GPU memory. This memory demand increases with the length of the input text, leading to an urgent need for more efficient methods of information storage and processing. This study introduces Anchor-based LLMs (AnLLMs), which utilize an innovative anchor-based self-attention network (AnSAN) and also an anchor-based inference strategy. This approach enables LLMs to compress sequence information into an anchor token, reducing the keys/values cache and enhancing inference efficiency. Experiments on question-answering benchmarks reveal that AnLLMs maintain similar accuracy levels while achieving up to 99% keys/values cache reduction and up to 3.5 times faster inference. Despite a minor compromise in accuracy, the substantial enhancements of AnLLMs employing the AnSAN technique in resource utilization and computational efficiency underscore their potential for practical LLM applications.
- 2024.findings-acl.295
+ 2024.findings-acl.295
pang-etal-2024-anchor
10.18653/v1/2024.findings-acl.295
@@ -10142,7 +10142,7 @@
Yu Huang, Peking University
4977-4997
Medical visual question answering (MVQA) requires in-depth understanding of medical images and questions to provide reliable answers. We summarize multi-level progressive capabilities that models need to focus on in MVQA: recognition, details, diagnosis, knowledge, and reasoning. Existing MVQA models tend to ignore the above capabilities due to unspecific data and plain architecture. To address these issues, this paper proposes Multi-level Visual Language Model (MLeVLM) for MVQA. On the data side, we construct a high-quality multi-level instruction dataset MLe-VQA via GPT-4, which covers multi-level questions and answers as well as reasoning processes from visual clues to semantic cognition. On the architecture side, we propose a multi-level feature alignment module, including attention-based token selector and context merger, which can efficiently align features at different levels from visual to semantic. To better evaluate the model’s capabilities, we manually construct a multi-level MVQA evaluation benchmark named MLe-Bench. Extensive experiments demonstrate the effectiveness of our constructed multi-level instruction dataset and the multi-level feature alignment module. It also proves that MLeVLM outperforms existing medical multimodal large language models.
- 2024.findings-acl.296
+ 2024.findings-acl.296
xu-etal-2024-mlevlm
10.18653/v1/2024.findings-acl.296
@@ -10154,7 +10154,7 @@
Chelsea Finn, Stanford University and Google
4998-5017
Reinforcement Learning from Human Feedback (RLHF) has been a crucial component in the recent success of Large Language Models. However, RLHF is known to exploit biases in human preferences, such as verbosity. A well-formatted and eloquent answer is often more highly rated by users, even when it is less helpful and objective. A number of approaches have been developed to control those biases in the classical RLHF literature, but the problem remains relatively under-explored for Direct Alignment Algorithms such as Direct Preference Optimization (DPO). Unlike classical RLHF, DPO does not train a separate reward model or use reinforcement learning directly, so previous approaches developed to control verbosity cannot be directly applied to this setting. Our work makes several contributions. For the first time, we study the length problem in the DPO setting, showing significant exploitation in DPO and linking it to out-of-distribution bootstrapping. We then develop a principled but simple regularization strategy that prevents length exploitation, while still maintaining improvements in model quality. We demonstrate these effects across datasets on summarization and dialogue, where we achieve up to 20% improvement in win rates when controlling for length, despite the GPT-4 judge’s well-known verbosity bias.
- 2024.findings-acl.297
+ 2024.findings-acl.297
park-etal-2024-disentangling
10.18653/v1/2024.findings-acl.297
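The regularization strategy summarized above amounts to penalizing the length difference between the chosen and rejected responses inside the DPO objective. The sketch below is our hedged reading of that idea, not necessarily the paper's exact formulation; the coefficient `alpha` and the summed-log-prob inputs are assumptions.

```python
import torch
import torch.nn.functional as F

def length_regularized_dpo_loss(
    policy_logp_w: torch.Tensor,  # summed log-prob of chosen responses under the policy
    policy_logp_l: torch.Tensor,  # summed log-prob of rejected responses under the policy
    ref_logp_w: torch.Tensor,     # same quantities under the frozen reference model
    ref_logp_l: torch.Tensor,
    len_w: torch.Tensor,          # token lengths of chosen responses
    len_l: torch.Tensor,          # token lengths of rejected responses
    beta: float = 0.1,
    alpha: float = 0.01,          # strength of the length penalty (assumed value)
) -> torch.Tensor:
    # Standard DPO margin between the implicit rewards of chosen and rejected...
    margin = beta * ((policy_logp_w - ref_logp_w) - (policy_logp_l - ref_logp_l))
    # ...shrunk whenever the chosen response is longer, so verbosity alone cannot win.
    margin = margin - alpha * (len_w - len_l)
    return -F.logsigmoid(margin).mean()

# Toy usage: the long chosen response is penalized relative to the short rejected one.
loss = length_regularized_dpo_loss(
    torch.tensor([-12.0]), torch.tensor([-15.0]),
    torch.tensor([-13.0]), torch.tensor([-14.5]),
    torch.tensor([120.0]), torch.tensor([40.0]),
)
print(loss.item())
```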
@@ -10172,7 +10172,7 @@
Bozhong Tian
5018-5029
Multimodal knowledge editing represents a critical advancement in enhancing the capabilities of Multimodal Large Language Models (MLLMs). Despite its potential, current benchmarks predominantly focus on coarse-grained knowledge, leaving the intricacies of fine-grained (FG) multimodal entity knowledge largely unexplored. This gap presents a notable challenge, as FG entity recognition is pivotal for the practical deployment and effectiveness of MLLMs in diverse real-world scenarios. To bridge this gap, we introduce MIKE, a comprehensive benchmark and dataset specifically designed for FG multimodal entity knowledge editing. MIKE encompasses a suite of tasks tailored to assess different perspectives, including Vanilla Name Answering, Entity-Level Caption, and Complex-Scenario Recognition. In addition, a new form of knowledge editing, Multi-step Editing, is introduced to evaluate the editing efficiency. Through our extensive evaluations, we demonstrate that the current state-of-the-art methods face significant challenges in tackling our proposed benchmark, underscoring the complexity of FG knowledge editing in MLLMs. Our findings spotlight the urgent need for novel approaches in this domain, setting a clear agenda for future research and development efforts within the community.
- 2024.findings-acl.298
+ 2024.findings-acl.298
li-etal-2024-mike
10.18653/v1/2024.findings-acl.298
@@ -10186,7 +10186,7 @@
Sadao Kurohashi, Kyoto University
5030-5041
While large language models (LLMs) like GPT-4 have recently demonstrated astonishing zero-shot capabilities in general domain tasks, they often generate content with hallucinations in specific domains such as Chinese law, hindering their application in these areas. This is typically due to the absence of training data that encompasses such a specific domain, preventing GPT-4 from acquiring in-domain knowledge. A pressing challenge is that it’s not plausible to continue training LLMs of GPT-4’s scale on in-domain data. This paper introduces a simple yet effective domain adaptation framework for GPT-4 by reformulating generation as an adapt-retrieve-revise process. The initial step is to adapt an affordable 7B LLM to the Chinese legal domain by continuing to learn from in-domain data. When solving an in-domain task, we leverage the adapted LLM to generate a draft answer given a task query. Then, the draft answer will be used to retrieve supporting evidence candidates from an external in-domain knowledge base. Finally, the draft answer and retrieved evidence are concatenated into a whole prompt to let GPT-4 assess the evidence and revise the draft answer to generate the final answer. Our proposal combines the efficiency of adapting a smaller 7B model with the evidence-assessing capability of GPT-4 and effectively prevents GPT-4 from generating hallucinatory content. In the zero-shot setting of four Chinese legal tasks, our method improves the average score by +33.6 points, compared to GPT-4 direct generation. When compared to two stronger retrieval-based baselines, our method outperforms them by +17.0 and +23.5.
- 2024.findings-acl.299
+ 2024.findings-acl.299
wan-etal-2024-reformulating
10.18653/v1/2024.findings-acl.299
@@ -10198,7 +10198,7 @@
Tanmoy Chakraborty, Indian Institute of Technology, Delhi
5042-5078
Memes have evolved as a prevalent medium for diverse communication, ranging from humour to propaganda. With the rising popularity of image-focused content, there is a growing need to explore its potential harm from different aspects. Previous studies have analyzed memes in closed settings - detecting harm, applying semantic labels, and offering natural language explanations. To extend this research, we introduce MemeMQA, a multimodal question-answering framework aiming to solicit accurate responses to structured questions while providing coherent explanations. We curate MemeMQACorpus, a new dataset featuring 1,880 questions related to 1,122 memes with corresponding answer-explanation pairs. We further propose ARSENAL, a novel two-stage multimodal framework that leverages the reasoning capabilities of LLMs to address MemeMQA. We benchmark MemeMQA using competitive baselines and demonstrate its superiority - ~18% enhanced answer prediction accuracy and distinct text generation lead across various metrics measuring lexical and semantic alignment over the best baseline. We analyze ARSENAL’s robustness through diversification of question-set, confounder-based evaluation regarding MemeMQA’s generalizability, and modality-specific assessment, enhancing our understanding of meme interpretation in the multimodal communication landscape.
- 2024.findings-acl.300
+ 2024.findings-acl.300
agarwal-etal-2024-mememqa
10.18653/v1/2024.findings-acl.300
@@ -10214,7 +10214,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
5079-5101
Large language models have been widely adopted in natural language processing, yet they face the challenge of generating unreliable content. Recent works aim to reduce misinformation and hallucinations by resorting to attribution as a means to provide evidence (i.e., citations). However, current attribution methods usually focus on the retrieval stage and automatic evaluation, neglecting to mirror the citation mechanisms in human scholarly writing that bolster credibility. In this paper, we address these challenges by modelling the attribution task as preference learning and introducing an Automatic Preference Optimization (APO) framework. First, we create a curated collection for post-training with 6,330 examples by collecting and filtering from existing datasets. Second, considering the high cost of labelling preference data, we further propose an automatic method to synthesize attribution preference data resulting in 95,263 pairs. Moreover, inspired by the human citation process, we further propose a progressive preference optimization method by leveraging fine-grained information. Extensive experiments on three datasets (i.e., ASQA, StrategyQA, and ELI5) demonstrate that APO achieves state-of-the-art citation F1 with higher answer quality.
- 2024.findings-acl.301
+ 2024.findings-acl.301
li-etal-2024-improving-attributed
10.18653/v1/2024.findings-acl.301
@@ -10226,7 +10226,7 @@
SangKeun Lee, Korea University
5102-5119
The Korean writing system, Hangeul, has a unique character representation rigidly following the invention principles recorded in Hunminjeongeum. However, existing pre-trained language models (PLMs) for Korean have overlooked these principles. In this paper, we introduce a novel framework for Korean PLMs called KOMBO, which is the first to bring the invention principles of Hangeul into character representation. Our proposed method, KOMBO, exhibits notable experimental proficiency across diverse NLP tasks. In particular, our method outperforms the state-of-the-art Korean PLM by an average of 2.11% in five Korean natural language understanding tasks. Furthermore, extensive experiments demonstrate that our proposed method is suitable for comprehending the linguistic features of the Korean language. Consequently, we shed light on the superiority of using subcharacters over the typical subword-based approach for Korean PLMs. Our code is available at: https://github.com/SungHo3268/KOMBO.
- 2024.findings-acl.302
+ 2024.findings-acl.302
kim-etal-2024-kombo
10.18653/v1/2024.findings-acl.302
@@ -10238,7 +10238,7 @@
Yohei Oseki, University of Tokyo
5120-5134
Syntactic Language Models (SLMs) can be trained efficiently to reach relatively high performance; however, they have trouble with inference efficiency due to the explicit generation of syntactic structures. In this paper, we propose a new method dubbed tree-planting: instead of explicitly generating syntactic structures, we “plant” trees into attention weights of unidirectional Transformer LMs to implicitly reflect syntactic structures of natural language. Specifically, unidirectional Transformer LMs trained with tree-planting will be called Tree-Planted Transformers (TPT), which inherit the training efficiency from SLMs without changing the inference efficiency of their underlying Transformer LMs. Targeted syntactic evaluations on the SyntaxGym benchmark demonstrated that TPTs, despite the lack of explicit generation of syntactic structures, significantly outperformed not only vanilla Transformer LMs but also various SLMs that generate hundreds of syntactic structures in parallel. This result suggests that TPTs can learn human-like syntactic knowledge as data-efficiently as SLMs while maintaining the modeling space of Transformer LMs unchanged.
- 2024.findings-acl.303
+ 2024.findings-acl.303
yoshida-etal-2024-tree
10.18653/v1/2024.findings-acl.303
@@ -10253,7 +10253,7 @@
Yang Liu, Nanyang Technological University
5135-5147
With the development of LLMs, the security threats of LLMs are receiving more and more attention. Numerous jailbreak attacks have been proposed to assess the security defense of LLMs. Current jailbreak attacks primarily utilize scenario camouflage techniques. However, their explicit mention of malicious intent is easily recognized and defended against by LLMs. In this paper, we propose an indirect jailbreak attack approach, Puzzler, which can bypass the LLM’s defensive strategies and obtain malicious responses by implicitly providing LLMs with some clues about the original malicious query. In addition, inspired by the wisdom of “When unable to attack, defend” from Sun Tzu’s Art of War, we adopt a defensive stance to gather clues about the original malicious query through LLMs. The experimental results indicate that the Query Success Rate of Puzzler is 14.0%-82.7% higher than baselines on the most prominent LLMs. Furthermore, when tested against the state-of-the-art jailbreak detection approaches, Puzzler proves to be more effective at evading detection compared to baselines.
- 2024.findings-acl.304
+ 2024.findings-acl.304
chang-etal-2024-play
10.18653/v1/2024.findings-acl.304
@@ -10277,7 +10277,7 @@
Edward Choi, Korea Advanced Institute of Science and Technology
5148-5168
The development of large language models tailored for handling patients’ clinical notes is often hindered by the limited accessibility and usability of these notes due to strict privacy regulations. To address these challenges, we first create synthetic large-scale clinical notes using publicly available case reports extracted from biomedical literature. We then use these synthetic notes to train our specialized clinical large language model, Asclepius. While Asclepius is trained on synthetic data, we assess its potential performance in real-world applications by evaluating it using real clinical notes. We benchmark Asclepius against several other large language models, including GPT-3.5-turbo and other open-source alternatives. To further validate our approach using synthetic notes, we also compare Asclepius with its variants trained on real clinical notes. Our findings convincingly demonstrate that synthetic clinical notes can serve as viable substitutes for real ones when constructing high-performing clinical language models. This conclusion is supported by detailed evaluations conducted by both GPT-4 and medical professionals. All resources—including weights, codes, and data—used in the development of Asclepius will be made publicly accessible for future research.
- 2024.findings-acl.305
+ 2024.findings-acl.305
kweon-etal-2024-publicly
10.18653/v1/2024.findings-acl.305
@@ -10293,7 +10293,7 @@
Wei Han, Huawei Tech. Investment Co., Limited
5169-5181
Because of their quadratic complexity, Transformer-based Large Language Models (LLMs) often impose limitations on the length of the text input to ensure the generation of fluent and relevant responses. These constraints restrict their applicability in long text scenarios. In this paper, we propose a novel semantic compression method that enables generalization to texts that are 6-8 times longer without incurring significant computational costs or requiring fine-tuning. Our proposed framework draws inspiration from source coding in information theory and employs a pre-trained model to reduce the semantic redundancy of long inputs before passing them to the LLMs for downstream tasks. Experimental results demonstrate that our method effectively extends the context window of LLMs across a range of tasks including question answering, summarization, few-shot learning, and information retrieval. Furthermore, the proposed semantic compression method exhibits consistent fluency in text generation while reducing the associated computational overhead.
- 2024.findings-acl.306
+ 2024.findings-acl.306
fei-etal-2024-extending
10.18653/v1/2024.findings-acl.306
@@ -10305,7 +10305,7 @@
Erik Cambria, Nanyang Technological University
5182-5192
The increasing use of complex and opaque black box models requires the adoption of interpretable measures; one such option is extractive rationalizing models, which serve as a more interpretable alternative. These models, also known as Explain-Then-Predict models, employ an explainer model to extract rationales and subsequently condition the predictor with the extracted information. Their primary objective is to provide precise and faithful explanations, represented by the extracted rationales. In this paper, we take a semi-supervised approach to optimize for the plausibility of extracted rationales. We adopt a pre-trained natural language inference (NLI) model and further fine-tune it on a small set of supervised rationales (10%). The NLI predictor is leveraged as a source of supervisory signals to the explainer via entailment alignment. We show that, by enforcing the alignment agreement between the explanation and answer in a question-answering task, the performance can be improved without access to ground truth labels. We evaluate our approach on the ERASER dataset and show that our approach achieves comparable results with supervised extractive models and outperforms unsupervised approaches by > 100%.
- 2024.findings-acl.307
+ 2024.findings-acl.307
wei-jie-etal-2024-plausible
10.18653/v1/2024.findings-acl.307
@@ -10322,7 +10322,7 @@
Jaegul Choo, Korea Advanced Institute of Science and Technology
5193-5221
Building a reliable visual question answering (VQA) system across different languages is a challenging problem, primarily due to the lack of abundant samples for training. To address this challenge, recent studies have employed machine translation systems for the cross-lingual VQA task. This involves translating the evaluation samples into a source language (usually English) and using monolingual models (i.e., translate-test). However, our analysis reveals that translated texts contain unique characteristics distinct from human-written ones, referred to as translation artifacts. We find that these artifacts can significantly affect the models, confirmed by extensive experiments across diverse models, languages, and translation processes. In light of this, we present a simple data augmentation strategy that can alleviate the adverse impacts of translation artifacts.
- 2024.findings-acl.308
+ 2024.findings-acl.308
park-etal-2024-translation
10.18653/v1/2024.findings-acl.308
@@ -10335,7 +10335,7 @@
Chenrui Mao
5222-5235
Existing methods for incorporating entities into EAE rely on prompts or NER. They typically fail to explicitly explore the role of entity types, which results in shallow argument comprehension and often encounter three issues: (1) weak semantic associations due to missing role-entity correspondence cues; (2) compromised semantic integrity from abandoning context after recognizing entities regardless of their types; (3) one-sided semantic understanding relying solely on argument role semantics. To tackle these issues, we propose Scented-EAE, an EAE model with stage-customized entity type embedding to explicitly underscore and explore the role of entity types, thus intervening in argument selection. Specifically, at the input stage, we strengthen semantic associations by prompting role-entity correspondence after extending a non-autoregressive decoder as part of the encoder. At the intermediate stage, we preserve semantic integrity by optimizing our proposed BIO-aware NER and EAE via a novel IPE joint learning. At the output stage, we expand semantic understanding dimensions by determining arguments using span selectors from argument roles and entity types. Experiments show that our model achieves state-of-the-art performance on mainstream benchmarks. In addition, it also exhibits robustness in low-resource settings with the help of prompts and entity types.
- 2024.findings-acl.309
+ 2024.findings-acl.309
yang-etal-2024-scented
10.18653/v1/2024.findings-acl.309
@@ -10346,7 +10346,7 @@
William Cheung, Hong Kong Baptist University
5236-5249
Low-rank adaptation (LoRA) achieves parameter-efficient fine-tuning for large language models (LLMs) by decomposing the model weight update into a pair of low-rank projection matrices. Yet, the memory overhead restricts its ability to scale up as the model size increases. We propose Randomized LoRA (RLoRA) which adopts the Randomized Walsh-Hadamard Transform to achieve a significant reduction in the size of trainable parameters compared to LoRA. At the same time, it allows a PAC-Bayes regularizer to be efficiently incorporated to improve generalization. We evaluate the effectiveness of RLoRA on the LLMs RoBERTa, GPT-2 and LLaMA-7B using GLUE, E2E and math reasoning benchmarks. With a much lower memory requirement, RLoRA can give similar performance to the SOTA low-rank adaptation methods for these three tasks and significantly better performance under few-shot settings.
- 2024.findings-acl.310
+ 2024.findings-acl.310
lei-etal-2024-fast
10.18653/v1/2024.findings-acl.310
@@ -10358,7 +10358,7 @@
Yanfeng Wang, Shanghai Jiao Tong University
5250-5261
In the realm of text-conditioned image retrieval, models utilize a query composed of a reference image and modification text to retrieve corresponding images. Despite its significance, this task is fraught with challenges, including small-scale datasets due to labeling costs and the complexity of attributes in modification texts. These challenges often result in models learning a generalized representation of the query, thereby missing the semantic correlations of image and text attributes. In this paper, we introduce a general boosting framework designed to address these issues by employing semantic discrepancy alignment. Our framework first leverages ChatGPT to augment text data by modifying the original modification text’s attributes. The augmented text is then combined with the original reference image to create an augmented composed query. Then we generate corresponding images using GPT-4 for the augmented composed query. We realize the cross-modal semantic discrepancy alignment by formulating distance consistency and neighbor consistency between the image and text domains. Through this novel approach, attributes in the text domain can be more effectively transferred to the image domain, enhancing retrieval performance. Extensive experiments on three prominent datasets validate the effectiveness of our approach, with state-of-the-art results on a majority of evaluation metrics compared to various baseline methods.
- 2024.findings-acl.311
+ 2024.findings-acl.311
yang-etal-2024-sda
10.18653/v1/2024.findings-acl.311
@@ -10374,7 +10374,7 @@
Qi Zhang
5262-5284
The remarkable capability of large language models (LLMs) for in-context learning (ICL) needs to be activated by demonstration examples. Prior work has extensively explored the selection of examples for ICL, predominantly following the “select then organize” paradigm; such approaches often neglect the internal relationships between examples and introduce an inconsistency between training and inference. In this paper, we formulate the problem as a Sequential Selection problem and introduce Se^2, a sequential-aware method that leverages the LLM’s feedback on varying context, aiding in capturing inter-relationships and sequential information among examples, significantly enriching the contextuality and relevance of ICL prompts. Meanwhile, we utilize beam search to seek and construct example sequences, enhancing both quality and diversity. Extensive experiments across 23 NLP tasks from 8 distinct categories illustrate that Se^2 markedly surpasses competitive baselines and achieves 42% relative improvement over random selection. Further in-depth analysis shows the effectiveness of proposed strategies, highlighting Se^2’s exceptional stability and adaptability across various scenarios. Code available at https://github.com/microsoft/LMOps.
- 2024.findings-acl.312
+ 2024.findings-acl.312
liu-etal-2024-se2
10.18653/v1/2024.findings-acl.312
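Se^2's beam search over demonstration sequences, described above, can be pictured with a generic skeleton: grow candidate sequences one example at a time, score each extension, and keep only the best few partial sequences. The toy sketch below is our illustration; `score_fn` stands in for the LLM-feedback scorer the paper actually trains.

```python
from typing import Callable, List, Sequence, Tuple

def beam_select_examples(
    pool: Sequence[str],
    score_fn: Callable[[List[str], str], float],  # score of appending `cand` to `prefix`
    seq_len: int,
    beam_width: int = 3,
) -> List[str]:
    """Build a demonstration sequence step by step, keeping the best partial sequences."""
    beams: List[Tuple[List[str], float]] = [([], 0.0)]
    for _ in range(seq_len):
        expanded = [
            (prefix + [cand], score + score_fn(prefix, cand))
            for prefix, score in beams
            for cand in pool
            if cand not in prefix
        ]
        beams = sorted(expanded, key=lambda b: b[1], reverse=True)[:beam_width]
    return beams[0][0]

# Trivial scorer that prefers shorter examples, just to show the interface.
pool = ["a fairly long demonstration ...", "short demo", "another medium demo ..."]
print(beam_select_examples(pool, lambda prefix, cand: -len(cand), seq_len=2))
```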
@@ -10388,7 +10388,7 @@
Rong Xiao
5285-5299
This research aims to accelerate the inference speed of large language models (LLMs) with billions of parameters. We propose Smart Parallel Auto-Correct dEcoding (SPACE), an approach designed for achieving lossless acceleration of LLMs. By integrating semi-autoregressive inference and speculative decoding capabilities, SPACE uniquely enables autoregressive LLMs to parallelize token generation and verification. This is realized through a specialized semi-autoregressive supervised fine-tuning process that equips existing LLMs with the ability to simultaneously predict multiple tokens. Additionally, an auto-correct decoding algorithm facilitates the simultaneous generation and verification of token sequences within a single model invocation. Through extensive experiments on a range of LLMs, SPACE has demonstrated inference speedup ranging from 2.7x to 4.0x on HumanEval-X while maintaining output quality.
- 2024.findings-acl.313
+ 2024.findings-acl.313
yi-etal-2024-generation
10.18653/v1/2024.findings-acl.313
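SPACE's parallel generate-and-verify step, as described above, shares its acceptance logic with other speculative-decoding schemes: drafted tokens are kept only up to the first position where the verifying pass disagrees. Below is a stripped-down sketch of that prefix-acceptance rule with greedy verification; it is ours, and SPACE's semi-autoregressive drafting is more involved.

```python
from typing import List, Tuple

def accept_drafted_prefix(draft: List[int], verifier_argmax: List[int]) -> Tuple[List[int], int]:
    """Keep the longest prefix of drafted tokens the verifier agrees with.

    `verifier_argmax[i]` is the verifier's greedy prediction for position i.
    On the first mismatch, the verified token is substituted, so every model
    invocation makes progress even when the draft is rejected early.
    """
    accepted: List[int] = []
    for drafted, checked in zip(draft, verifier_argmax):
        if drafted != checked:
            accepted.append(checked)  # replace first mismatch with the verified token
            break
        accepted.append(drafted)
    return accepted, len(accepted)

print(accept_drafted_prefix([5, 9, 2, 7], [5, 9, 4, 7]))  # ([5, 9, 4], 3)
```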
@@ -10404,7 +10404,7 @@
Le Sun, Institute of Software, Chinese Academy of Sciences
5300-5318
Evaluation is the baton for the development of large language models. Current evaluations typically employ a single-item assessment paradigm for each atomic test objective, which struggles to discern whether a model genuinely possesses the required capabilities or merely memorizes/guesses the answers to specific questions. To this end, this paper proposes a novel evaluation framework referred to as StructEval. Starting from an atomic test objective, StructEval deepens and broadens the evaluation by conducting a structured assessment across multiple cognitive levels and critical concepts, and therefore offers comprehensive, robust and consistent evaluations for large language models. Experiments on three widely-used benchmarks demonstrate that StructEval serves as a reliable tool for resisting the risk of data contamination and reducing the interference of potential biases, thereby providing a more reliable and consistent conclusion regarding model capabilities. Our framework also sheds light on the design of future principled and trustworthy LLM evaluation protocols.
- 2024.findings-acl.314
+ 2024.findings-acl.314
cao-etal-2024-structeval
10.18653/v1/2024.findings-acl.314
@@ -10416,7 +10416,7 @@
Deyi Xiong, Tianjin University
5319-5332
Preventing privacy leakage in large language models remains a paramount challenge. In this paper, we reveal Privacy Seesaw in LLM privacy safeguarding, a phenomenon where measures to secure specific private information inadvertently heighten exposure risks for other private information. Through comprehensive analysis, we identify the amount of targeted privacy data and the volume of edited privacy neurons as the two central triggers to this issue. To mitigate privacy seesaw, we propose Augmented Privacy Neuron Editing via Activation Patching (APNEAP), a novel framework designed to balance model performance with privacy protection. The proposed APNEAP augments collected private data by automatically synthesizing new private data, which deactivates the first trigger to the privacy seesaw issue. Additionally, it adapts activation patching to privacy neuron editing for switching off the second trigger to the privacy seesaw problem. Experimental results show that the proposed APNEAP is capable of alleviating the privacy seesaw phenomenon and offers a more stable and reliable approach to privacy protection in LLMs than previous methods.
- 2024.findings-acl.315
+ 2024.findings-acl.315
wu-etal-2024-mitigating-privacy
10.18653/v1/2024.findings-acl.315
@@ -10427,7 +10427,7 @@
Majed El Helou, ETHZ - ETH Zurich
5333-5338
Understanding the nature of high-quality summaries is crucial to further improve the performance of multi-document summarization. We propose an approach to characterize human-written summaries using partial information decomposition, which decomposes the mutual information provided by all source documents into union, redundancy, synergy, and unique information. Our empirical analysis on different MDS datasets shows that there is a direct dependency between the number of sources and their contribution to the summary.
- 2024.findings-acl.316
+ 2024.findings-acl.316
mascarell-etal-2024-information
10.18653/v1/2024.findings-acl.316
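For readers unfamiliar with partial information decomposition, used in the entry above: in the two-source case it splits the total mutual information between the sources and the target (here, source documents and the summary) into redundant, unique, and synergistic atoms, which aggregate into the union/redundancy/synergy/unique terms the abstract lists. The standard two-source identity reads:

```latex
I(S_1, S_2; Y) = \underbrace{R(S_1, S_2; Y)}_{\text{redundancy}}
               + \underbrace{U(S_1; Y)}_{\text{unique to } S_1}
               + \underbrace{U(S_2; Y)}_{\text{unique to } S_2}
               + \underbrace{S(S_1, S_2; Y)}_{\text{synergy}}
```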
@@ -10441,7 +10441,7 @@
Zheli Liu
5339-5352
Backdoor attacks pose an increasingly severe security threat to Deep Neural Networks (DNNs) during their development stage. In response, backdoor sample purification has emerged as a promising defense mechanism, aiming to eliminate backdoor triggers while preserving the integrity of the clean content in the samples. However, existing approaches have predominantly focused on the word space; they are ineffective against feature-space triggers and significantly impair performance on clean data. To address this, we introduce a universal backdoor defense that purifies backdoor samples in the activation space by drawing abnormal activations towards optimized minimum clean activation distribution intervals. The advantages of our approach are twofold: (1) By operating in the activation space, our method captures everything from surface-level information like words to higher-level semantic concepts such as syntax, thus counteracting diverse triggers; (2) the fine-grained continuous nature of the activation space allows for more precise preservation of clean content while removing triggers. Furthermore, we propose a detection module based on statistical information of abnormal activations, to achieve a better trade-off between clean accuracy and defense performance. Extensive experiments on diverse datasets and against diverse attacks (including syntax and style attacks) demonstrate that our defense achieves state-of-the-art performance.
- 2024.findings-acl.317
+ 2024.findings-acl.317
yi-etal-2024-badacts
10.18653/v1/2024.findings-acl.317
@@ -10458,7 +10458,7 @@
Tat-Seng Chua, National University of Singapore
5353-5377
Molecule-text modeling, which aims to facilitate molecule-relevant tasks with a textual interface and textual knowledge, is an emerging research direction. Beyond single molecules, studying reaction-text modeling holds promise for helping the synthesis of new materials and drugs. However, previous works mostly neglect reaction-text modeling: they primarily focus on modeling individual molecule-text pairs or learning chemical reactions without texts in context. Additionally, one key task of reaction-text modeling – experimental procedure prediction – is less explored due to the absence of an open-source dataset. The task is to predict step-by-step actions of conducting chemical experiments and is crucial to automating chemical synthesis. To resolve the challenges above, we propose a new pretraining method, ReactXT, for reaction-text modeling, and a new dataset, OpenExp, for experimental procedure prediction. Specifically, ReactXT features three types of input contexts to incrementally pretrain LMs. Each of the three input contexts corresponds to a pretraining task to improve the text-based understanding of either reactions or single molecules. ReactXT demonstrates consistent improvements in experimental procedure prediction and molecule captioning and offers competitive results in retrosynthesis. Our code is available at https://github.com/syr-cn/ReactXT.
- 2024.findings-acl.318
+ 2024.findings-acl.318
liu-etal-2024-reactxt
10.18653/v1/2024.findings-acl.318
@@ -10470,7 +10470,7 @@
Jianxin Wang, Central South University
5378-5389
Medical Visual Question Answering (Med-VQA) seeks to accurately respond to queries regarding medical images, a task particularly challenging for open-ended questions. This study unveils the Multi-modal Concept Alignment Pre-training (MMCAP) approach for generative Med-VQA, leveraging a knowledge graph sourced from medical image-caption datasets and the Unified Medical Language System. MMCAP advances the fusion of visual and textual medical knowledge via a graph attention network and a transformer decoder. Additionally, it incorporates a Type Conditional Prompt in the fine-tuning phase, markedly boosting the accuracy and relevance of answers to open-ended questions. Our tests on benchmark datasets illustrate MMCAP’s superiority over existing methods, demonstrating its high efficiency in data-limited settings and effective knowledge-image alignment capability.
- 2024.findings-acl.319
+ 2024.findings-acl.319
yan-etal-2024-multi
10.18653/v1/2024.findings-acl.319
@@ -10485,7 +10485,7 @@
Prasanna Srinivasa Murthy, Amazon
5390-5404
Ordinal Classification (OC) is a widely encountered challenge in Natural Language Processing (NLP), with applications in various domains such as sentiment analysis, rating prediction, and more. Previous approaches to tackle OC have primarily focused on modifying existing or creating novel loss functions that explicitly account for the ordinal nature of labels. However, with the advent of Pre-trained Language Models (PLMs), it became possible to tackle ordinality through the implicit semantics of the labels as well. This paper provides a comprehensive theoretical and empirical examination of both these approaches. Furthermore, we also offer strategic recommendations regarding the most effective approach to adopt based on specific settings.
- 2024.findings-acl.320
+ 2024.findings-acl.320
kasa-etal-2024-exploring
10.18653/v1/2024.findings-acl.320
@@ -10503,7 +10503,7 @@
Irene Li
5405-5418
Educational materials such as survey articles in specialized fields like computer science traditionally require tremendous expert inputs and are therefore expensive to create and update. Recently, Large Language Models (LLMs) have achieved significant success across various general tasks. However, their effectiveness and limitations in the education domain are yet to be fully explored. In this work, we examine the proficiency of LLMs in generating succinct survey articles specific to the niche field of NLP in computer science, focusing on a curated list of 99 topics. Automated benchmarks reveal that GPT-4 surpasses its predecessors, including GPT-3.5, PaLM2, and LLaMa2, by margins ranging from 2% to 20% in comparison to the established ground truth. We compare both human and GPT-based evaluation scores and provide in-depth analysis. While our findings suggest that GPT-created surveys are more contemporary and accessible than human-authored ones, certain limitations were observed. Notably, GPT-4, despite often delivering outstanding content, occasionally exhibited lapses like missing details or factual errors. Finally, we compared the rating behavior between humans and GPT-4 and found systematic bias in GPT-based evaluation.
- 2024.findings-acl.321
+ 2024.findings-acl.321
gao-etal-2024-evaluating-large
10.18653/v1/2024.findings-acl.321
@@ -10518,7 +10518,7 @@
Xueqi Cheng, Chinese Academy of Sciences
5419-5437
Although model editing has shown promise in revising knowledge in Large Language Models (LLMs), its impact on the inherent capabilities of LLMs is often overlooked. In this work, we reveal a critical phenomenon: even a single edit can trigger model collapse, manifesting as significant performance degradation in various benchmark tasks. However, benchmarking LLMs after each edit, while necessary to prevent such collapses, is impractically time-consuming and resource-intensive. To mitigate this, we propose using perplexity as a surrogate metric, validated by extensive experiments demonstrating that changes in an edited model’s perplexity are strongly correlated with its downstream task performance. We further conduct an in-depth study on sequential editing, a practical setting for real-world scenarios, across various editing methods and LLMs, focusing on hard cases from our previous single edit studies. The results indicate that nearly all examined editing methods result in model collapse after only a few edits. To facilitate further research, we have utilized GPT-3.5 to develop a new dataset, HardEdit, based on those hard cases. This dataset aims to establish the foundation for pioneering research in reliable model editing and the mechanisms underlying editing-induced model collapse. We hope this work can draw the community’s attention to the potential risks inherent in model editing practices.
- 2024.findings-acl.322
+ 2024.findings-acl.322
yang-etal-2024-butterfly
10.18653/v1/2024.findings-acl.322
@@ -10529,7 +10529,7 @@
Xiaowen Chu, Hong Kong University of Science and Technology (Guangzhou)
5438-5455
Model editing has become a promising method for precisely and effectively updating knowledge in language models. In this paper, we investigate knowledge attenuation, in which the retention of updated knowledge within the language model decreases as the number of edits increases after sequential editing. Through empirical study, we discovered that existing editing methods generally suffer from knowledge attenuation. We attribute this phenomenon to two aspects: (1) redundant parameters interference and (2) update weight disentanglement. To this end, we propose the AdaPLE method. It not only mitigates the knowledge attenuation issue but also improves the performance on existing benchmarks. To the best of our knowledge, we are the first to investigate the cause and mitigation of knowledge attenuation in sequential LLM editing.
- 2024.findings-acl.323
+ 2024.findings-acl.323
li-chu-2024-continually
10.18653/v1/2024.findings-acl.323
@@ -10544,7 +10544,7 @@
Reynold Cheng
5456-5471
Large Language Models (LLMs) driven by In-Context Learning (ICL) have significantly improved the performance of text-to-SQL. Previous methods generally employ a two-stage reasoning framework, namely 1) schema linking and 2) logical synthesis, making the framework not only effective but also interpretable. Despite these advancements, the inherently imperfect generalization of LLMs often results in hallucinations, which limits the full potential of LLMs. In this work, we first identify and categorize the common types of hallucinations at each stage in text-to-SQL. We then introduce a novel strategy, Task Alignment (TA), designed to mitigate hallucinations at each stage. TA encourages LLMs to take advantage of experiences from similar tasks rather than starting the tasks from scratch. This can help LLMs reduce the burden of generalization, thereby mitigating hallucinations effectively. We further propose TA-SQL, a text-to-SQL framework based on this strategy. The experimental results and comprehensive analysis demonstrate the effectiveness and robustness of our framework. Specifically, it enhances the performance of the GPT-4 baseline by a relative 21.23% on the BIRD dev set and yields significant improvements across six models and four mainstream, complex text-to-SQL benchmarks.
- 2024.findings-acl.324
+ 2024.findings-acl.324
qu-etal-2024-generation
10.18653/v1/2024.findings-acl.324
@@ -10559,7 +10559,7 @@
Jinsong Su, Xiamen University
5472-5485
In-image machine translation (IIMT) aims to translate an image containing text in the source language into an image containing translations in the target language. In this regard, conventional cascaded methods suffer from issues such as error propagation, massive parameters, and difficulties in deployment and retaining visual characteristics of the input image. Thus, constructing end-to-end models has become an option, which, however, faces two main challenges: 1) the huge modeling burden, as it is required to simultaneously learn alignment across languages and preserve the visual characteristics of the input image; 2) the difficulties of directly predicting excessively lengthy pixel sequences. In this paper, we propose Translatotron-V(ision), an end-to-end IIMT model consisting of four modules. In addition to an image encoder and an image decoder, our model contains a target text decoder and an image tokenizer. Among them, the target text decoder is used to alleviate the language alignment burden, and the image tokenizer converts long sequences of pixels into shorter sequences of visual tokens, preventing the model from focusing on low-level visual features. Besides, we present a two-stage training framework for our model to assist the model in learning alignment across modalities and languages. Finally, we propose a location-aware evaluation metric called Structure-BLEU to assess the translation quality of the generated images. Experimental results demonstrate that our model achieves competitive performance compared to cascaded models with only 70.9% of parameters, and significantly outperforms the pixel-level end-to-end IIMT model.
- 2024.findings-acl.325
+ 2024.findings-acl.325
lan-etal-2024-translatotron
10.18653/v1/2024.findings-acl.325
@@ -10574,7 +10574,7 @@
Kurt Stockinger, ZHAW - Zürcher Hochschule für Angewandte Wissenschaften
5486-5507
The potential for improvements brought by Large Language Models (LLMs) in Text-to-SQL systems is mostly assessed on monolingual English datasets. However, LLMs’ performance for other languages remains vastly unexplored. In this work, we release the StatBot.Swiss dataset, the first bilingual benchmark for evaluating Text-to-SQL systems based on real-world applications. The StatBot.Swiss dataset contains 455 natural language/SQL pairs over 35 big databases with varying levels of complexity for both English and German. We evaluate the performance of state-of-the-art LLMs such as GPT-3.5-Turbo and mixtral-8x7b-instruct for the Text-to-SQL translation task using an in-context learning approach. Our experimental analysis illustrates that current LLMs struggle to generalize well in generating SQL queries on our novel bilingual dataset.
- 2024.findings-acl.326
+ 2024.findings-acl.326
nooralahzadeh-etal-2024-statbot
10.18653/v1/2024.findings-acl.326
@@ -10587,7 +10587,7 @@
Wei Ma
5508-5519
The widespread adoption of Large Language Models (LLMs) has led to an increase in AI-generated text on the Internet, presenting a crucial challenge to differentiate AI-created content from human-written text. This challenge is critical to prevent issues of authenticity, trust, and potential copyright violations. Current research focuses on watermarking LLM-generated text, but traditional techniques struggle to balance robustness with text quality. We introduce a novel watermarking approach, Robust and Imperceptible Watermarking (RIW) for LLMs, which leverages token prior probabilities to improve detectability and maintain watermark imperceptibility. RIW methodically embeds watermarks by partitioning selected tokens into two distinct groups based on their prior probabilities and employing tailored strategies for each group. In the detection stage, the RIW method employs the ‘voted z-test’ to provide a statistically robust framework to identify the presence of a watermark accurately. The effectiveness of RIW is evaluated across three key dimensions: success rate, text quality, and robustness against removal attacks. Our experimental results on various LLMs, including GPT2-XL, OPT-1.3B, and LLaMA2-7B, indicate that RIW surpasses existing models, and also exhibits increased robustness against various attacks and good imperceptibility, thus promoting the responsible use of LLMs.
- 2024.findings-acl.327
+ 2024.findings-acl.327
ren-etal-2024-subtle
10.18653/v1/2024.findings-acl.327
@@ -10599,7 +10599,7 @@
Jinyu Guo
5520-5532
Two key challenges remain for document-level event argument extraction (D-EAE) tasks: key feature forgetting and cross-event argument confusion. The emergent capabilities of large language models (LLMs) hold promise for solving these two challenges. In this paper, we propose a document-level event argument extraction method based on guided summarization and reasoning (EAESR), which leverages the emergent capabilities of LLMs to highlight key event information and to clarify the explicit and implicit associations between multiple events. Specifically, we generate document summarization information that shortens the length of the event context while preserving the key event features. In addition, we generate inter-event reasoning information, which helps EAESR make sense of the correlations between events and reduces its dependence on the event context, especially to better cope with the few-shot D-EAE task. Then, we obtain named entity information to enable EAESR to learn argument boundary features and improve the sensitivity of its argument boundary recognition. Finally, we fuse the above features with sentence features so that EAESR has summarization and reasoning capabilities simultaneously. Extensive experiments on WIKIEVENTS and RAMS have shown that EAESR achieves a new state-of-the-art, outperforming the baseline models by 1.3% F1 and 1.6% F1, respectively, and by an average of 11% F1 in few-shot settings.
- 2024.findings-acl.328
+ 2024.findings-acl.328
shuang-etal-2024-thinking
10.18653/v1/2024.findings-acl.328
@@ -10614,7 +10614,7 @@
Baobao Chang, Peking University
5533-5546
Distantly-Supervised Named Entity Recognition (DS-NER) effectively alleviates the burden of annotation, but suffers from label noise. Recent works attempt to adopt the teacher-student framework to gradually refine the training labels and improve the overall robustness. However, we argue that these teacher-student methods achieve limited performance because the poor calibration of the teacher network produces incorrectly pseudo-labeled samples, leading to error propagation. Therefore, we attempt to mitigate this issue by proposing: (1) Uncertainty-Aware Teacher Learning, which leverages prediction uncertainty to reduce the number of incorrect pseudo labels in the self-training stage; (2) Student-Student Collaborative Learning, which allows the transfer of reliable labels between two student networks instead of indiscriminately relying on all pseudo labels from the teacher. This approach further enables a full exploration of mislabeled samples rather than simply filtering out unreliable pseudo-labeled samples. We evaluate our proposed method on five DS-NER datasets, demonstrating that it is superior to state-of-the-art DS-NER denoising methods.
- 2024.findings-acl.329
+ 2024.findings-acl.329
si-etal-2024-improving
10.18653/v1/2024.findings-acl.329
@@ -10626,7 +10626,7 @@
Christopher Manning, Computer Science Department, Stanford University
5547-5558
Social media advertising offers a platform for fossil fuel value chain companies and their agents to reinforce their narratives, often emphasizing economic, labor market, and energy security benefits to promote oil and gas policy and products. Our research question is whether such narratives can be detected automatically and to what extent the cost of human annotation can be reduced. We introduce a task of classifying narratives into seven categories, based on existing definitions and data. Experiments showed that RoBERTa-large outperforms other methods, while GPT-4 Turbo can serve as a viable annotator for the task, thereby reducing human annotation costs. Our findings and insights provide guidance for automating climate-related ad analysis and lead to more scalable ad scrutiny.
- 2024.findings-acl.330
+ 2024.findings-acl.330
rowlands-etal-2024-predicting
10.18653/v1/2024.findings-acl.330
@@ -10642,7 +10642,7 @@
Jianxin Liao
5559-5570
Language Models (LMs) acquire factual knowledge during pre-training and store it in their parameters, which can be valuable for downstream tasks. As the world evolves, some facts may be incorrectly induced or become obsolete over time. Various model editing methods have been proposed to modify specific examples in LMs. However, existing training-based methods still suffer from sub-optimal locality, where irrelevant neighborhood examples can be adversely influenced, and the model’s gradients still struggle to identify the appropriate direction when updating the parameters. To address this issue, we find that directing the hidden state of the edit example towards spaces where semantics are sparse tends to help preserve the semantics of irrelevant neighborhood examples. Based on this hypothesis, we propose a novel metric, named SSS, to evaluate the degree of sparsity around a sentence embedding in the semantic space without any human or machine annotation. Subsequently, we incorporate SSS into the original loss function of existing training-based methods to enhance locality. Experiments conducted on two datasets across various models demonstrate that SSS is effective in improving both locality and reasoning capability.
- 2024.findings-acl.331
+ 2024.findings-acl.331
wang-etal-2024-sss
10.18653/v1/2024.findings-acl.331
@@ -10656,7 +10656,7 @@
Heinz Koeppl
5571-5597
Recent advances in measuring hardness-wise properties of data guide language models in sample selection within low-resource scenarios. However, class-specific properties are overlooked for task setup and learning. How do these properties influence model learning, and are they generalizable across datasets? To answer this question, this work formally introduces the concept of \textit{class-wise hardness}. Experiments across eight natural language understanding (NLU) datasets demonstrate a consistent hardness distribution across learning paradigms, models, and human judgment. Subsequent experiments unveil a notable challenge in measuring such class-wise hardness with the instance-level metrics of previous works. To address this, we propose \textit{GeoHard} for class-wise hardness measurement by modeling class geometry in the semantic embedding space. \textit{GeoHard} surpasses instance-level metrics by over 59 percent in \textit{Pearson}'s correlation when measuring class-wise hardness. Our analysis theoretically and empirically underscores the generality of \textit{GeoHard} as a fresh perspective on data diagnosis. Additionally, we showcase how understanding class-wise hardness can practically aid in improving task learning.
- 2024.findings-acl.332
+ 2024.findings-acl.332
cai-etal-2024-geohard
10.18653/v1/2024.findings-acl.332
@@ -10669,7 +10669,7 @@
Hsin-Hsi Chen, National Taiwan University
5598-5621
In this paper, we investigate the phenomenon of “selection biases” in Large Language Models (LLMs), focusing on problems where models are tasked with choosing the optimal option from an ordered sequence. We delve into biases related to option order and token usage, which significantly impact LLMs’ decision-making processes. We also quantify the impact of these biases through an extensive empirical analysis across multiple models and tasks. Furthermore, we propose mitigation strategies to enhance model performance. Our key contributions are threefold: 1) precisely quantifying the influence of option order and token usage on LLMs, 2) developing strategies to mitigate the impact of token and order sensitivity to enhance robustness, and 3) offering a detailed analysis of sensitivity across models and tasks, which informs the creation of more stable and reliable LLM applications for selection problems.
- 2024.findings-acl.333
+ 2024.findings-acl.333
wei-etal-2024-unveiling
10.18653/v1/2024.findings-acl.333
@@ -10690,7 +10690,7 @@
Timothy Baldwin, Mohamed bin Zayed University of Artificial Intelligence and The University of Melbourne
5622-5640
The focus of language model evaluation has transitioned towards reasoning and knowledge-intensive tasks, driven by advancements in pretraining large models. While state-of-the-art models are partially trained on large Arabic texts, evaluating their performance in Arabic remains challenging due to the limited availability of relevant datasets. To bridge this gap, we present ArabicMMLU, the first multi-task language understanding benchmark for the Arabic language, sourced from school exams across diverse educational levels in different countries spanning North Africa, the Levant, and the Gulf regions. Our data comprises 40 tasks and 14,575 multiple-choice questions in Modern Standard Arabic (MSA) and is carefully constructed by collaborating with native speakers in the region. Our comprehensive evaluations of 35 models reveal substantial room for improvement, particularly among the best open-source models. Notably, BLOOMZ, mT0, LLama2, and Falcon struggle to achieve a score of 50%, while even the top-performing Arabic-centric model only achieves a score of 62.3%.
- 2024.findings-acl.334
+ 2024.findings-acl.334
koto-etal-2024-arabicmmlu
10.18653/v1/2024.findings-acl.334
@@ -10704,7 +10704,7 @@
Thomas Pock, Graz University of Technology
5641-5658
We examine the assumption that hidden-state vectors of recurrent neural networks (RNNs) tend to form clusters of semantically similar vectors, which we dub the clustering hypothesis. While this hypothesis has been assumed in RNN analyses in recent years, its validity has not been studied thoroughly on modern RNN architectures. We first consider RNNs that were trained to recognize regular languages. This enables us to draw on perfect ground-truth automata in our evaluation, against which we can compare the RNN’s accuracy and the distribution of the hidden-state vectors. Then, we consider context-free languages to examine whether RNN states form clusters for more expressive languages. For our analysis, we fit (generalized) linear models to classify RNN states into automata states, and we apply different unsupervised clustering techniques. With a new ambiguity score, derived from information entropy, we measure how well an abstraction function maps the hidden-state vectors to abstract clusters. Our evaluation supports the validity of the clustering hypothesis for regular languages, especially if RNNs are well-trained, i.e., clustering techniques succeed in finding clusters of similar state vectors. However, the clustering accuracy decreases substantially for context-free languages. This suggests that clustering is not a reliable abstraction technique for RNNs used in tasks like natural language processing.
- 2024.findings-acl.335
+ 2024.findings-acl.335
muskardin-etal-2024-relationship
10.18653/v1/2024.findings-acl.335
@@ -10724,7 +10724,7 @@
Le Sun, Institute of Software, Chinese Academy of Sciences
5659-5672
eXtreme Multi-label Classification (XMC) aims at accurately assigning large-scale labels to instances, and is challenging for learning, managing, and predicting over a large-scale and rapidly growing set of labels. Traditional XMC methods, like one-vs-all and tree-based methods, struggle with the growing set of labels due to their static label assumptions, and embedding-based methods struggle with the complex mapping relationships due to their late-interaction paradigm. In this paper, we propose a large language model (LLM) powered agent framework for extreme multi-label classification, XMC-Agent, which can effectively learn, manage and predict the extremely large and dynamically increasing set of labels. Specifically, XMC-Agent models the extreme multi-label classification task as a dynamic navigation problem, employing a scalable hierarchical label index to effectively manage the unified label space. Additionally, we propose two algorithms to enhance the dynamic navigation capabilities of XMC-Agent: a self-construction algorithm for building the scalable hierarchical index, and an iterative feedback learning algorithm for adjusting the agent to specific tasks. Experiments show that XMC-Agent achieves state-of-the-art performance on three standard datasets.
- 2024.findings-acl.336
+ 2024.findings-acl.336
liu-etal-2024-xmc
10.18653/v1/2024.findings-acl.336
@@ -10736,7 +10736,7 @@
Lifan Guo
5673-5693
In light of recent breakthroughs in large language models (LLMs) that have revolutionized natural language processing (NLP), there is an urgent need for new benchmarks to keep pace with the fast development of LLMs. In this paper, we propose CFLUE, the Chinese Financial Language Understanding Evaluation benchmark, designed to assess the capability of LLMs across various dimensions. Specifically, CFLUE provides datasets tailored for both knowledge assessment and application assessment. In knowledge assessment, it consists of 38K+ multiple-choice questions with associated solution explanations. These questions serve dual purposes: answer prediction and question reasoning. In application assessment, CFLUE features 16K+ test instances across distinct groups of NLP tasks such as text classification, machine translation, relation extraction, reading comprehension, and text generation. Upon CFLUE, we conduct a thorough evaluation of representative LLMs. The results reveal that only Qwen-72B, GPT-4, and GPT-4-turbo achieve an accuracy exceeding 60% in answer prediction for knowledge assessment, suggesting that there is still substantial room for improvement in current LLMs. In application assessment, while GPT-4 and GPT-4-turbo rank as the top two performers on average, their significant advantage over open-source LLMs is noticeably diminished, given that Qwen-72B achieves the best performance in 2 out of 5 tasks. The datasets and scripts associated with CFLUE are openly accessible at https://github.com/aliyun/cflue.
- 2024.findings-acl.337
+ 2024.findings-acl.337
zhu-etal-2024-benchmarking
10.18653/v1/2024.findings-acl.337
@@ -10752,7 +10752,7 @@
Ji-Rong Wen, Renmin University of China
5694-5711
Reinforcement learning (RL) has been widely used in training large language models (LLMs) to prevent unexpected outputs, e.g., reducing harmfulness and errors. However, existing RL methods mainly adopt instance-level rewards, which cannot provide fine-grained supervision for complex reasoning tasks. As a result, the RL training cannot be fully aware of the specific part or step that actually leads to the incorrectness in the model response. To address this, we propose a new RL method named RLMEC that incorporates a generative model as the reward model. The reward model is trained on an erroneous-solution rewriting task under a minimum editing constraint, and can produce token-level supervision for RL training. Based on the generative reward model, we design a token-level RL objective for training and an imitation-based regularization for stabilizing the RL process. These two objectives focus on the revision of the key tokens of the erroneous solution, reducing the effect of other unimportant tokens. Experimental results on 8 tasks demonstrate the effectiveness of our approach. Our code and data will be publicly released.
- 2024.findings-acl.338
+ 2024.findings-acl.338
chen-etal-2024-improving
10.18653/v1/2024.findings-acl.338
@@ -10764,7 +10764,7 @@
Yves Scherrer, University of Oslo
5712-5724
We use contextualized word definitions generated by large language models as semantic representations in the task of diachronic lexical semantic change detection (LSCD). In short, generated definitions are used as ‘senses’, and the change score of a target word is retrieved by comparing their distributions in the two time periods under comparison. On the material of five datasets and three languages, we show that generated definitions are indeed specific and general enough to convey a signal sufficient to rank sets of words by the degree of their semantic change over time. Our approach is on par with or outperforms prior non-supervised sense-based LSCD methods. At the same time, it preserves interpretability and allows us to inspect the reasons behind a specific shift in terms of discrete definitions-as-senses. This is another step in the direction of explainable semantic change modeling.
- 2024.findings-acl.339
+ 2024.findings-acl.339
fedorova-etal-2024-definition
10.18653/v1/2024.findings-acl.339
@@ -10782,7 +10782,7 @@
Carleigh Wood
5725-5734
Research in toxicity detection in natural language processing for the speech modality (audio-based) is quite limited, particularly for languages other than English. To address these limitations and lay the groundwork for truly multilingual audio-based toxicity detection, we introduce MuTox, the first highly multilingual audio-based dataset with toxicity labels, covering 14 different linguistic families. The dataset comprises 20,000 audio utterances for English and Spanish, and 4,000 for the other 28 languages. To demonstrate the quality of this dataset, we trained the MuTox audio-based toxicity classifier, which enables zero-shot toxicity detection across a wide range of languages. This classifier performs on par with existing text-based trainable classifiers, while expanding the language coverage more than tenfold. When compared to a wordlist-based classifier that covers a similar number of languages, MuTox improves the F1 score by an average of 100%. This significant improvement underscores the potential of MuTox in advancing the field of audio-based toxicity detection.
- 2024.findings-acl.340
+ 2024.findings-acl.340
costa-jussa-etal-2024-mutox
10.18653/v1/2024.findings-acl.340
@@ -10794,7 +10794,7 @@
Xiaojie Wang, Beijing University of Post and Telecommunication
5735-5748
Instruction Fine-Tuning, a method enhancing pre-trained language models’ capabilities from mere next-word prediction to complex instruction following, often employs a one-off training approach on a diverse instruction dataset. However, this method may not effectively enhance models’ adherence to instructions due to the simultaneous handling of varying instruction complexities. To address this, we propose a novel phased instruction fine-tuning (Phased IFT) method, grounded in the hypothesis of progressive alignment, which posits that the transition of a pre-trained language model from simple next-word prediction to sophisticated instruction following is a gradual learning process. Specifically, we obtain a difficulty score for each instruction via GPT-4, stratify the instruction data into subsets of increasing difficulty, and sequentially uptrain on these subsets using the standard supervised loss. Through extensive experiments on the pre-trained models Llama-2 7B/13B and Mistral-7B using the 52K Alpaca instruction data, we demonstrate that Phased IFT significantly surpasses the traditional one-off instruction fine-tuning (One-off IFT) method in win rate, empirically validating the progressive alignment hypothesis. Our findings suggest that Phased IFT offers a simple yet effective pathway for elevating the instruction-following capabilities of pre-trained language models.
- 2024.findings-acl.341
+ 2024.findings-acl.341
pang-etal-2024-phased
10.18653/v1/2024.findings-acl.341
@@ -10815,7 +10815,7 @@
Man Lan
5749-5765
Topic relevance of an essay demands that the composition adheres to a clear theme and aligns well with the essay prompt requirements, a critical aspect of essay quality evaluation. However, existing research on Automatic Essay Scoring (AES) for Chinese essays has overlooked topic relevance and lacks detailed feedback, while Automatic Essay Comment Generation (AECG) faces much complexity and difficulty. Additionally, current Large Language Models, including GPT-4, often make incorrect judgments and provide overly impractical feedback when evaluating topic relevance. This paper introduces TOREE (Topic Relevance Evaluation), a comprehensive dataset developed to assess topic relevance in Chinese primary and middle school students’ essays, which is beneficial for AES, AECG and other applications. Moreover, our proposed two-step method utilizes TOREE through a combination of Supervised Fine-tuning and Preference Learning. Experimental results demonstrate that TOREE is of high quality, and that our method significantly enhances models’ performance on two designed tasks for topic relevance evaluation, improving both automatic and human evaluations across four diverse LLMs.
- 2024.findings-acl.342
+ 2024.findings-acl.342
zhuang-etal-2024-toree
10.18653/v1/2024.findings-acl.342
@@ -10831,7 +10831,7 @@
Jiaye Yang
5766-5778
Temporal Knowledge Graph (TKG) reasoning seeks to predict future incomplete facts by leveraging historical data. While existing approaches have shown effectiveness in addressing the task through various perspectives, such as graph learning and logic rules, they are limited in capturing the indeterminacy of future events, particularly in the case of rare/unseen facts. To tackle these issues, we introduce a novel approach that conceptualizes TKG reasoning as a sequence denoising process for future facts, namely DiffuTKG. Concretely, we first encode the historical events as a conditional sequence. We then gradually introduce Gaussian noise to corrupt target facts during the forward process, and employ a transformer-based conditional denoiser to restore them in the reverse phase. Moreover, we introduce an uncertainty regularization loss to mitigate the risk of prediction biases caused by favoring frequent scenarios over rare/unseen facts. Empirical results on four real-world datasets show that DiffuTKG outperforms state-of-the-art methods across multiple evaluation metrics.
- 2024.findings-acl.343
+ 2024.findings-acl.343
cai-etal-2024-predicting
10.18653/v1/2024.findings-acl.343
@@ -10843,7 +10843,7 @@
Yue Dong, University of California, Riverside and McGill University
5779-5796
The widespread use of Text-to-Image (T2I) models in content generation requires careful examination of their safety, including their robustness to adversarial attacks. Despite extensive research on adversarial attacks, the reasons for their effectiveness remain underexplored. This paper presents an empirical study on adversarial attacks against T2I models, focusing on analyzing factors associated with attack success rates (ASR). We introduce a new attack objective, entity swapping using adversarial suffixes, and two gradient-based attack algorithms. Human and automatic evaluations reveal the asymmetric nature of ASRs on entity swap: for example, it is easier to replace “human” with “robot” in the prompt “a human dancing in the rain.” with an adversarial suffix, but the reverse replacement is significantly harder. We further propose probing metrics to establish indicative signals from the model’s beliefs to the adversarial ASR. We identify conditions that result in a success probability of 60% for adversarial attacks and others where this likelihood drops below 5%. The code and data are available at https://github.com/Patchwork53/AsymmetricAttack
- 2024.findings-acl.344
+ 2024.findings-acl.344
shahgir-etal-2024-asymmetric
10.18653/v1/2024.findings-acl.344
@@ -10859,7 +10859,7 @@
Bo Tang
5797-5814
Controlled Text Generation (CTG) aims to produce texts that exhibit specific desired attributes. In this study, we introduce a pluggable CTG framework for Large Language Models (LLMs) named Dynamic Attribute Graphs-based controlled text generation (DATG). This framework utilizes an attribute scorer to evaluate the attributes of sentences generated by LLMs and constructs dynamic attribute graphs. DATG modulates the occurrence of key attribute words and key anti-attribute words, achieving effective attribute control without compromising the original capabilities of the model. We conduct experiments across four datasets in two tasks: toxicity mitigation and sentiment transformation, employing five LLMs as foundational models. Our findings highlight a remarkable enhancement in control accuracy, achieving a peak improvement of 19.29% over baseline methods in the most favorable task across four datasets. Additionally, we observe a significant decrease in perplexity, markedly improving text fluency.
- 2024.findings-acl.345
+ 2024.findings-acl.345
liang-etal-2024-controlled
10.18653/v1/2024.findings-acl.345
@@ -10871,20 +10871,20 @@
SangKeun Lee, Korea University
5815-5830
In this paper, we introduce COCONUT to effectively guide the contextualization of structured commonsense knowledge based on large language models. COCONUT employs a contextualized knowledge prompting scheme to gather high-quality contextualization examples from a large language model. These examples are subsequently distilled into small language models to enhance their contextualization capability. Extensive evaluations show that COCONUT considerably improves commonsense reasoning performance across diverse benchmarks, models, and settings, exhibiting its flexibility and universality in generating contextualized commonsense knowledge. Notably, COCONUT consistently outperforms the state-of-the-art technique by an average of 5.8%.
- 2024.findings-acl.346
+ 2024.findings-acl.346
park-etal-2024-coconut
10.18653/v1/2024.findings-acl.346
Mass-Editing Memory with Attention in Transformers: A cross-lingual exploration of knowledge
- Daniel Tamayo, Barcelona Supercomputing Center
+ Daniel Mela, Barcelona Supercomputing Center
Aitor Gonzalez-Agirre
Javier Hernando, Barcelona Supercomputing Center and Universidad Politécnica de Cataluna
Marta Villegas, Barcelona Supercomputing Center, Universitat Pompeu Fabra and Universitat Autònoma de Barcelona
5831-5847
Recent research has explored methods for updating and modifying factual knowledge in large language models, often focusing on specific multi-layer perceptron blocks. This study expands on this work by examining the effectiveness of existing knowledge editing methods across languages and delving into the role of attention mechanisms in this process. Drawing from the insights gained, we propose Mass-Editing Memory with Attention in Transformers (MEMAT), a method that achieves significant improvements in all metrics while requiring minimal parameter modifications. MEMAT delivers a remarkable 10% increase in magnitude metrics, benefits languages not included in the training data and also demonstrates a high degree of portability. Our code and data are at https://github.com/dtamayo-nlp/MEMAT.
- 2024.findings-acl.347
+ 2024.findings-acl.347
mela-etal-2024-mass
10.18653/v1/2024.findings-acl.347
@@ -10899,7 +10899,7 @@
Richard Dufour, Nantes University
5848-5864
Large Language Models (LLMs) have demonstrated remarkable versatility in recent years, offering potential applications across specialized domains such as healthcare and medicine. Despite the availability of various open-source LLMs tailored for health contexts, adapting general-purpose LLMs to the medical domain presents significant challenges. In this paper, we introduce BioMistral, an open-source LLM tailored for the biomedical domain, utilizing Mistral as its foundation model and further pre-trained on PubMed Central. We conduct a comprehensive evaluation of BioMistral on a benchmark comprising 10 established medical question-answering (QA) tasks in English. We also explore lightweight models obtained through quantization and model merging approaches. Our results demonstrate BioMistral’s superior performance compared to existing open-source medical models and its competitive edge against proprietary counterparts. Finally, to address the limited availability of data beyond English and to assess the multilingual generalization of medical LLMs, we automatically translated this benchmark into 7 other languages and evaluated it. This marks the first large-scale multilingual evaluation of LLMs in the medical domain. Datasets, multilingual evaluation benchmarks, scripts, and all the models obtained during our experiments are freely released.
- 2024.findings-acl.348
+ 2024.findings-acl.348
labrak-etal-2024-biomistral
10.18653/v1/2024.findings-acl.348
@@ -10915,7 +10915,7 @@
Michael Lyu, The Chinese University of Hong Kong
5865-5877
Safety lies at the core of developing and deploying large language models (LLMs). However, previous safety benchmarks only concern safety in one language, e.g., the majority language in the pretraining data such as English. In this work, we build the first multilingual safety benchmark for LLMs, XSafety, in response to the global deployment of LLMs in practice. XSafety covers 14 kinds of commonly used safety issues across 10 languages that span several language families. We utilize XSafety to empirically study the multilingual safety of 4 widely-used LLMs, including both closed-API and open-source models. Experimental results show that all LLMs produce significantly more unsafe responses for non-English queries than for English ones, indicating the necessity of developing safety alignment for non-English languages. In addition, we propose a simple and effective prompting method to improve the multilingual safety of ChatGPT by enhancing cross-lingual generalization of safety alignment. Our prompting method can significantly reduce the ratio of unsafe responses by 42% for non-English queries. We will release all the data and results to facilitate future research on LLMs’ safety.
- 2024.findings-acl.349
+ 2024.findings-acl.349
wang-etal-2024-languages
10.18653/v1/2024.findings-acl.349
@@ -10931,7 +10931,7 @@
Vincent Ng, University of Texas at Dallas
5878-5894
Legal Judgment Prediction (LJP) refers to the task of automatically predicting judgment results (e.g., charges, law articles, and terms of penalty) given the fact description of a case. While SOTA models have achieved high accuracy and F1 scores on public datasets, existing datasets fail to evaluate specific aspects of these models, such as legal fairness, which significantly impact their applications in real scenarios. Inspired by functional testing in software engineering, we introduce LJPCHECK, a suite of functional tests for LJP models, to comprehend LJP models’ behaviors and offer diagnostic insights. We illustrate the utility of LJPCHECK on five SOTA LJP models. Extensive experiments reveal vulnerabilities in these models, prompting an in-depth discussion of the underlying reasons for their shortcomings.
- 2024.findings-acl.350
+ 2024.findings-acl.350
zhang-etal-2024-ljpcheck
10.18653/v1/2024.findings-acl.350
@@ -10946,7 +10946,7 @@
Vincent Ng, University of Texas at Dallas
5895-5906
Legal Judgment Prediction (LJP) has attracted significant attention in recent years. However, previous studies have primarily focused on cases involving only a single defendant, skipping multi-defendant cases due to their complexity and difficulty. To advance research, we introduce CMDL, a large-scale real-world Chinese Multi-Defendant LJP dataset, which consists of 393,945 cases with nearly 1.2 million defendants in total. For performance evaluation, we propose case-level evaluation metrics dedicated to the multi-defendant scenario. Experimental results on CMDL show that existing SOTA approaches demonstrate weaknesses when applied to cases involving multiple defendants. We highlight several challenges that require attention and resolution.
- 2024.findings-acl.351
+ 2024.findings-acl.351
huang-etal-2024-cmdl
10.18653/v1/2024.findings-acl.351
@@ -10957,7 +10957,7 @@
Karl Stratos, Rutgers University
5907-5913
Standard fine-tuning is considered not as effective as specialized methods for model editing due to its comparatively poor performance. However, it is simple, agnostic to the architectural details of the model being edited, and able to leverage advances in standard training techniques with no additional work (e.g., black-box PEFT for computational efficiency), making it an appealing choice for a model editor. In this work, we show that standard fine-tuning alone can yield competitive model editing performance with two minor modifications. First, we optimize the conditional likelihood rather than the full likelihood. Second, in addition to the typical practice of training on randomly paraphrased edit prompts to encourage generalization, we also train on random or similar unedited facts to encourage locality. Our experiments on the ZsRE and CounterFact datasets demonstrate that these simple modifications allow standard fine-tuning to match or outperform highly specialized editors in terms of edit score.
- 2024.findings-acl.352
+ 2024.findings-acl.352
gangadhar-stratos-2024-model
10.18653/v1/2024.findings-acl.352
@@ -10978,7 +10978,7 @@
Jiamou Liu, The University of Auckland
5914-5934
Combining large language models with logical reasoning enhances their capacity to address problems in a robust and reliable manner. Nevertheless, the intricate nature of logical reasoning poses challenges when gathering reliable data from the web to build comprehensive training datasets, subsequently affecting performance on downstream tasks. To address this, we introduce a novel logic-driven data augmentation approach, AMR-LDA. AMR-LDA converts the original text into an Abstract Meaning Representation (AMR) graph, a structured semantic representation that encapsulates the logical structure of the sentence, upon which operations are performed to generate logically modified AMR graphs. The modified AMR graphs are subsequently converted back into text to create augmented data. Notably, our methodology is architecture-agnostic and enhances both generative large language models, such as GPT-3.5 and GPT-4, through prompt augmentation, and discriminative large language models through contrastive learning with logic-driven data augmentation. Empirical evidence underscores the efficacy of our proposed method, with improvements in performance across seven downstream tasks, such as reading comprehension requiring logical reasoning, textual entailment, and natural language inference. Furthermore, our method leads on the ReClor leaderboard. The source code and data are publicly available.
- 2024.findings-acl.353
+ 2024.findings-acl.353
bao-etal-2024-abstract
10.18653/v1/2024.findings-acl.353
@@ -11001,7 +11001,7 @@
Luan Thanh Nguyen, University of Information Technology, Vietnam National University Ho Chi Minh City
5948-5961
Recent advancements in hate speech detection (HSD) in Vietnamese have made significant progress, primarily attributed to the emergence of transformer-based pre-trained language models, particularly those built on the BERT architecture. However, the necessity for specialized fine-tuned models has resulted in the complexity and fragmentation of developing a multitasking HSD system. Moreover, most current methodologies focus on fine-tuning general pre-trained models, primarily trained on formal textual datasets like Wikipedia, which may not accurately capture human behavior on online platforms. In this research, we introduce ViHateT5, a T5-based model pre-trained on our proposed large-scale domain-specific dataset named VOZ-HSD. By harnessing the power of a text-to-text architecture, ViHateT5 can tackle multiple tasks using a unified model and achieve state-of-the-art performance across all standard HSD benchmarks in Vietnamese. Our experiments also underscore the significance of label distribution in pre-training data on model efficacy. We provide our experimental materials for research purposes, including the VOZ-HSD dataset, pre-trained checkpoint, the unified HSD-multitask ViHateT5 model, and related source code on GitHub publicly.
- 2024.findings-acl.355
+ 2024.findings-acl.355
thanh-nguyen-2024-vihatet5
10.18653/v1/2024.findings-acl.355
@@ -11012,7 +11012,7 @@
Katja Markert, Heidelberg University
5962-5983
Summarization is an important application of large language models (LLMs). Most previous evaluation of summarization models has focused on their content selection, faithfulness, grammaticality, and coherence. However, it is well known that LLMs can reproduce and reinforce harmful social biases. This raises the question: do biases affect model outputs in a constrained setting like summarization? To help answer this question, we first motivate and introduce a number of definitions for biased behaviours in summarization models, along with practical operationalizations. Since we find that biases inherent to input documents can confound bias analysis in summaries, we propose a method to generate input documents with carefully controlled demographic attributes. This allows us to study summarizer behavior in a controlled setting, while still working with realistic input documents. We measure gender bias in English summaries generated by both purpose-built summarization models and general-purpose chat models as a case study. We find content selection in single-document summarization to be largely unaffected by gender bias, while hallucinations exhibit evidence of bias. To demonstrate the generality of our approach, we additionally investigate racial bias, including intersectional settings.
- 2024.findings-acl.356
+ 2024.findings-acl.356
steen-markert-2024-bias
10.18653/v1/2024.findings-acl.356
@@ -11030,7 +11030,7 @@
Bolin Ding, Alibaba Group
5984-5996
Despite the success of large language models (LLMs) in natural language generation, much evidence shows that LLMs may produce incorrect or nonsensical text. This limitation highlights the importance of discerning when to trust LLMs, especially in safety-critical domains. Existing methods often express reliability by confidence level; however, their effectiveness is limited by the lack of objective guidance. To address this, we propose the CONfidence-Quality-ORDer-preserving alignment approach (CONQORD), which leverages reinforcement learning guided by a tailored dual-component reward function. This function integrates quality reward and order-preserving alignment reward functions. Specifically, the order-preserving reward incentivizes the model to verbalize greater confidence for responses of higher quality, aligning the order of confidence and quality. Experiments demonstrate that CONQORD significantly improves the alignment between confidence and response accuracy, without causing over-cautiousness. Furthermore, the aligned confidence provided by CONQORD informs when to trust LLMs, and acts as a determinant for initiating the retrieval of external knowledge. Aligning confidence with response quality ensures more transparent and reliable responses, providing better trustworthiness.
- 2024.findings-acl.357
+ 2024.findings-acl.357
tao-etal-2024-trust
10.18653/v1/2024.findings-acl.357
@@ -11054,7 +11054,7 @@
Reut Tsarfaty, Google and Bar-Ilan University, Technion
6008-6022
Large Vision-Language Models (LVLMs) are an extension of Large Language Models (LLMs) that facilitate processing both image and text inputs, expanding AI capabilities. However, LVLMs struggle with object hallucinations due to their reliance on text cues and learned object co-occurrence biases. While most research quantifies these hallucinations, mitigation strategies are still lacking. Our study introduces a Language Contrastive Decoding (LCD) algorithm that adjusts LVLM outputs based on LLM distribution confidence levels, effectively reducing object hallucinations. We demonstrate the advantages of LCD in leading LVLMs, showing up to 4% improvement in POPE F1 scores and up to 36% reduction in CHAIR scores on the COCO validation set, while also improving captioning quality scores. Our method effectively improves LVLMs without needing complex post-processing or retraining, and is easily applicable to different models. Our findings highlight the potential of further exploration of LVLM-specific decoding algorithms.
- 2024.findings-acl.359
+ 2024.findings-acl.359
manevich-tsarfaty-2024-mitigating
10.18653/v1/2024.findings-acl.359
@@ -11069,7 +11069,7 @@
Laurentiu Zoicas, University of Bucharest
6023-6035
We address the open problem of automatically identifying the direction of lexical borrowing, given word pairs in the donor and recipient languages. We propose strong benchmarks for this task by applying a set of machine learning models. We extract and publicly release a comprehensive borrowings dataset from the recent RoBoCoP cognates and borrowings database for five Romance languages. We experiment on this dataset with both graphic and phonetic representations and with different features, models, and architectures. We interpret the results in terms of F1 score, commenting on the influence of features and model choice, of the imbalanced data, and of the inherent difficulty of the task for particular language pairs. We show that automatically determining the direction of borrowing is a feasible task, and propose additional directions for future work.
- 2024.findings-acl.360
+ 2024.findings-acl.360
dinu-etal-2024-takes
10.18653/v1/2024.findings-acl.360
@@ -11080,7 +11080,7 @@
Derek Greene, University College Dublin
6036-6047
This paper explores the application of synthetic data in the post-OCR domain on multiple fronts by conducting experiments to assess the impact of data volume, augmentation, and synthetic data generation methods on model performance. Furthermore, we introduce a novel algorithm that leverages computer vision feature detection algorithms to calculate glyph similarity for constructing post-OCR synthetic data. Through experiments conducted across a variety of languages, including several low-resource ones, we demonstrate that models like ByT5 can significantly reduce Character Error Rates (CER) without the need for manually annotated data, and our proposed synthetic data generation method shows advantages over traditional methods, particularly in low-resource languages.
- 2024.findings-acl.361
+ 2024.findings-acl.361
guan-greene-2024-advancing
10.18653/v1/2024.findings-acl.361
@@ -11096,7 +11096,7 @@
Zhigang Chen, iFLYTEK Research
6048-6063
This paper presents a novel solution to the challenges posed by the abundance of non-standard addresses input by users in modern applications such as navigation maps, ride-hailing apps, food delivery platforms, and logistics services. These manually entered addresses often contain irregularities, such as missing information, spelling errors, colloquial descriptions, and directional offsets, which hinder address-related tasks like address matching and linking. To tackle these challenges, we propose GeoAgent, a new framework comprising two main components: a large language model (LLM) and a suite of geographical tools. By harnessing the semantic understanding capabilities of the LLM and integrating specific geospatial tools, GeoAgent incorporates spatial knowledge into address texts and achieves efficient address standardization. Further, to verify the effectiveness and practicality of our approach, we construct a comprehensive dataset of complex non-standard addresses, which fills the gaps in existing datasets and proves invaluable for training and evaluating the performance of address standardization models in this community. Experimental results demonstrate the efficacy of GeoAgent, showcasing substantial improvements in the performance of address-related models across various downstream tasks.
- 2024.findings-acl.362
+ 2024.findings-acl.362
huang-etal-2024-geoagent
10.18653/v1/2024.findings-acl.362
@@ -11108,7 +11108,7 @@
Stefan Feuerriegel, LMU Munich
6064-6089
Online propaganda poses a severe threat to the integrity of societies. However, existing datasets for detecting online propaganda have a key limitation: they were annotated using weak labels that can be noisy and even incorrect. To address this limitation, our work makes the following contributions: (1) We present HQP: a novel dataset (N=30000) for detecting online propaganda with high-quality labels. To the best of our knowledge, HQP is the first large-scale dataset for detecting online propaganda that was created through human annotation. (2) We show empirically that state-of-the-art language models fail in detecting online propaganda when trained with weak labels (AUC: 64.03). In contrast, state-of-the-art language models can accurately detect online propaganda when trained with our high-quality labels (AUC: 92.25), which is an improvement of 44%. (3) We show that prompt-based learning using a small sample of high-quality labels can still achieve a reasonable performance (AUC: 80.27) while significantly reducing the cost of labeling. (4) We extend HQP to HQP+ to test how well propaganda across different contexts can be detected. Crucially, our work highlights the importance of high-quality labels for sensitive NLP tasks such as propaganda detection.
- 2024.findings-acl.363
+ 2024.findings-acl.363
maarouf-etal-2024-hqp
10.18653/v1/2024.findings-acl.363
@@ -11122,7 +11122,7 @@
JingBo Zhu, Northeastern University
6090-6101
Aligning Large Language Models (LLMs) with human intentions and values is crucial yet challenging. Current methods primarily rely on human preferences, which are costly and insufficient in capturing nuanced feedback expressed in natural language. In this paper, we present Self-Refinement Tuning (SRT), a method that leverages model feedback for alignment, thereby reducing reliance on human annotations. SRT uses a base language model (e.g., Tulu2) to generate initial responses, which are critiqued and refined by a more advanced model (e.g., GPT-4-Turbo). This process enables the base model to self-evaluate and improve its outputs, facilitating continuous learning. SRT further optimizes the model by learning from its self-generated feedback and refinements, creating a feedback loop that promotes model improvement. Our empirical evaluations demonstrate that SRT significantly outperforms strong baselines across diverse tasks and model sizes. When applied to a 70B parameter model, SRT increases the win rate from 9.6% to 25.8% on the AlpacaEval 2.0 benchmark, surpassing well-established systems such as GPT-4-0314, Claude 2, and Gemini. Our analysis highlights the crucial role of language feedback in the success of SRT, suggesting potential for further exploration in this direction.
- 2024.findings-acl.364
+ 2024.findings-acl.364
hu-etal-2024-teaching
10.18653/v1/2024.findings-acl.364
@@ -11132,7 +11132,7 @@
Lennart Wachowiak, King’s College London, University of London
6102-6117
Despite the ubiquity of large language models (LLMs) in AI research, the question of embodiment in LLMs remains underexplored, distinguishing them from embodied systems in robotics where sensory perception directly informs physical action. Our investigation navigates the intriguing terrain of whether LLMs, despite their non-embodied nature, effectively capture implicit human intuitions about fundamental, spatial building blocks of language. We employ insights from spatial cognitive foundations developed through early sensorimotor experiences, guiding our exploration through the reproduction of three psycholinguistic experiments. Surprisingly, correlations between model outputs and human responses emerge, revealing adaptability without a tangible connection to embodied experiences. Notable distinctions include polarized language model responses and reduced correlations in vision language models. This research contributes to a nuanced understanding of the interplay between language, spatial experiences, and the computations made by large language models. Project website: https://cisnlp.github.io/Spatial_Schemas/
- 2024.findings-acl.365
+ 2024.findings-acl.365
wicke-wachowiak-2024-exploring
10.18653/v1/2024.findings-acl.365
@@ -11145,7 +11145,7 @@
Zhijie Deng, Shanghai Jiaotong University
6118-6130
The detection of machine-generated text, especially from large language models (LLMs), is crucial in preventing serious social problems resulting from their misuse. Some methods train dedicated detectors on specific datasets but fall short in generalizing to unseen test data, while other zero-shot ones often yield suboptimal performance. Although the recent DetectGPT has shown promising detection performance, it suffers from significant inefficiency issues, as detecting a single candidate requires querying the source LLM with hundreds of its perturbations. This paper aims to bridge this gap. Concretely, we propose to incorporate a Bayesian surrogate model, which allows us to select typical samples based on Bayesian uncertainty and interpolate scores from typical samples to other samples, to improve query efficiency. Empirical results demonstrate that our method significantly outperforms existing approaches under a low query budget. Notably, when detecting the text generated by LLaMA family models, our method with just 2 or 3 queries can outperform DetectGPT with 200 queries.
- 2024.findings-acl.366
+ 2024.findings-acl.366
miao-etal-2024-efficient
10.18653/v1/2024.findings-acl.366
@@ -11159,7 +11159,7 @@
Shadi Rezapour, Drexel University
6131-6148
Online communities such as drug-related subreddits serve as safe spaces for people who use drugs (PWUD), fostering discussions on substance use experiences, harm reduction, and addiction recovery. Users’ shared narratives on these forums provide insights into the likelihood of developing a substance use disorder (SUD) and recovery potential. Our study aims to develop a multi-level, multi-label classification model to analyze online user-generated texts about substance use experiences. For this purpose, we first introduce a novel taxonomy to assess the nature of posts, including their intended connections (Inquisition or Disclosure), subjects (e.g., Recovery, Dependency), and specific objectives (e.g., Relapse, Quality, Safety). Using various multi-label classification algorithms on a set of annotated data, we show that GPT-4, when prompted with instructions, definitions, and examples, outperformed all other models. We apply this model to label an additional 1,000 posts and analyze the categories of linguistic expression used within posts in each class. Our analysis shows that topics such as Safety, Combination of Substances, and Mental Health see more disclosure, while discussions about physiological Effects focus on harm reduction. Our work enriches the understanding of PWUD’s experiences and informs the broader knowledge base on SUD and drug use.
- 2024.findings-acl.367
+ 2024.findings-acl.367
bouzoubaa-etal-2024-decoding
10.18653/v1/2024.findings-acl.367
@@ -11173,7 +11173,7 @@
Boi Faltings
6149-6174
Crafting an appealing heading is crucial for attracting readers and marketing work or products. A popular approach is to summarize the main idea with a refined description and a memorable acronym. However, there is no systematic study or formal benchmark, including datasets and metrics, for this problem. Motivated by this absence, we introduce LOgogram, a novel benchmark comprising 6,653 paper abstracts with corresponding descriptions and acronyms. To measure the quality of heading generation, we propose a set of evaluation metrics from three aspects: summarization, neology, and algorithm. Additionally, we explore three strategies for heading generation (generation ordering, tokenization of acronyms, and framework design) under various prevalent learning paradigms (supervised fine-tuning, in-context learning with Large Language Models (LLMs), and reinforcement learning) on our benchmark. Our experimental results indicate the difficulty of identifying a practice that excels across all summarization, neologistic, and algorithmic aspects.
- 2024.findings-acl.368
+ 2024.findings-acl.368
cui-etal-2024-unveiling
10.18653/v1/2024.findings-acl.368
@@ -11186,7 +11186,7 @@
Isabelle Augenstein, University of Copenhagen
6175-6191
Distorted science communication harms individuals and society, as it can lead to unhealthy behavior change and decrease trust in scientific institutions. Given the rapidly increasing volume of science communication in recent years, a fine-grained understanding of how findings from scientific publications are reported to the general public, and methods to detect distortions from the original work automatically, are crucial. Prior work focused on individual aspects of distortions or worked with unpaired data. In this work, we make three foundational contributions towards addressing this problem: (1) annotating 1,600 instances of scientific findings from academic papers paired with corresponding findings as reported in news articles and tweets with respect to four characteristics: causality, certainty, generality, and sensationalism; (2) establishing baselines for automatically detecting these characteristics; and (3) analyzing the prevalence of changes in these characteristics in both human-annotated and large-scale unlabeled data. Our results show that scientific findings frequently undergo subtle distortions when reported. Tweets distort findings more often than science news reports. Detecting fine-grained distortions automatically poses a challenging task. In our experiments, fine-tuned task-specific models consistently outperform few-shot LLM prompting.
- 2024.findings-acl.369
+ 2024.findings-acl.369
wuehrl-etal-2024-understanding
10.18653/v1/2024.findings-acl.369
@@ -11200,7 +11200,7 @@
Srijan Kumar, Georgia Institute of Technology
6192-6210
Social media platforms are hubs for multimodal information exchange, encompassing text, images, and videos, making it challenging for machines to comprehend the information or emotions associated with interactions in online spaces. Multimodal Large Language Models (MLLMs) have emerged as a promising solution to these challenges, yet they struggle to accurately interpret human emotions and complex content such as misinformation. This paper introduces MM-Soc, a comprehensive benchmark designed to evaluate MLLMs’ understanding of multimodal social media content. MM-Soc compiles prominent multimodal datasets and incorporates a novel large-scale YouTube tagging dataset, targeting a range of tasks from misinformation detection and hate speech detection to social context generation. Through our exhaustive evaluation of ten size variants of four open-source MLLMs, we have identified significant performance disparities, highlighting the need for advancements in models’ social understanding capabilities. Our analysis reveals that, in a zero-shot setting, various types of MLLMs generally exhibit difficulties in handling social media tasks. However, MLLMs demonstrate performance improvements after fine-tuning, suggesting potential pathways for improvement.
- 2024.findings-acl.370
+ 2024.findings-acl.370
jin-etal-2024-mm
10.18653/v1/2024.findings-acl.370
@@ -11213,7 +11213,7 @@
Ziyu Yao, George Mason University
6211-6232
Large language models (LLMs) have revolutionized zero-shot task performance, mitigating the need for task-specific annotations while enhancing task generalizability. Despite these advancements, current methods using trigger phrases such as “Let’s think step by step” remain limited. This study introduces PRomPTed, an approach that optimizes the zero-shot prompts for individual task instances in an innovative “LLMs in the loop” manner. Our comprehensive evaluation across 13 datasets and 10 task types based on GPT-4 reveals that PRomPTed significantly outperforms both the naive zero-shot approaches and a strong baseline (i.e., “Output Refinement”) that refines the task output instead of the input prompt. Our experimental results also confirm the generalization of this advantage to the relatively weaker GPT-3.5. Even more intriguingly, we found that leveraging GPT-3.5 to rewrite prompts for the stronger GPT-4 not only matches but occasionally exceeds the efficacy of using GPT-4 as the prompt rewriter. Our research thus presents significant value in not only enhancing zero-shot LLM performance but also potentially enabling the supervision of LLMs with their weaker counterparts, a capability attracting much interest recently. Finally, our additional experiments confirm the generalization of the advantages to open-source LLMs such as Mistral 7B and Mixtral 8x7B.
- 2024.findings-acl.371
+ 2024.findings-acl.371
srivastava-etal-2024-instances
10.18653/v1/2024.findings-acl.371
@@ -11226,7 +11226,7 @@
Aidong Zhang
6233-6251
While large language models (LLMs) have achieved state-of-the-art performance on a wide range of medical question answering (QA) tasks, they still face challenges with hallucinations and outdated knowledge. Retrieval-augmented generation (RAG) is a promising solution and has been widely adopted. However, a RAG system can involve multiple flexible components, and there is a lack of best practices regarding the optimal RAG setting for various medical purposes. To systematically evaluate such systems, we propose the Medical Information Retrieval-Augmented Generation Evaluation (MIRAGE), a first-of-its-kind benchmark including 7,663 questions from five medical QA datasets. Using MIRAGE, we conducted large-scale experiments with over 1.8 trillion prompt tokens on 41 combinations of different corpora, retrievers, and backbone LLMs through the MedRAG toolkit introduced in this work. Overall, MedRAG improves the accuracy of six different LLMs by up to 18% over chain-of-thought prompting, elevating the performance of GPT-3.5 and Mixtral to GPT-4-level. Our results show that the combination of various medical corpora and retrievers achieves the best performance. In addition, we discovered a log-linear scaling property and the “lost-in-the-middle” effects in medical RAG. We believe our comprehensive evaluations can serve as practical guidelines for implementing RAG systems for medicine.
- 2024.findings-acl.372
+ 2024.findings-acl.372
xiong-etal-2024-benchmarking
10.18653/v1/2024.findings-acl.372
@@ -11266,7 +11266,7 @@
Yike Guo
6252-6271
While LLMs demonstrate impressive capabilities in musical knowledge, we find that music reasoning is still an unsolved task. We introduce ChatMusician, an open-source large language model (LLM) that integrates intrinsic musical abilities. It is based on continual pre-training and fine-tuning of LLaMA2 on a text-compatible music representation, ABC notation, with music treated as a second language. ChatMusician can understand and generate music with a pure text tokenizer, without external multi-modal neural structures or tokenizers. Interestingly, endowing musical abilities does not harm language abilities, even achieving a slightly higher MMLU score. ChatMusician is capable of composing well-structured, full-length music, conditioned on texts, chords, melodies, motifs, musical forms, etc. On our meticulously curated college-level music understanding benchmark, MusicTheoryBench, ChatMusician surpasses LLaMA2 and GPT-3.5 by a noticeable margin. We show that ChatMusician preserves or even surpasses the original LLaMA2 7B’s language abilities by evaluating on the MMLU benchmark. Our work reveals that LLMs can be an excellent compressor for music, which can be seen as humanity’s creative language, but there remains significant territory to be conquered. We release our 5B-token music-language corpora MusicPiles, the collected MusicTheoryBench, code, model, and demo.
- 2024.findings-acl.373
+ 2024.findings-acl.373
yuan-etal-2024-chatmusician
10.18653/v1/2024.findings-acl.373
@@ -11277,7 +11277,7 @@
Lidong Bing, Alibaba Group
6272-6286
Knowledge in the real world is being updated constantly. However, it is costly to frequently update large language models (LLMs). Therefore, it is crucial for LLMs to understand the concept of temporal knowledge. However, prior works on temporal question answering (TQA) did not emphasize multi-answer and multi-hop types of temporal reasoning. In this paper, we propose Complex-TR, a complex temporal question-answering dataset that focuses on multi-answer and multi-hop temporal reasoning. We also propose a novel data augmentation strategy to improve the complex temporal reasoning capability and robustness of LLMs. We conducted experiments on multiple temporal QA datasets. Experimental results show that our method improves LLMs’ performance on temporal QA benchmarks by significant margins.
- 2024.findings-acl.374
+ 2024.findings-acl.374
tan-etal-2024-towards
10.18653/v1/2024.findings-acl.374
@@ -11289,7 +11289,7 @@
Max Ryabinin, Together AI
6287-6310
Large language models demonstrate a remarkable capability for learning to solve new tasks from a few examples. The \textit{prompt template}, or the way the input examples are formatted to obtain the prompt, is an important yet often overlooked aspect of in-context learning. In this work, we conduct a comprehensive study of the template format’s influence on in-context learning performance. We evaluate the impact of the prompt template across 21 models (from 770M to 70B parameters) and 4 standard classification datasets. We show that a poor choice of template can reduce the performance of the strongest models and inference methods to random-guess level. More importantly, the best templates do not transfer between different setups, or even between models of the same family. Our findings show that the currently prevalent approach to evaluation, which ignores template selection, may give misleading results due to different templates being used in different works. As a first step towards mitigating this issue, we propose \textit{Template Ensembles}, which aggregate model predictions across several templates. This simple test-time augmentation boosts average performance while being robust to the choice of a random set of templates.
- 2024.findings-acl.375
+ 2024.findings-acl.375
voronov-etal-2024-mind
10.18653/v1/2024.findings-acl.375
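A minimal sketch of the Template Ensembles idea from the abstract above: score each label under several templates and aggregate before predicting. The templates and the `label_score` function are illustrative assumptions, not the paper's implementation.

```python
from statistics import mean

# Illustrative templates; the paper shows the best ones vary across setups.
TEMPLATES = [
    "Input: {x}\nLabel: {y}",
    "Text: {x}\nThe label is {y}",
    "{x}\nQuestion: what is the label?\nAnswer: {y}",
]


def label_score(prompt: str) -> float:
    # Hypothetical scorer, e.g., the model's log-probability of the label
    # continuation under the prompt.
    raise NotImplementedError


def ensemble_predict(x: str, labels: list[str]) -> str:
    # Average each label's score across templates and pick the best label,
    # so one badly chosen template cannot drag accuracy to chance level.
    return max(labels, key=lambda y: mean(
        label_score(t.format(x=x, y=y)) for t in TEMPLATES))
```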
@@ -11303,7 +11303,7 @@
Jundong Li, University of Virginia
6311-6321
Large Language Models (LLMs) have shown unprecedented performance in various real-world applications. However, they are known to generate factually inaccurate outputs, a.k.a. the hallucination problem. In recent years, incorporating external knowledge extracted from Knowledge Graphs (KGs) has become a promising strategy to improve the factual accuracy of LLM-generated outputs. Nevertheless, most existing explorations rely on LLMs themselves to perform KG knowledge extraction, which is highly inflexible, as LLMs can only provide a binary judgment on whether a certain piece of knowledge (e.g., a knowledge path in the KG) should be used. In addition, LLMs tend to pick only knowledge with a direct semantic relationship to the input text, while potentially useful knowledge with indirect semantics can be ignored. In this work, we propose KELP, a principled three-stage framework to handle the above problems. Specifically, KELP achieves finer granularity of flexible knowledge extraction by generating scores for knowledge paths with input texts via latent semantic matching. Meanwhile, knowledge paths with indirect semantic relationships to the input text can also be considered via an encoding trained between the selected paths in the KG and the input text. Experiments on real-world datasets validate the effectiveness of KELP.
- 2024.findings-acl.376
+ 2024.findings-acl.376
liu-etal-2024-knowledge-graph
10.18653/v1/2024.findings-acl.376
@@ -11318,7 +11318,7 @@
Boxing Chen, Huawei Technologies Ltd.
6322-6334
Recently, there has been considerable attention on detecting hallucinations and omissions in Machine Translation (MT) systems. The two dominant approaches to tackle this task involve analyzing the MT system’s internal states or relying on the output of external tools, such as sentence similarity or MT quality estimators. In this work, we introduce OTTAWA, a novel Optimal Transport (OT)-based word aligner specifically designed to enhance the detection of hallucinations and omissions in MT systems. Our approach explicitly models the missing alignments by introducing a “null” vector, for which we propose a novel one-sided constrained OT setting to allow an adaptive null alignment. Our approach yields competitive results compared to state-of-the-art methods across 18 language pairs on the HalOmi benchmark. In addition, it shows promising features, such as the ability to distinguish between the two error types and perform word-level detection without accessing the MT system’s internal states.
- 2024.findings-acl.377
+ 2024.findings-acl.377
huang-etal-2024-ottawa
10.18653/v1/2024.findings-acl.377
@@ -11333,7 +11333,7 @@
Jie Tan
6335-6350
In the realm of event prediction, temporal knowledge graph forecasting (TKGF) stands as a pivotal technique. Previous approaches face the challenges of not utilizing experience during testing and relying on a single short-term history, which limits adaptation to evolving data. In this paper, we introduce the Online Neural-Symbolic Event Prediction (ONSEP) framework, which innovates by integrating dynamic causal rule mining (DCRM) and dual history augmented generation (DHAG). DCRM dynamically constructs causal rules from real-time data, allowing for swift adaptation to new causal relationships. In parallel, DHAG merges short-term and long-term historical contexts, leveraging a bi-branch approach to enrich event prediction. Our framework demonstrates notable performance enhancements across diverse datasets, with significant Hit@k (k=1,3,10) improvements, showcasing its ability to augment large language models (LLMs) for event prediction without necessitating extensive retraining. The ONSEP framework not only advances the field of TKGF but also underscores the potential of neural-symbolic approaches in adapting to dynamic data environments.
- 2024.findings-acl.378
+ 2024.findings-acl.378
yu-etal-2024-onsep
10.18653/v1/2024.findings-acl.378
@@ -11348,7 +11348,7 @@
Phil Woodland, University of Cambridge
6351-6362
Recently, advancements in large language models (LLMs) have shown an unprecedented ability across various language tasks. This paper investigates the potential application of LLMs to slot filling with noisy ASR transcriptions, via both in-context learning and task-specific fine-tuning. Dedicated prompt designs and noise-robust LoRA fine-tuning are proposed to improve the robustness of LLMs for slot filling with noisy ASR transcriptions. Moreover, a linearised knowledge injection (LKI) scheme is also proposed to integrate dynamic external knowledge into LLMs. Experiments were performed on SLURP to quantify the performance of LLMs, including GPT-3.5-turbo, GPT-4, LLaMA-13B, LLaMA-2-13B and Vicuna-13B (v1.1 and v1.5) with different ASR error rates. The use of the noise-robust fine-tuning together with LKI for Vicuna-13B-v1.5 achieved 6.7% and 17.6% absolute SLU-F1 improvements compared to a fully fine-tuned Flan-T5-XL model on the limited data setup and the zero-shot setup respectively.
- 2024.findings-acl.379
+ 2024.findings-acl.379
sun-etal-2024-speech
10.18653/v1/2024.findings-acl.379
@@ -11361,7 +11361,7 @@
Serguei Pakhomov, University of Minnesota - Twin Cities
6363-6377
As artificial neural networks grow in complexity, understanding their inner workings becomes increasingly challenging, which is particularly important in healthcare applications. Perplexity (PPL), an intrinsic evaluation metric of autoregressive neural language models (NLMs), can reflect how “surprised” an NLM is by novel input, and has been widely used to understand the behavior of NLMs. Previous findings show that changes in PPL when masking attention layers in pre-trained transformer-based NLMs reflect linguistic anomalies associated with Alzheimer’s disease dementia. Building upon this, we explore a novel bidirectional attention head ablation method that exhibits properties attributed to the concepts of cognitive and brain reserve in human brain studies, which postulate that people with more neurons in the brain and more efficient processing are more resilient to neurodegeneration. Our results show that larger GPT-2 models require a disproportionately larger share of attention heads to be masked/ablated to display degradation of similar magnitude to masking in smaller models. These results suggest that the attention mechanism in transformer models may present an analogue to the notions of cognitive and brain reserve and could potentially be used to model certain aspects of the progression of neurodegenerative disorders and aging.
- 2024.findings-acl.380
+ 2024.findings-acl.380
li-etal-2024-big
10.18653/v1/2024.findings-acl.380
@@ -11374,7 +11374,7 @@
Reut Tsarfaty, Google and Bar-Ilan University, Technion
6378-6388
While large language models (LLMs) excel in various natural language tasks in English, their performance in low-resource languages like Hebrew, especially for generative tasks such as abstractive summarization, remains unclear. The high morphological richness in Hebrew adds further challenges due to the ambiguity in sentence comprehension and the complexities in meaning construction. In this paper, we address this evaluation and resource gap by introducing HeSum, a novel benchmark dataset specifically designed for Hebrew abstractive text summarization. HeSum consists of 10,000 professionally written article-summary pairs sourced from Hebrew news websites. Linguistic analysis confirms HeSum’s high abstractness and unique morphological challenges. We show that HeSum presents distinct difficulties even for state-of-the-art LLMs, establishing it as a valuable testbed for advancing generative language technology in Hebrew, and for the generative challenges of morphologically rich languages (MRLs) in general.
- 2024.findings-acl.381
+ 2024.findings-acl.381
paz-argaman-etal-2024-hesum
10.18653/v1/2024.findings-acl.381
@@ -11385,7 +11385,7 @@
Yun Zhao, Meta Platforms, Inc
6389-6415
Reasoning about time is essential for understanding the nuances of events described in natural language. Previous research on this topic has been limited in scope, characterized by a lack of standardized benchmarks that would allow for consistent evaluations across different studies. In this paper, we introduce TRAM, a temporal reasoning (TeR) benchmark composed of ten datasets, encompassing various temporal aspects of events such as order, arithmetic, frequency, and duration, designed to facilitate a comprehensive evaluation of the TeR capabilities of large language models (LLMs). We evaluate popular LLMs like GPT-4 and Llama2 in zero-shot and few-shot scenarios, and establish baselines with BERT-based and domain-specific models. Our findings indicate that the best-performing model still lags significantly behind human performance. It is our aspiration that TRAM will spur further progress in enhancing the TeR capabilities of LLMs.
- 2024.findings-acl.382
+ 2024.findings-acl.382
wang-zhao-2024-tram
10.18653/v1/2024.findings-acl.382
@@ -11399,7 +11399,7 @@
William Yang Wang, UC Santa Barbara
6416-6432
This paper investigates the capabilities of Large Language Models (LLMs) in understanding their knowledge and uncertainty over questions. Specifically, we focus on addressing known-unknown questions, characterized by high uncertainty due to the absence of definitive answers. To facilitate our study, we collect a new dataset with Known-Unknown Questions (KUQ) and establish a categorization framework to clarify the origins of uncertainty in such queries. Subsequently, we examine the performance of open-source LLMs, fine-tuned using this dataset, in distinguishing between known and unknown queries within open-ended question-answering scenarios. The fine-tuned models demonstrated a considerable increase in F1-score relative to their pre-fine-tuning state. Through a comprehensive analysis, we reveal insights into the models’ improved uncertainty articulation and their consequent efficacy in multi-agent debates. These findings help us understand how LLMs can be trained to identify and express uncertainty, improving our knowledge of how they understand and express complex or unclear information.
- 2024.findings-acl.383
+ 2024.findings-acl.383
amayuelas-etal-2024-knowledge
10.18653/v1/2024.findings-acl.383
@@ -11415,7 +11415,7 @@
Boi Faltings
6433-6452
Defeasibility in causal reasoning implies that the causal relationship between cause and effect can be strengthened or weakened. Namely, the causal strength between cause and effect should increase or decrease with the incorporation of strengthening arguments (supporters) or weakening arguments (defeaters), respectively. However, existing works ignore defeasibility in causal reasoning and fail to evaluate existing causal strength metrics in defeasible settings. In this work, we present \delta-CAUSAL, the first benchmark dataset for studying defeasibility in causal reasoning. \delta-CAUSAL includes around 11K events spanning ten domains, featuring defeasible causality pairs, namely, cause-effect pairs accompanied by supporters and defeaters. We further show that current causal strength metrics fail to reflect the change of causal strength with the incorporation of supporters or defeaters in \delta-CAUSAL. To this end, we propose CESAR (Causal Embedding aSsociation with Attention Rating), a metric that measures causal strength based on token-level causal relationships. CESAR achieves a significant 69.7% relative improvement over existing metrics, increasing from 47.2% to 80.1% in capturing the causal strength change brought by supporters and defeaters. We further demonstrate that even Large Language Models (LLMs) like GPT-3.5 still lag 4.5 and 10.7 points behind humans in generating supporters and defeaters, emphasizing the challenge posed by \delta-CAUSAL.
- 2024.findings-acl.384
+ 2024.findings-acl.384
cui-etal-2024-exploring
10.18653/v1/2024.findings-acl.384
@@ -11429,7 +11429,7 @@
Graham Neubig, Carnegie Mellon University
6453-6466
Despite recent advances in large language models, building dependable and deployable NLP models typically requires abundant, high-quality training data. However, task-specific data is not available for many use cases, and manually curating task-specific data is labor-intensive. Recent work has studied prompt-driven synthetic data generation using large language models, but these generated datasets tend to lack complexity and diversity. To address these limitations, we introduce a method, _DataTune_, to make better use of existing, publicly available datasets to improve automatic dataset generation. DataTune performs _dataset transformation_, enabling the repurposing of publicly available datasets into a format that is directly aligned with the specific requirements of target tasks. On a diverse set of language-based tasks from the BIG-Bench benchmark, we find that finetuning language models via DataTune improves over a few-shot prompting baseline by 49% and improves over existing methods that use synthetic or retrieved training data by 34%. We find that dataset transformation significantly increases the diversity and difficulty of generated data on many tasks. We release a Python package and open-source repository to make this method accessible to the community (URL will be added upon acceptance).
- 2024.findings-acl.385
+ 2024.findings-acl.385
gandhi-etal-2024-better
10.18653/v1/2024.findings-acl.385
@@ -11442,7 +11442,7 @@
Yulan He, King’s College London, University of London
6467-6481
In-context learning has become a popular paradigm in natural language processing. However, its performance can be significantly influenced by the order of in-context demonstration examples. In this paper, we found that causal language models (CausalLMs) are more sensitive to this order compared to prefix language models (PrefixLMs). We attribute this phenomenon to the auto-regressive attention masks within CausalLMs, which restrict each token from accessing information from subsequent tokens. This results in different receptive fields for samples at different positions, thereby leading to representation disparities across positions. To tackle this challenge, we introduce an unsupervised fine-tuning method, termed the Information-Augmented and Consistency-Enhanced approach. This approach utilizes contrastive learning to align representations of in-context examples across different positions and introduces a consistency loss to ensure similar representations for inputs with different permutations. This enhances the model’s predictive consistency across permutations. Experimental results on five benchmarks suggest that our proposed method can reduce the sensitivity of CausalLMs to the order of in-context examples and exhibit robust generalizability, particularly when demonstrations are sourced from a candidate pool different from that used in the training phase, or when the number of in-context examples differs from what is used during training.
- 2024.findings-acl.386
+ 2024.findings-acl.386
xiang-etal-2024-addressing
10.18653/v1/2024.findings-acl.386
@@ -11454,7 +11454,7 @@
Lucie Flek, Rheinische Friedrich-Wilhelms Universität Bonn
6482-6497
Although language model performance across diverse tasks continues to improve, these models still struggle to understand and explain the beliefs of other people. This skill requires perspective-taking, the process of conceptualizing the point of view of another person. Perspective-taking becomes challenging when the text reflects more personal and potentially more controversial beliefs. We explore this task through natural language generation of responses to conflict situations. We evaluate novel modifications to recent architectures for conditioning generation on an individual’s comments and self-disclosure statements. Our work extends the Social-Chem-101 corpus, using 95k judgements written by 6k authors from English Reddit data, for each of whom we obtained 20-500 self-disclosure statements. Our evaluation methodology borrows ideas from both personalized generation and theory of mind literature. Our proposed perspective-taking models outperform recent work, especially the twin encoder model conditioned on self-disclosures with high similarity to the conflict situation.
- 2024.findings-acl.387
+ 2024.findings-acl.387
plepi-etal-2024-perspective
10.18653/v1/2024.findings-acl.387
@@ -11472,7 +11472,7 @@
Amir Gholami, University of California Berkeley
6498-6526
Pretrained large language models (LLMs) are currently state-of-the-art for solving the vast majority of natural language processing tasks. While many real-world applications still require fine-tuning to reach satisfactory levels of performance, many of them are in the low-data regime, making fine-tuning challenging. To address this, we propose LLM2LLM, a targeted and iterative data augmentation strategy that uses a teacher LLM to enhance a small seed dataset by generating additional data that can be used for fine-tuning on a specific task. LLM2LLM (1) fine-tunes a baseline student LLM on the initial seed data, (2) evaluates and extracts data points that the model gets wrong, and (3) uses a teacher LLM to generate synthetic data based on these incorrect data points, which are then added back into the training data. This approach amplifies the signal from incorrectly predicted data points by the LLM during training and reintegrates them into the dataset to focus on more challenging examples for the LLM. Our results show that LLM2LLM significantly enhances the performance of LLMs in the low-data regime, outperforming both traditional fine-tuning and other data augmentation baselines. LLM2LLM reduces the dependence on labor-intensive data curation and paves the way for more scalable and performant LLM solutions, allowing us to tackle data-constrained domains and tasks. We achieve improvements up to 24.2% on the GSM8K dataset, 32.6% on CaseHOLD, 32.0% on SNIPS, 52.6% on TREC and 39.8% on SST-2 over regular fine-tuning in the low-data regime using a Llama-2-7B student model. Our code is available at https://github.com/SqueezeAILab/LLM2LLM.
- 2024.findings-acl.388
+ 2024.findings-acl.388
lee-etal-2024-llm2llm
10.18653/v1/2024.findings-acl.388
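A minimal sketch of the three-step LLM2LLM loop summarized above; `train_student`, `student_answer`, and `teacher_variants` are hypothetical stubs standing in for fine-tuning, inference, and teacher-side generation.

```python
Example = tuple[str, str]  # (prompt, reference answer)


# Hypothetical stubs; each would wrap real fine-tuning/inference code.
def train_student(data: list[Example]):
    raise NotImplementedError

def student_answer(model, prompt: str) -> str:
    raise NotImplementedError

def teacher_variants(example: Example) -> list[Example]:
    raise NotImplementedError


def llm2llm(seed: list[Example], rounds: int = 3) -> list[Example]:
    data = list(seed)
    for _ in range(rounds):
        model = train_student(data)                         # (1) fine-tune student
        wrong = [ex for ex in data
                 if student_answer(model, ex[0]) != ex[1]]  # (2) collect errors
        for ex in wrong:                                    # (3) teacher writes new
            data.extend(teacher_variants(ex))               #     examples like them
    return data
```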
@@ -11489,7 +11489,7 @@
Ido Dagan, Bar-Ilan University
6527-6548
Multi-document summarization (MDS) is a challenging task, often decomposed into subtasks of salience and redundancy detection, followed by text generation. In this context, alignment of corresponding sentences between a reference summary and its source documents has been leveraged to generate training data for some of the component tasks. Yet, this enabling alignment step has usually been applied heuristically at the sentence level on a limited number of subtasks. In this paper, we propose extending the summary-source alignment framework by (1) applying it at the more fine-grained proposition span level, (2) annotating alignment manually in a multi-document setup, and (3) revealing the great potential of summary-source alignments to yield several datasets for at least six different tasks. Specifically, for each of the tasks, we release a manually annotated test set that was derived automatically from the alignment annotation. We also release development and train sets in the same way, but from automatically derived alignments. Using the datasets, each task is demonstrated with baseline models and corresponding evaluation metrics to spur future research on this broad challenge.
- 2024.findings-acl.389
+ 2024.findings-acl.389
ernst-etal-2024-power
10.18653/v1/2024.findings-acl.389
@@ -11510,7 +11510,7 @@
Robert Nowak, University of Wisconsin-Madison, Toyota Technological Institute at Chicago, and Rice University
6549-6560
Supervised finetuning (SFT) on instruction datasets has played a crucial role in achieving the remarkable zero-shot generalization capabilities observed in modern large language models (LLMs). However, the annotation efforts required to produce high quality responses for instructions are becoming prohibitively expensive, especially as the number of tasks spanned by instruction datasets continues to increase. Active learning is effective in identifying useful subsets of samples to annotate from an unlabeled pool, but its high computational cost remains a barrier to its widespread applicability in the context of LLMs. To mitigate the annotation cost of SFT and circumvent the computational bottlenecks of active learning, we propose using experimental design. Experimental design techniques select the most informative samples to label, and typically maximize some notion of uncertainty and/or diversity. In our work, we implement a framework that evaluates several existing and novel experimental design techniques and find that these methods consistently yield significant gains in label efficiency with little computational overhead. On generative tasks, to reach the same generalization performance, our methods save 50% of the annotation cost compared to random sampling.
- 2024.findings-acl.390
+ 2024.findings-acl.390
bhatt-etal-2024-experimental
10.18653/v1/2024.findings-acl.390
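A minimal sketch of the single-pass, uncertainty-driven selection described in the abstract above; `uncertainty` is a hypothetical scorer (e.g., the base model's mean token entropy on an instruction), and the paper evaluates several such criteria as well as diversity-based ones.

```python
from typing import Callable


def select_for_annotation(
    pool: list[str],
    budget: int,
    uncertainty: Callable[[str], float],
) -> list[str]:
    # Rank unlabeled instructions by a single pass of model uncertainty
    # scoring and send the top-k for annotation; unlike classical active
    # learning, there is no retrain-and-rescore loop.
    return sorted(pool, key=uncertainty, reverse=True)[:budget]
```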
@@ -11522,7 +11522,7 @@
Yan Song, University of Science and Technology of China
6561-6573
In many practical scenarios, contents from different modalities are not semantically aligned; for instance, visual and textual information may conflict with each other, resulting in non-compositional expression effects such as irony or humor. Effective modeling and smooth integration of multimodal information are crucial for achieving good understanding of the contrast across modalities. Focusing on image-text matching, most current studies face challenges in identifying such contrast, leading to limitations in exploring the extended semantics when images and texts do not match. In this paper, we propose an LLM-based approach for learning multimodal contrast following the encoding-decoding paradigm, enhanced by a memory module with reinforced contrast recognition, and use a series of tasks that have the nature of multimodal contrast to verify our approach. The memory module learns the integration between visual and textual features with trainable memory vectors, and the reinforced contrast recognition uses self-rejection sampling to optimize the memory, further enhancing the learning of multimodal contrast. The resulting information, accompanied by visual and text features, is finally fed into the LLM to predict corresponding labels. We evaluate our approach on four English and Chinese benchmark datasets, where it outperforms strong baselines and state-of-the-art studies.
- 2024.findings-acl.391
+ 2024.findings-acl.391
tian-etal-2024-learning
10.18653/v1/2024.findings-acl.391
@@ -11534,7 +11534,7 @@
Carsten Eickhoff, Eberhard-Karls-Universität Tübingen
6574-6584
Text simplification is the process of rewriting a piece of text using simpler vocabulary and grammatical structure in order to make the text more accessible and understandable for a larger audience. In this paper, we introduce a new text simplification model based on the notion of adaptive teaching using a teacher network and a text generation network. We name this new model Simplification via Adaptive Teaching (SAT). Our proposed model sets a new state-of-the-art performance in terms of standard simplification metrics such as SARI and D-SARI with a significant improvement over the previous state of the art on the D-Wikipedia dataset and the Wiki-Doc benchmark dataset. Moreover, we conduct a human evaluation in terms of text simplicity, correctness, and fluency to substantiate SAT’s performance.
- 2024.findings-acl.392
+ 2024.findings-acl.392
bahrainian-etal-2024-text
10.18653/v1/2024.findings-acl.392
@@ -11546,7 +11546,7 @@
Özen Dolcerocca, University of Bologna
6585-6596
This paper introduces a multi-level, multi-label text classification dataset comprising over 3000 documents. The dataset features literary and critical texts from 19th-century Ottoman Turkish and Russian, sourced from prominent literary periodicals of the era, and this is the first study to apply large language models (LLMs) to this material. The texts have been meticulously organized and labeled according to a taxonomic framework that takes into account both their structural and semantic attributes, and articles are categorized and tagged with bibliometric metadata by human experts. We present baseline classification results using a classical bag-of-words (BoW) naive Bayes model and three modern LLMs: multilingual BERT, Falcon, and Llama-v2. We found that in certain cases, the BoW model outperforms the LLMs, emphasizing the need for additional research, especially in low-resource language settings. This dataset is expected to be a valuable resource for researchers in natural language processing and machine learning, especially for historical and low-resource languages. The dataset is publicly available.
- 2024.findings-acl.393
+ 2024.findings-acl.393
gokceoglu-etal-2024-multi
10.18653/v1/2024.findings-acl.393
@@ -11557,7 +11557,7 @@
Uchenna Akujuobi, Sony Research
6597-6610
Aspect-Based Sentiment Analysis (ABSA) involves extracting opinions from textual data about specific entities and their corresponding aspects through various complementary subtasks. Much prior research has focused on developing ad hoc designs of varying complexity for these subtasks. In this paper, we build upon the instruction-tuned model proposed by Scaria et al. (2023), who present an instruction-based model with task descriptions followed by in-context examples on ABSA subtasks. We propose PFInstruct, an extension to this instruction learning paradigm that appends an NLP-related task prefix to the task description. This simple approach leads to improved performance across all tested SemEval subtasks, surpassing the previous state-of-the-art (SOTA) on the ATE subtask (Rest14) by +3.28 F1-score, and on the AOOE subtask by an average of +5.43 F1-score across SemEval datasets. Furthermore, we explore the impact of the prefix-enhanced prompt quality on the ABSA subtasks and find that even a noisy prefix enhances model performance compared to the baseline. Our method also achieves competitive results on a biomedical domain dataset (ERSA).
- 2024.findings-acl.394
+ 2024.findings-acl.394
cabello-akujuobi-2024-simple
10.18653/v1/2024.findings-acl.394
@@ -11569,7 +11569,7 @@
Kristina Lerman, University of Southern California and USC Information Sciences Institute
6611-6631
Language models (LMs) are known to represent the perspectives of some social groups better than others, which may impact their performance, especially on subjective tasks such as content moderation and hate speech detection. To explore how LMs represent different perspectives, existing research focused on positional alignment, i.e., how closely the models mimic the opinions and stances of different groups, e.g., liberals or conservatives. However, human communication also encompasses emotional and moral dimensions. We define the problem of affective alignment, which measures how LMs’ emotional and moral tone represents those of different groups. By comparing the affect of responses generated by 36 LMs to the affect of Twitter messages written by two ideological groups, we observe significant misalignment of LMs with both ideological groups. This misalignment is larger than the partisan divide in the U.S. Even after steering the LMs towards specific ideological perspectives, the misalignment and liberal tendencies of the model persist, suggesting a systemic bias within LMs.
- 2024.findings-acl.395
+ 2024.findings-acl.395
he-etal-2024-whose
10.18653/v1/2024.findings-acl.395
@@ -11587,7 +11587,7 @@
Xuanjing Huang, Fudan University
6632-6646
In the realm of Large Language Models (LLMs), users commonly employ diverse decoding strategies and adjust hyperparameters to control the generated text. However, a critical question emerges: are LLMs conscious of the existence of these decoding strategies and capable of regulating themselves? The current decoding generation process often relies on empirical and heuristic manual adjustments to hyperparameters based on the type of task and its demands. However, this process is typically cumbersome, and the decoding hyperparameters may not always be optimal for each sample. To address these challenges, we propose a novel text generation paradigm termed Hyperparameter Aware Generation (HAG). By leveraging hyperparameter-aware instruction tuning, the LLM autonomously determines the optimal decoding strategy and configurations based on the input samples, enabling self-regulation. Our approach eliminates the need for extensive manual tuning, offering more autonomous, self-regulating model behavior. Experimental results spanning six datasets across reasoning, creativity, translation, and mathematics tasks demonstrate that hyperparameter-aware instruction tuning empowers the LLMs to self-regulate the decoding strategy and hyperparameters. HAG extends the current paradigm in the text generation process, highlighting the feasibility of endowing LLMs with self-regulating decoding strategies.
- 2024.findings-acl.396
+ 2024.findings-acl.396
wang-etal-2024-llm-achieve
10.18653/v1/2024.findings-acl.396
@@ -11603,7 +11603,7 @@
James Kwok, Department of Computer Science and Engineering, The Hong Kong University of Science and Technology
6647-6661
Self-Consistency samples diverse reasoning chains with answers and chooses the final answer by majority voting. It is based on forward reasoning and cannot further improve performance by sampling more reasoning chains when saturated. To further boost performance, we introduce backward reasoning to verify candidate answers. Specifically, for mathematical tasks, we mask a number in the question and ask the LLM to answer a backward question created by a simple template, i.e., to predict the masked number when a candidate answer is provided. Instead of using forward or backward reasoning alone, we propose **FOBAR** to combine **FO**rward and **BA**ckward **R**easoning for verification. Extensive experiments on six standard mathematical datasets and three LLMs show that FOBAR achieves state-of-the-art performance. In particular, FOBAR outperforms Self-Consistency, which uses forward reasoning alone, demonstrating that combining forward and backward reasoning is more accurate in verification. In addition, FOBAR achieves higher accuracy than existing verification methods, showing the effectiveness of the simple template used in backward reasoning and the proposed combination.
- 2024.findings-acl.397
+ 2024.findings-acl.397
jiang-etal-2024-forward
10.18653/v1/2024.findings-acl.397
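A minimal sketch of combining forward and backward reasoning for verification, following the recipe in the abstract above. The masking template and `call_llm` stub are illustrative, answer extraction from the sampled chains is elided, and the paper combines the two directions probabilistically rather than by the hard filtering shown here.

```python
import re
from collections import Counter


def call_llm(prompt: str) -> str:
    # Hypothetical sampling call (temperature > 0 for diverse chains).
    raise NotImplementedError


def backward_supports(question: str, candidate: str) -> bool:
    # Mask one number in the question, reveal the candidate answer, and ask
    # the model to recover the masked number; recovery supports the candidate.
    numbers = re.findall(r"\d+", question)
    if not numbers:
        return True
    masked = question.replace(numbers[0], "x", 1)
    reply = call_llm(
        f"{masked}\nIf the answer to this question is {candidate}, "
        "what is the value of x? Reply with the number only."
    )
    return reply.strip() == numbers[0]


def fobar(question: str, n_samples: int = 16) -> str:
    # Forward pass: Self-Consistency-style sampling plus majority counting.
    votes = Counter(
        call_llm(f"{question}\nLet's think step by step.")
        for _ in range(n_samples)
    )
    # Backward pass: keep only candidates the backward check supports,
    # falling back to the plain forward vote if none survive.
    verified = {a: c for a, c in votes.items() if backward_supports(question, a)}
    chosen = verified or votes
    return max(chosen, key=chosen.get)
```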
@@ -11614,7 +11614,7 @@
Ehsan Shareghi, Monash University and University of Cambridge
6662-6685
While Language Agents have achieved promising success by placing Large Language Models at the core of a more versatile design that dynamically interacts with the external world, the existing approaches neglect the notion of uncertainty during these interactions. We present the Uncertainty-Aware Language Agent (UALA), a framework that orchestrates the interaction between the agent and the external world using uncertainty quantification. Compared with other well-known counterparts like ReAct, our extensive experiments across 3 representative tasks (HotpotQA, StrategyQA, MMLU) and various LLM sizes demonstrate that UALA brings a significant improvement in performance, while having a substantially lower reliance on the external world (i.e., a reduced number of tool calls and tokens). Our analyses provide various insights, including the great potential of UALA compared with agent fine-tuning, and underscore the unreliability of verbalised confidence of LLMs as a proxy for uncertainty.
- 2024.findings-acl.398
+ 2024.findings-acl.398
han-etal-2024-towards
10.18653/v1/2024.findings-acl.398
@@ -11627,7 +11627,7 @@
Shiguang Ni, Tsinghua University
6686-6701
This research introduces a Positive Reconstruction Framework based on positive psychology theory. Overcoming negative thoughts can be challenging; our objective is to address and reframe them through positive reinterpretation. To tackle this challenge, a two-fold approach is necessary: identifying cognitive distortions and suggesting a positively reframed alternative while preserving the original thought’s meaning. Recent studies have investigated the application of Natural Language Processing (NLP) models in English for each stage of this process. In this study, we emphasize the theoretical foundation for the Positive Reconstruction Framework, grounded in broaden-and-build theory. We provide a shared corpus containing 4001 instances for detecting cognitive distortions and 1900 instances for positive reconstruction in Mandarin. Leveraging recent NLP techniques, including transfer learning, fine-tuning pretrained networks, and prompt engineering, we demonstrate the effectiveness of automated tools for both tasks. In summary, our study contributes to multilingual positive reconstruction, highlighting the effectiveness of NLP in cognitive distortion detection and positive reconstruction.
- 2024.findings-acl.399
+ 2024.findings-acl.399
lin-etal-2024-detection
10.18653/v1/2024.findings-acl.399
@@ -11640,7 +11640,7 @@
Ehsan Shareghi, Monash University and University of Cambridge
6702-6718
Large language models (LLMs) have shown great ability in solving various natural language tasks in different domains. Due to the training objective of LLMs and their pre-training data, LLMs are not very well equipped for tasks involving structured data generation. We propose a framework, Prompting with Iterative Verification (PiVe), to improve the graph-based generative capability of LLMs. We show how a small language model could be trained to act as a verifier module for the output of an LLM (e.g., ChatGPT, GPT-4), and to iteratively improve its performance via fine-grained corrective instructions. We also show how the verifier module could apply iterative corrections offline for a more cost-effective solution to the text-to-graph generation task. Experiments on three graph-based datasets show consistent improvement gained via PiVe. Additionally, we create GenWiki-HIQ and highlight that the verifier module can be used as a data augmentation tool to help improve the quality of automatically generated parallel text-graph datasets.
- 2024.findings-acl.400
+ 2024.findings-acl.400
han-etal-2024-pive
10.18653/v1/2024.findings-acl.400
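A minimal sketch of the PiVe loop as the abstract above describes it; `llm_graph` and `verifier_correction` are hypothetical stubs for the LLM call and the small trained verifier.

```python
from typing import Optional

Triple = tuple[str, str, str]  # (head, relation, tail)


def llm_graph(prompt: str) -> list[Triple]:
    # Hypothetical LLM call that returns the graph parsed from its output.
    raise NotImplementedError

def verifier_correction(text: str, graph: list[Triple]) -> Optional[str]:
    # Hypothetical small trained verifier: returns a fine-grained corrective
    # instruction (e.g., a missing triple) or None when the graph looks right.
    raise NotImplementedError


def pive(text: str, max_rounds: int = 3) -> list[Triple]:
    prompt = f"Extract a graph of (head, relation, tail) triples.\nText: {text}"
    graph = llm_graph(prompt)
    for _ in range(max_rounds):
        fix = verifier_correction(text, graph)
        if fix is None:
            break
        prompt += f"\nCorrection: {fix}"  # iterate with the verifier's feedback
        graph = llm_graph(prompt)
    return graph
```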
@@ -11655,7 +11655,7 @@
Dongsheng Li
6719-6734
Temporal knowledge graph question answering (TKGQA) poses a significant challenge due to the temporal constraints hidden in questions and the answers sought from dynamic structured knowledge. Although large language models (LLMs) have made considerable progress in their reasoning ability over structured data, their application to the TKGQA task is a relatively unexplored area. This paper proposes a novel generative temporal knowledge graph question answering framework, GenTKGQA, which guides LLMs to answer temporal questions through two phases: Subgraph Retrieval and Answer Generation. First, we exploit the LLM’s intrinsic knowledge to mine temporal constraints and structural links in the questions without extra training, thus narrowing down the subgraph search space in both temporal and structural dimensions. Next, we design virtual knowledge indicators to fuse the graph neural network signals of the subgraph and the text representations of the LLM in a non-shallow way, which helps the open-source LLM deeply understand the temporal order and structural dependencies among the retrieved facts through instruction tuning. Experimental results on two widely used datasets demonstrate the superiority of our model.
- 2024.findings-acl.401
+ 2024.findings-acl.401
gao-etal-2024-two
10.18653/v1/2024.findings-acl.401
@@ -11668,7 +11668,7 @@
Eric Nyberg, Carnegie Mellon University
6735-6752
Verifying a question’s validity before answering is crucial in real-world applications, where users may provide imperfect instructions. In this scenario, an ideal model should address the discrepancies in the query and convey them to the users rather than generating the best possible answer. Addressing this requirement, we introduce a new compositional visual question-answering dataset, VisReas, that consists of answerable and unanswerable visual queries formulated by traversing and perturbing commonalities and differences among objects, attributes, and relations. VisReas contains 2.07M semantically diverse queries generated automatically using Visual Genome scene graphs. The unique feature of this task, validating question answerability with respect to an image before answering, and the poor performance of state-of-the-art models inspired the design of a new modular baseline, Logic2Vision, which reasons by producing and executing pseudocode without any external modules to generate the answer. Logic2Vision outperforms generative models on VisReas (+4.82% over LLaVA-1.5; +12.23% over InstructBLIP) and achieves a significant gain in performance against the classification models.
- 2024.findings-acl.402
+ 2024.findings-acl.402
akter-etal-2024-visreas
10.18653/v1/2024.findings-acl.402
@@ -11683,7 +11683,7 @@
Ying Sha
6753-6766
Various euphemisms are emerging in social networks, attracting widespread attention from the natural language processing community. However, existing euphemism datasets are only domain-specific or language-specific. In addition, existing approaches to the study of euphemisms are one-sided: either only the euphemism detection task or only the euphemism identification task is accomplished, lacking a unified framework. To this end, we construct BME, a large-scale Bilingual Multi-category dataset of Euphemisms that covers a total of 12 categories for two languages, English and Chinese. We then propose JointEDI, a unified generative model that jointly conducts the tasks of bilingual Euphemism Detection and Identification. By comparing with LLMs and human evaluation, we demonstrate the effectiveness of the proposed JointEDI and the feasibility of unifying the euphemism detection and euphemism identification tasks. Moreover, the BME dataset also provides a new reference standard for euphemism detection and euphemism identification.
- 2024.findings-acl.403
+ 2024.findings-acl.403
hu-etal-2024-unified
10.18653/v1/2024.findings-acl.403
@@ -11700,7 +11700,7 @@
Qingming Huang, University of Chinese Academy of Sciences
6767-6779
Given a script, the challenge in Movie Dubbing (Visual Voice Cloning, V2C) is to generate speech that aligns well with the video in both time and emotion, based on the tone of a reference audio track. Existing state-of-the-art V2C models break the phonemes in the script according to the divisions between video frames, which solves the temporal alignment problem but leads to incomplete phoneme pronunciation and poor identity stability. To address this problem, we propose StyleDubber, which switches dubbing learning from the frame level to the phoneme level. It contains three main components: (1) a multimodal style adaptor operating at the phoneme level to learn pronunciation style from the reference audio, and generate intermediate representations informed by the facial emotion presented in the video; (2) an utterance-level style learning module, which guides both the mel-spectrogram decoding and the refining processes from the intermediate embeddings to improve the overall style expression; and (3) a phoneme-guided lip aligner to maintain lip sync. Extensive experiments on two of the primary benchmarks, V2C and Grid, demonstrate the favorable performance of the proposed method as compared to the current state-of-the-art. The code will be made available at https://github.com/GalaxyCong/StyleDubber.
- 2024.findings-acl.404
+ 2024.findings-acl.404
cong-etal-2024-styledubber
10.18653/v1/2024.findings-acl.404
@@ -11710,7 +11710,7 @@
Yong Liu, Renmin University of China and Institute of Information Engineering, CAS
6780-6795
Transformer Architecture Search (TAS) methods aim to automate searching for the optimal Transformer architecture configurations for a given task. However, they are impeded by the prohibitive cost of evaluating Transformer architectures. Recently, several Zero-Shot TAS methods have been proposed to mitigate this problem by utilizing zero-cost proxies to evaluate Transformer architectures without training. Unfortunately, they are limited to specific computer vision or natural language processing tasks. Moreover, most of them are developed based on empirical observations and lack theoretical guarantees. To solve this problem, we develop a new zero-cost proxy called NTSR that combines two theoretically-inspired indicators to measure the trainability and expressivity of Transformer networks separately. We then integrate it into an effective regularized evolution framework called ETAS to demonstrate its efficacy on various tasks. The results show that our proposed NTSR proxy can consistently achieve a higher correlation with the true performance of Transformer networks on both computer vision and natural language processing tasks. Further, it can significantly accelerate the search process for finding the best-performing Transformer architecture configurations.
- 2024.findings-acl.405
+ 2024.findings-acl.405
yang-liu-2024-etas
10.18653/v1/2024.findings-acl.405
@@ -11724,7 +11724,7 @@
Wenjie Li, The Hong Kong Polytechnic University
6796-6814
Medical dialogue systems have attracted significant attention for their potential to act as medical assistants. Enabling these medical systems to emulate clinicians’ diagnostic reasoning process has been the long-standing research focus. Previous studies realized only a rudimentary simulation of clinicians’ diagnostic processes by fine-tuning language models on high-quality dialogue datasets. Nonetheless, they overly focus on the outcomes of the clinician’s reasoning process while ignoring their internal thought processes and alignment with clinician preferences. Our work aims to build a medical dialogue system that aligns with clinicians’ diagnostic reasoning processes. We propose a novel framework, Emulation, designed to generate an appropriate response that relies on abductive and deductive diagnostic reasoning analyses and aligns with clinician preferences through thought process modeling. Experimental results on two datasets confirm the efficacy of Emulation. Crucially, our framework furnishes clear explanations for the generated responses, enhancing its transparency in medical consultations.
- 2024.findings-acl.406
+ 2024.findings-acl.406
xu-etal-2024-reasoning
10.18653/v1/2024.findings-acl.406
@@ -11745,7 +11745,7 @@
Bo Zheng, Alibaba Group
6815-6839
This paper introduces ConceptMath, a bilingual (English and Chinese), fine-grained benchmark that evaluates concept-wise mathematical reasoning of Large Language Models (LLMs). Unlike traditional benchmarks that evaluate general mathematical reasoning with an average accuracy, ConceptMath systematically organizes math problems under a hierarchy of math concepts, so that mathematical reasoning can be evaluated at different granularities with concept-wise accuracies. Based on ConceptMath, we then evaluate a broad range of LLMs, and we observe that existing LLMs, though achieving high average accuracies on traditional benchmarks, exhibit significant performance variations across different math concepts and may even fail catastrophically on the most basic ones. We also introduce an efficient fine-tuning strategy to address the weaknesses of existing LLMs. Finally, we hope ConceptMath can guide developers to understand the fine-grained mathematical abilities of their models and facilitate the growth of foundation models. Code is available at https://github.com/conceptmath/conceptmath.
- 2024.findings-acl.407
+ 2024.findings-acl.407
wu-etal-2024-conceptmath
10.18653/v1/2024.findings-acl.407
@@ -11760,7 +11760,7 @@
Le Sun, Institute of Software, Chinese Academy of Sciences
6840-6856
Manually annotating instruction data for large language models is difficult, costly, and hard to scale. Meanwhile, current automatic annotation methods typically rely on distilling synthetic data from proprietary LLMs, which not only limits the upper bound of the quality of the instruction data but also raises potential copyright issues. In this paper, we propose REInstruct, a simple and scalable method to automatically build instruction data from an unlabeled corpus without heavy reliance on proprietary LLMs and human annotation. Specifically, REInstruct first selects a subset of unlabeled texts that potentially contain well-structured, helpful, and insightful content and then generates instructions for these texts. To generate accurate and relevant responses for effective and robust training, REInstruct further proposes a rewriting-based approach to improve the quality of the generated instruction data. By training Llama-7b on a combination of 3k seed data and 32k synthetic data from REInstruct, the fine-tuned model achieves a 65.41% win rate on the AlpacaEval leaderboard against text-davinci-003, outperforming other open-source, non-distilled instruction data construction methods. The code is publicly available at https://github.com/cs32963/REInstruct.
- 2024.findings-acl.408
+ 2024.findings-acl.408
chen-etal-2024-reinstruct
10.18653/v1/2024.findings-acl.408
@@ -11774,7 +11774,7 @@
Ke Ding, Intel
6857-6868
Knowledge distillation, the technique of transferring knowledge from large, complex models to smaller ones, marks a pivotal step towards efficient AI deployment. Distilling Step-by-Step (DSS), a novel method utilizing chain-of-thought (CoT) distillation, has demonstrated promise by imbuing smaller models with the superior reasoning capabilities of their larger counterparts. In DSS, the distilled model acquires the ability to generate rationales and predict labels concurrently through a multi-task learning framework. However, DSS overlooks the intrinsic relationship between the two training tasks, leading to ineffective integration of CoT knowledge with the task of label prediction. To this end, we investigate the mutual relationship of the two tasks from Information Bottleneck perspective and formulate it as maximizing the mutual information of the representation features of the two tasks. We propose a variational approach to solve this optimization problem using a learning-based method. Our experimental results across four datasets demonstrate that our method outperforms the state-of-the-art DSS. Our findings offer insightful guidance for future research on language model distillation as well as applications involving CoT. Codes are available at https://github.com/xinchen9/cot_distillation_ACL2024.
- 2024.findings-acl.409
+ 2024.findings-acl.409
chen-etal-2024-learning-maximize
10.18653/v1/2024.findings-acl.409
@@ -11788,7 +11788,7 @@
Jianling Sun
6869-6883
Parameter-efficient fine-tuning (PEFT) has emerged as an effective method for adapting pre-trained language models to various tasks efficiently. Recently, there has been a growing interest in transferring knowledge from one or multiple tasks to the downstream target task to achieve performance improvements. However, current approaches typically either train adapters on individual tasks or distill shared knowledge from source tasks, failing to fully exploit task-specific knowledge and the correlation between source and target tasks. To overcome these limitations, we propose PEMT, a novel parameter-efficient fine-tuning framework based on multi-task transfer learning. PEMT extends the mixture-of-experts (MoE) framework to capture the transferable knowledge as a weighted combination of adapters trained on source tasks. These weights are determined by a gated unit, measuring the correlation between the target and each source task using task description prompt vectors. To fully exploit the task-specific knowledge, we also propose the Task Sparsity Loss to improve the sparsity of the gated unit. We conduct experiments on a broad range of tasks over 17 datasets. The experimental results demonstrate that PEMT yields stable improvements over full fine-tuning and over state-of-the-art PEFT and knowledge-transfer methods on various tasks. The results highlight the effectiveness of our method, which is capable of sufficiently exploiting the knowledge and correlation features across multiple tasks.
- 2024.findings-acl.410
+ 2024.findings-acl.410
lin-etal-2024-pemt
10.18653/v1/2024.findings-acl.410
@@ -11807,7 +11807,7 @@
Kai Chen, Shanghai AI Laboratory
6884-6915
Recent advancements in large language models (LLMs) have showcased significant improvements in mathematics. However, traditional math benchmarks like GSM8k offer a unidimensional perspective, which falls short in providing a holistic assessment of the LLMs’ math capabilities. To address this gap, we introduce MathBench, a new benchmark that rigorously assesses the mathematical capabilities of large language models. MathBench spans a wide range of mathematical disciplines, offering a detailed evaluation of both theoretical understanding and practical problem-solving skills. The benchmark progresses through five distinct stages, from basic arithmetic to college mathematics, and is structured to evaluate models at various depths of knowledge. Each stage includes theoretical questions and application problems, allowing us to measure a model’s mathematical proficiency and its ability to apply concepts in practical scenarios. MathBench aims to enhance the evaluation of LLMs’ mathematical abilities, providing a nuanced view of their knowledge understanding levels and problem-solving skills in a bilingual context.
- 2024.findings-acl.411
+ 2024.findings-acl.411
liu-etal-2024-mathbench
10.18653/v1/2024.findings-acl.411
@@ -11822,7 +11822,7 @@
Dahua Lin, The Chinese University of Hong Kong
6916-6932
Although large language models (LLMs) have demonstrated remarkable performance, the lack of transparency in their inference logic raises concerns about their trustworthiness. To gain a better understanding of LLMs, we conduct a detailed analysis of the operations of attention heads and aim to better understand the in-context learning of LLMs. Specifically, we investigate whether attention heads encode two types of relationships between tokens present in natural languages: the syntactic dependency parsed from sentences and the relation within knowledge graphs. We find that certain attention heads exhibit a pattern where, when attending to subject tokens, they recall object tokens and increase the output logits of those object tokens. More crucially, the formation of such semantic induction heads has a close correlation with the emergence of the in-context learning ability of language models. The study of semantic attention heads advances our understanding of the intricate operations of attention heads in transformers, and further provides new insights into the in-context learning of LLMs.
- 2024.findings-acl.412
+ 2024.findings-acl.412
ren-etal-2024-identifying
10.18653/v1/2024.findings-acl.412
@@ -11835,7 +11835,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
6933-6943
This paper addresses Chinese spelling correction by means of self-supervised learning, meaning there are no annotated errors within the training data. Our intuition is that humans are naturally good correctors with exposure to error-free sentences, which contrasts with current unsupervised methods that rely strongly on confusion sets to produce parallel sentences. In this paper, we demonstrate that learning a spelling correction model is identical to learning a language model from error-free data alone, combined with decoding over a larger search space. We propose Denoising Decoding Correction (D2C), which selectively imposes noise upon the source sentence to determine the underlying correct characters. Our method is largely inspired by the ability of language models to perform correction, including both BERT-based models and large language models (LLMs). We show that this self-supervised learning manner generally outperforms the confusion set in specific domains because it bypasses the need to introduce error characters into the training data, which can impair coverage of error patterns not included among the introduced error characters.
- 2024.findings-acl.413
+ 2024.findings-acl.413
jiang-etal-2024-chinese
10.18653/v1/2024.findings-acl.413
@@ -11850,7 +11850,7 @@
Tieniu Tan, Institute of Automation, Chinese Academy of Sciences
6944-6962
Object hallucination has been an Achilles’ heel which hinders the broader applications of large vision-language models (LVLMs). Object hallucination refers to the phenomenon that LVLMs claim non-existent objects in the image. To mitigate object hallucinations, instruction tuning and external model-based detection methods have been proposed, which either require large-scale computational resources or depend on the detection results of external models. However, utilizing the LVLM itself to alleviate object hallucinations remains under-explored. In this work, we adopt the intuition that the LVLM tends to respond logically consistently for existent objects but inconsistently for hallucinated objects. Therefore, we propose a Logical Closed Loop-based framework for Object Hallucination Detection and Mitigation, namely \textbf{LogicCheckGPT}. Specifically, we devise logical consistency probing to raise questions with logical correlations, inquiring about attributes from objects and vice versa. Whether their responses can form a logical closed loop serves as an indicator of object hallucination. As a plug-and-play method, it can be seamlessly applied to all existing LVLMs. Comprehensive experiments conducted on three benchmarks across four LVLMs have demonstrated significant improvements brought by our method, indicating its effectiveness and generality.
- 2024.findings-acl.414
+ 2024.findings-acl.414
wu-etal-2024-logical
10.18653/v1/2024.findings-acl.414
@@ -11862,7 +11862,7 @@
Ling Chen, University of Technology Sydney
6963-6975
Adaptive retrieval-augmented generation (ARAG) aims to dynamically determine the necessity of retrieval for queries instead of retrieving indiscriminately to enhance the efficiency and relevance of the sourced information. However, previous works largely overlook the evaluation of ARAG approaches, leading to their effectiveness being understudied. This work presents a benchmark, RetrievalQA, comprising 1,271 short-form questions covering new world and long-tail knowledge. The knowledge necessary to answer the questions is absent from LLMs; therefore, external information must be retrieved to answer correctly. This makes RetrievalQA a suitable testbed to evaluate existing ARAG methods. We observe that calibration-based methods heavily rely on threshold tuning, while vanilla prompting is inadequate for guiding LLMs to make reliable retrieval decisions. Based on our findings, we propose Time-Aware Adaptive Retrieval (TA-ARE), a simple yet effective method that helps LLMs assess the necessity of retrieval without calibration or additional training.
- 2024.findings-acl.415
+ 2024.findings-acl.415
zhang-etal-2024-retrievalqa
10.18653/v1/2024.findings-acl.415
@@ -11875,7 +11875,7 @@
Satoshi Nakamura, The Chinese University of Hong Kong
6976-6987
We introduce ***LLaST***, a framework for building high-performance Large Language model based Speech-to-text Translation systems. We address the limitations of end-to-end speech translation (E2E ST) models by exploring model architecture design and optimization techniques tailored for LLMs. Our approach includes LLM-based speech translation architecture design, ASR-augmented training, multilingual data augmentation, and dual-LoRA optimization. Our approach demonstrates superior performance on the CoVoST-2 benchmark and showcases exceptional scaling capabilities powered by LLMs. We believe this effective method will serve as a strong baseline for speech translation and provide insights for future improvements of the LLM-based speech translation framework.
- 2024.findings-acl.416
+ 2024.findings-acl.416
chen-etal-2024-llast
10.18653/v1/2024.findings-acl.416
@@ -11886,7 +11886,7 @@
Yan Yang, East China Normal University
6988-7005
Data augmentation methods have been a promising direction to improve the performance of small models for low-resource dialogue state tracking. However, traditional methods rely on pre-defined user goals and neglect the importance of data complexity in this task. In this paper, we propose EDZ-DA, an Easy-to-Difficult Zero-shot Data Augmentation framework for low-resource dialogue state tracking that utilizes large language models to automatically capture the relationships of different domains and then generate the dialogue data. We also complicate the dialogues based on the domain relation to enhance the model’s capability for co-reference slot tracking. Furthermore, we permute slot values to mitigate the influence of output orders and the problem of incomplete value generation. Experimental results illustrate the superiority of our proposed method compared to previous strong data augmentation baselines on MultiWOZ.
- 2024.findings-acl.417
+ 2024.findings-acl.417
gu-yang-2024-plan
10.18653/v1/2024.findings-acl.417
@@ -11896,7 +11896,7 @@
Shanghaoran Quan
7006-7028
The performance of the reward model (RM) is a critical factor in improving the effectiveness of the large language model (LLM) during alignment fine-tuning. There remain two challenges in RM training: 1) training the same RM using various categories of data may cause its generalization performance to suffer from multi-task disturbance, and 2) the human annotation consistency rate is generally only 60% to 75%, causing training data to contain a lot of noise. To tackle these two challenges, we introduced the idea of Mixture-of-Experts (MoE) into the field of RM for the first time. We propose the Double-Layer MoE RM (DMoERM). The outer layer MoE is a sparse model. After classifying an input into task categories, we route it to the corresponding inner layer task-specific model. The inner layer MoE is a dense model. We decompose the specific task into multiple capability dimensions and individually fine-tune a LoRA expert on each one. Their outputs are then synthesized by an MLP to compute the final rewards. To minimize costs, we call a public LLM API to obtain the capability preference labels. The validation on manually labeled datasets confirms that our model attains superior consistency with human preference and outstrips advanced generative approaches. Meanwhile, through BoN sampling and RL experiments, we demonstrate that our model outperforms state-of-the-art ensemble methods of RM and mitigates the overoptimization problem. Our code is available at: https://github.com/quanshr/DMoERM.
- 2024.findings-acl.418
+ 2024.findings-acl.418
quan-2024-dmoerm
10.18653/v1/2024.findings-acl.418
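A minimal sketch of the double-layer routing idea described in the DMoERM abstract above; the module sizes, the hard task router, and the linear capability scorers are illustrative stand-ins rather than the paper's LoRA experts:

```python
import torch
import torch.nn as nn

class ToyDMoERM(nn.Module):
    """Sketch: an outer sparse router picks a task-specific expert; each expert
    scores several capability dimensions whose outputs an MLP combines."""

    def __init__(self, hidden=64, num_tasks=3, num_capabilities=4):
        super().__init__()
        self.router = nn.Linear(hidden, num_tasks)  # outer MoE: task classification
        self.experts = nn.ModuleList([               # inner MoE: one per task
            nn.ModuleDict({
                "capabilities": nn.ModuleList(
                    [nn.Linear(hidden, 1) for _ in range(num_capabilities)]
                ),
                "combine": nn.Sequential(
                    nn.Linear(num_capabilities, 16), nn.ReLU(), nn.Linear(16, 1)
                ),
            })
            for _ in range(num_tasks)
        ])

    def forward(self, h):                            # h: (batch, hidden) pooled features
        task = self.router(h).argmax(dim=-1)         # sparse routing: hard task choice
        rewards = []
        for i, t in enumerate(task.tolist()):
            expert = self.experts[t]
            dims = torch.cat([c(h[i : i + 1]) for c in expert["capabilities"]], dim=-1)
            rewards.append(expert["combine"](dims))  # synthesize the final scalar reward
        return torch.cat(rewards, dim=0)

if __name__ == "__main__":
    rm = ToyDMoERM()
    print(rm(torch.randn(2, 64)).shape)  # torch.Size([2, 1])
```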
@@ -11906,7 +11906,7 @@
Ryokan Ri, SB Intuitions
7029-7039
Adapting English-based large language models (LLMs) to other languages has become increasingly popular due to the efficiency and potential of cross-lingual transfer. However, existing language adaptation methods often overlook the benefits of cross-lingual supervision. In this study, we introduce LEIA, a language adaptation tuning method that utilizes Wikipedia entity names aligned across languages. This method involves augmenting the target language corpus with English entity names and training the model using left-to-right language modeling. We assess LEIA on diverse question answering datasets using 7B-parameter LLMs, demonstrating significant performance gains across various non-English languages.
- 2024.findings-acl.419
+ 2024.findings-acl.419
yamada-ri-2024-leia
10.18653/v1/2024.findings-acl.419
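A sketch of the corpus-augmentation step that LEIA describes, assuming a toy alignment dictionary in place of Wikipedia inter-language links and an invented insertion format:

```python
# Sketch of LEIA-style augmentation: insert the aligned English entity name
# after each target-language mention, then train with ordinary left-to-right
# language modeling. The dictionary is a stand-in for inter-language links.
ENTITY_ALIGNMENT = {
    "Mondlandung": "Moon landing",
    "Sowjetunion": "Soviet Union",
}

def augment(text: str) -> str:
    for mention, english in ENTITY_ALIGNMENT.items():
        # Hypothetical format: keep the original mention, append the English name.
        text = text.replace(mention, f"{mention} ({english})")
    return text

print(augment("Die Mondlandung überraschte die Sowjetunion."))
```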
@@ -11920,7 +11920,7 @@
Jie Zhou
7040-7051
Code generation aims to understand the problem description and generate corresponding code snippets, where existing works generally decompose such complex tasks into intermediate steps by prompting strategies, such as Chain-of-Thought and its variants. While these studies have achieved some success, their effectiveness is highly dependent on the capabilities of advanced Large Language Models (LLMs) such as GPT-4, particularly in terms of API calls, which significantly limits their practical applicability. Consequently, how to enhance the code generation capabilities of small and medium-scale code LLMs without significantly increasing training costs is an appealing challenge. In this paper, we suggest that code comments are the natural logic pivot between natural language and code language and propose using comments to boost the code generation ability of code LLMs. Concretely, we propose MANGO (comMents As Natural loGic pivOts), including a comment contrastive training strategy and a corresponding logical comment decoding strategy. Experiments are performed on HumanEval and MBPP, utilizing StarCoder and WizardCoder as backbone models, and encompassing model parameter sizes between 3B and 7B. The results indicate that MANGO significantly improves the code pass rate over the strong baselines. Meanwhile, the robustness of the logical comment decoding strategy is notably higher than that of Chain-of-Thought prompting.
- 2024.findings-acl.420
+ 2024.findings-acl.420
chen-etal-2024-comments
10.18653/v1/2024.findings-acl.420
@@ -11938,7 +11938,7 @@
Ji-Rong Wen, Renmin University of China
7052-7074
The proliferation of Large Language Models (LLMs) has led to an influx of AI-generated content (AIGC) on the internet, transforming the corpus of Information Retrieval (IR) systems from solely human-written to a coexistence with LLM-generated content. The impact of this surge in AIGC on IR systems remains an open question, with the primary challenge being the lack of a dedicated benchmark for researchers. In this paper, we introduce Cocktail, a comprehensive benchmark tailored for evaluating IR models in this mixed-sourced data landscape of the LLM era. Cocktail consists of 16 diverse datasets with mixed human-written and LLM-generated corpora across various text retrieval tasks and domains. Additionally, to avoid the potential bias from previously included dataset information in LLMs, we also introduce an up-to-date dataset, named NQ-UTD, with queries derived from recent events. Through conducting over 1,000 experiments to assess state-of-the-art retrieval models against the benchmarked datasets in Cocktail, we uncover a clear trade-off between ranking performance and source bias in neural retrieval models, highlighting the necessity for a balanced approach in designing future IR systems. We hope Cocktail can serve as a foundational resource for IR research in the LLM era, with all data and code publicly available at https://github.com/KID-22/Cocktail.
- 2024.findings-acl.421
+ 2024.findings-acl.421
dai-etal-2024-cocktail
10.18653/v1/2024.findings-acl.421
@@ -11953,7 +11953,7 @@
Albert Lam, University of Hong Kong and Fano Labs
7075-7087
An ideal dialogue system requires continuous skill acquisition and adaptation to new tasks while retaining prior knowledge. Dialogue State Tracking (DST), vital in these systems, often involves learning new services, confronting catastrophic forgetting and a critical capability loss termed the “Value Selection Quandary”. To address these challenges, we introduce the Reason-of-Select (RoS) distillation method by enhancing smaller models with a novel “meta-reasoning” capability. Meta-reasoning, employing an enhanced multi-domain perspective, combines fragments of meta-knowledge from domain-specific dialogues during continual learning, transcending traditional single-perspective reasoning. This domain bootstrapping process enhances the model’s ability to dissect intricate dialogues from multiple possible values, and its domain-agnostic property aligns data distribution across different domains, effectively mitigating forgetting. Besides, two novel improvements, “multi-value resolution” strategy and Semantic Contrastive Reasoning Selection method, significantly enhance RoS by generating DST-specific selection chains and mitigating hallucinations in teachers’ reasoning, ensuring effective and reliable knowledge transfer. Extensive experiments validate the exceptional performance and robust generalization capabilities of our method.
- 2024.findings-acl.422
+ 2024.findings-acl.422
feng-etal-2024-continual
10.18653/v1/2024.findings-acl.422
@@ -11968,7 +11968,7 @@
Yue Zhang, Westlake University
7088-7107
AI-generated text detection has attracted increasing attention as powerful language models approach human-level generation. Limited work is devoted to detecting (partially) AI-paraphrased texts. However, AI paraphrasing is commonly employed in various application scenarios for text refinement and diversity. To this end, we propose a novel detection framework, paraphrased text span detection (PTD), aiming to identify paraphrased text spans within a text. Different from text-level detection, PTD takes in the full text and assigns each sentence a score indicating the paraphrasing degree. We construct a dedicated dataset, PASTED, for paraphrased text span detection. Both in-distribution and out-of-distribution results demonstrate the effectiveness of PTD models in identifying AI-paraphrased text spans. Statistical and model analysis explains the crucial role of the surrounding context of the paraphrased text spans. Extensive experiments show that PTD models can generalize to versatile paraphrasing prompts as well as multiple paraphrased text spans.
- 2024.findings-acl.423
+ 2024.findings-acl.423
li-etal-2024-spotting
10.18653/v1/2024.findings-acl.423
@@ -11985,7 +11985,7 @@
Yongbin Li, Alibaba Group
7108-7136
The alignment problem in Large Language Models (LLMs) involves adapting them to the broad spectrum of human values. This requirement challenges existing alignment methods due to the diversity of preferences and regulatory standards. This paper introduces a novel alignment paradigm, priority rule following, which defines rules as the primary control mechanism in each dialog, prioritizing them over user instructions. Our preliminary analysis reveals that even advanced LLMs, such as GPT-4, exhibit shortcomings in understanding and prioritizing the rules. Therefore, we present PriorityDistill, a semi-automated approach for distilling priority following signals from LLM simulations to ensure robust rule integration and adherence. Our experiments show that this method not only effectively minimizes misalignments utilizing only one general rule but also adapts smoothly to various unseen rules, ensuring they are shielded from hijacking and that the model responds appropriately.
- 2024.findings-acl.424
+ 2024.findings-acl.424
lu-etal-2024-sofa
10.18653/v1/2024.findings-acl.424
@@ -11995,7 +11995,7 @@
Gabriel Stanovsky, Hebrew University of Jerusalem
7137-7143
Recent advances in LLMs have sparked a debate on whether they understand text. In this position paper, we argue that opponents in this debate hold different definitions for understanding, and particularly differ in their view on the role of consciousness. To substantiate this claim, we propose a thought experiment involving an open-source chatbot Z which excels on every possible benchmark, seemingly without subjective experience. We ask whether Z is capable of understanding, and show that different schools of thought within seminal AI research seem to answer this question differently, uncovering their terminological disagreement. Moving forward, we propose two distinct working definitions for understanding which explicitly acknowledge the question of consciousness, and draw connections with a rich literature in philosophy, psychology and neuroscience.
- 2024.findings-acl.425
+ 2024.findings-acl.425
goldstein-stanovsky-2024-zombies
10.18653/v1/2024.findings-acl.425
@@ -12009,7 +12009,7 @@
Björn Schuller, Technische Universität München and Imperial College London
7144-7159
Telling stories is an integral part of human communication which can evoke emotions and influence the affective states of the audience. Automatically modeling emotional trajectories in stories has thus attracted considerable scholarly interest. However, as most existing works have been limited to unsupervised dictionary-based approaches, there is no benchmark for this task. We address this gap by introducing continuous valence and arousal labels for an existing dataset of children’s stories originally annotated with discrete emotion categories. We collect additional annotations for this data and map the categorical labels to the continuous valence and arousal space. For predicting the thus obtained emotionality signals, we fine-tune a DeBERTa model and improve upon this baseline via a weakly supervised learning approach. The best configuration achieves a Concordance Correlation Coefficient (CCC) of .8221 for valence and .7125 for arousal on the test set, demonstrating the efficacy of our proposed approach. A detailed analysis shows the extent to which the results vary depending on factors such as the author, the individual story, or the section within the story. In addition, we uncover the weaknesses of our approach by investigating examples that prove to be difficult to predict.
- 2024.findings-acl.426
+ 2024.findings-acl.426
christ-etal-2024-modeling
10.18653/v1/2024.findings-acl.426
@@ -12027,7 +12027,7 @@
Ge Li, Peking University Shenzhen Graduate School
7160-7174
Text-Video Retrieval (TVR) aims to align relevant video content with natural language queries. To date, most of the state-of-the-art TVR methods learn image-to-video transfer learning based on the large-scale pre-trained vision-language models (e.g., CLIP). However, fully fine-tuning these pre-trained models for TVR incurs prohibitively expensive computation cost. To this end, we propose to conduct efficient text-video Retrieval with a salient-and-correlated AdaPter (RAP), i.e., fine-tuning the pre-trained model with a few parameterized layers. To accommodate the text-video scenario, we equip our RAP with two indispensable characteristics including temporal sparsity and correlation. Specifically, we propose a low-rank modulation module to refine the per-image features from the frozen CLIP backbone, which accentuates salient frames within the video features while alleviating temporal redundancy. Besides, we introduce an asynchronous self-attention mechanism which first selects the top responsive visual patches and augments the correlation modeling between them with learnable temporal and patch offsets. Extensive experiments on four TVR datasets demonstrate that our RAP achieves superior or comparable performance compared to the fully fine-tuned counterpart and other parameter-efficient finetuning methods.
- 2024.findings-acl.427
+ 2024.findings-acl.427
cao-etal-2024-rap
10.18653/v1/2024.findings-acl.427
@@ -12045,7 +12045,7 @@
Zhaopeng Tu, Tencent AI Lab
7175-7187
Recent studies have illuminated the promising capabilities of large language models (LLMs) in handling long texts. However, their performance in machine translation (MT) of long documents remains underexplored. This paper aims to shed light on how LLMs navigate this complex task, offering a comprehensive evaluation of their capabilities and limitations in long-text MT. First, we collect and construct an instruction-based benchmark dataset, specifically designed for the finetuning and evaluation of LLMs, encompassing multilingual, multi-domain, and document-level parallel data. Second, we conduct a comprehensive comparison between MT and LLM models concerning document-level translation. Our analysis uncovers that LLMs exhibit shortcomings in long-text domains, and their performance diminishes as document size escalates. By exploiting various extrapolation strategies, we enhance the capacity of LLMs to translate longer texts. We release data, code, and models at https://github.com/longyuewangdcu/Document-MT-LLM.
- 2024.findings-acl.428
+ 2024.findings-acl.428
wang-etal-2024-benchmarking
10.18653/v1/2024.findings-acl.428
@@ -12060,7 +12060,7 @@
Dangyang Chen, Pingan Technology
7188-7202
Recently, the topic-grounded dialogue (TGD) system has become increasingly popular as its powerful capability to actively guide users to accomplish specific tasks through topic-guided conversations. Most existing works utilize side information (e.g., topics or personas) in isolation to enhance the topic selection ability. However, due to disregarding the noise within these auxiliary information sources and their mutual influence, current models tend to predict user-uninteresting and contextually irrelevant topics. To build a user-engaging and coherent dialogue agent, we propose a personalized topic selection model for topic-grounded dialogue, named PETD, which takes into account the interaction of side information to selectively aggregate such information for more accurately predicting subsequent topics. Specifically, we evaluate the correlation between global topics and personas and selectively incorporate the global topics aligned with user personas. Furthermore, we propose a contrastive learning based persona selector to filter relevant personas under the constraint of lacking pertinent persona annotations. Throughout the selection and generation, diverse relevant side information is considered. Extensive experiments demonstrate that our proposed method can generate engaging and diverse responses, outperforming state-of-the-art baselines across various evaluation metrics.
- 2024.findings-acl.429
+ 2024.findings-acl.429
fan-etal-2024-personalized
10.18653/v1/2024.findings-acl.429
@@ -12079,7 +12079,7 @@
Le Sun, Institute of Software, Chinese Academy of Sciences
7203-7215
In-context learning (ICL) has gained considerable attention due to its data efficiency and task adaptability. Unfortunately, ICL suffers from the demonstration bias, i.e., its performance and robustness are severely affected by the selection and ordering of demonstrations. In this paper, we identify that such demonstration bias may primarily stem from the semantic ambiguity induced by demonstrations, i.e., a demonstration may indicate multiple input-to-label mappings and its mapping can be interpreted differently in different contexts by LLMs. Such semantic ambiguity disrupts task comprehension during ICL and results in performance fluctuations. To resolve the semantic ambiguity problem, this paper further proposes two de-biasing strategies to mitigate demonstration bias in in-context learning. Experiments on six datasets show that our methods can effectively alleviate demonstration bias and significantly improve task performance.
- 2024.findings-acl.430
+ 2024.findings-acl.430
li-etal-2024-debiasing
10.18653/v1/2024.findings-acl.430
@@ -12090,7 +12090,7 @@
Ion Androutsopoulos, Athens University of Economics and Business
7216-7240
Creating effective and reliable task-oriented dialog systems (ToDSs) is challenging, not only because of the complex structure of these systems, but also due to the scarcity of training data, especially when several modules need to be trained separately, each one with its own input/output training examples. Data augmentation (DA), whereby synthetic training examples are added to the training data, has been successful in other NLP systems, but has not been explored as extensively in ToDSs. We empirically evaluate the effectiveness of DA methods in an end-to-end ToDS setting, where a single system is trained to handle all processing stages, from user inputs to system outputs. We experiment with two ToDSs (UBAR, GALAXY) on two datasets (MultiWOZ, KVRET). We consider three types of DA methods (word-level, sentence-level, dialog-level), comparing eight DA methods that have shown promising results in ToDSs and other NLP systems. We show that all DA methods considered are beneficial, and we highlight the best ones, also providing advice to practitioners. We also introduce a more challenging few-shot cross-domain ToDS setting, reaching similar conclusions.
- 2024.findings-acl.431
+ 2024.findings-acl.431
vlachos-etal-2024-comparing
10.18653/v1/2024.findings-acl.431
@@ -12103,7 +12103,7 @@
Feng Zheng, Southern University of Science and Technology
7241-7254
Sign language understanding has made significant strides; however, there is still no viable solution for generating sign sequences directly from entire spoken content, e.g., text or speech. In this paper, we propose a unified framework for continuous sign language production, easing communication between sign and non-sign language users. In particular, a sequence diffusion model, utilizing embeddings extracted from text or speech, is crafted to generate sign predictions step by step. Moreover, by creating a joint embedding space for text, audio, and sign, we bind these modalities and leverage the semantic consistency among them to provide informative feedback for the model training. This embedding-consistency learning strategy minimizes the reliance on sign triplets and ensures continuous model refinement, even with a missing audio modality. Experiments on How2Sign and PHOENIX14T datasets demonstrate that our model achieves competitive performance in sign language production.
- 2024.findings-acl.432
+ 2024.findings-acl.432
ma-etal-2024-ms2sl
10.18653/v1/2024.findings-acl.432
@@ -12120,7 +12120,7 @@
Lingpeng Kong, Department of Computer Science, The University of Hong Kong
7255-7279
Multimodal reasoning stands as a pivotal capability for large vision-language models (LVLMs). The integration with Domain-Specific Languages (DSL), offering precise visual representations, equips these models with the opportunity to execute more accurate reasoning in complex and professional domains. However, the vanilla Chain-of-Thought (CoT) prompting method faces challenges in effectively leveraging the unique strengths of visual and DSL representations, primarily due to their differing reasoning mechanisms. Additionally, it often falls short in addressing critical steps in multi-step reasoning tasks. To mitigate these challenges, we introduce the Bi-Modal Behavioral Alignment (BBA) prompting method, designed to maximize the potential of DSL in augmenting complex multi-modal reasoning tasks. This method initiates by guiding LVLMs to create separate reasoning chains for visual and DSL representations. Subsequently, it aligns these chains by addressing any inconsistencies, thus achieving a cohesive integration of behaviors from different modalities. Our experiments demonstrate that BBA substantially improves the performance of GPT-4V(ision) on geometry problem solving (28.34% → 34.22%), chess positional advantage prediction (42.08% → 46.99%) and molecular property prediction (77.47% → 83.52%).
- 2024.findings-acl.433
+ 2024.findings-acl.433
zhao-etal-2024-bba
10.18653/v1/2024.findings-acl.433
@@ -12135,7 +12135,7 @@
JingBo Zhu, Northeastern University
7280-7294
The design choices in Transformer feed-forward neural networks have resulted in significant computational and parameter overhead. In this work, we emphasize the importance of hidden dimensions in designing lightweight FFNs, a factor often overlooked in previous architectures. Guided by this principle, we introduce PartialFormer, a parameter-efficient Transformer architecture utilizing multiple smaller FFNs to reduce parameters and computation while maintaining essential hidden dimensions. These smaller FFNs are integrated into a multi-head attention mechanism for effective collaboration. We also propose a tailored head scaling strategy to enhance PartialFormer’s capabilities. Furthermore, we present a residual-like attention calculation to improve depth scaling within PartialFormer. Extensive experiments on 9 translation tasks and 1 abstractive summarization task validate the effectiveness of our PartialFormer approach on machine translation and summarization tasks. Our code will be available at: https://github.com/zhengkid/PartialFormer.
- 2024.findings-acl.434
+ 2024.findings-acl.434
zheng-etal-2024-partialformer
10.18653/v1/2024.findings-acl.434
@@ -12150,7 +12150,7 @@
Dongha Lee, Yonsei University
7295-7303
In the task of aspect sentiment quad prediction (ASQP), generative methods for predicting sentiment quads have shown promising results. However, they still suffer from imprecise predictions and limited interpretability, caused by data scarcity and inadequate modeling of the quadruplet composition process. In this paper, we propose Self-Consistent Reasoning-based Aspect sentiment quadruple Prediction (SCRAP), optimizing its model to generate reasonings and the corresponding sentiment quadruplets in sequence. SCRAP adopts the Extract-Then-Assign reasoning strategy, which closely mimics human cognition. In the end, SCRAP significantly improves the model’s ability to handle complex reasoning tasks and correctly predict quadruplets through consistency voting, resulting in enhanced interpretability and accuracy in ASQP.
- 2024.findings-acl.435
+ 2024.findings-acl.435
kim-etal-2024-self-consistent
10.18653/v1/2024.findings-acl.435
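A sketch of the consistency-voting step described in the SCRAP abstract above, assuming quadruplets have already been parsed out of the sampled model outputs:

```python
from collections import Counter

def consistency_vote(sampled_outputs):
    """Sketch: each sampled output is a list of
    (aspect, category, opinion, sentiment) quadruplets; keep the quads that a
    majority of samples agree on. Parsing model text into quads is assumed
    to happen upstream."""
    counts = Counter(q for output in sampled_outputs for q in set(output))
    threshold = len(sampled_outputs) / 2
    return [quad for quad, c in counts.items() if c > threshold]

samples = [
    [("battery", "battery#general", "long", "positive")],
    [("battery", "battery#general", "long", "positive"),
     ("screen", "display#quality", "dim", "negative")],
    [("battery", "battery#general", "long", "positive")],
]
print(consistency_vote(samples))  # only the quad found in a majority of samples
```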
@@ -12164,7 +12164,7 @@
Ge Li, Peking University Shenzhen Graduate School
7304-7323
Large language models (LLMs) have showcased remarkable potential across various tasks by conditioning on prompts. However, the quality of different human-written prompts leads to substantial discrepancies in LLMs’ performance, and improving prompts usually necessitates considerable human effort and expertise. To this end, this paper proposes Prompt with Actor-Critic Editing (PACE) for LLMs to enable automatic prompt editing. Drawing inspiration from the actor-critic algorithm in reinforcement learning, PACE leverages LLMs as the dual roles of actors and critics, conceptualizing the prompt as a type of policy. PACE refines the prompt, taking into account the feedback from both actors performing the prompt and critics criticizing the response. This process helps LLMs better align the prompt to a specific task, thanks to real responses and thinking from LLMs. We conduct extensive experiments on 24 instruction induction tasks and 21 big-bench tasks. Experimental results indicate that PACE elevates the relative performance of medium/low-quality human-written prompts by up to 98%, reaching performance comparable to high-quality human-written prompts. Moreover, PACE also exhibits notable efficacy for prompt generation.
- 2024.findings-acl.436
+ 2024.findings-acl.436
dong-etal-2024-pace
10.18653/v1/2024.findings-acl.436
@@ -12178,7 +12178,7 @@
Mani Srivastava, Amazon and University of California, Los Angeles
7324-7341
Recent developments in Large Language Models (LLMs) have demonstrated their remarkable capabilities across a range of tasks. Questions, however, persist about the nature of LLMs and their potential to integrate common-sense human knowledge when performing tasks involving information about the real physical world. This paper delves into these questions by exploring how LLMs can be extended to interact with and reason about the physical world through IoT sensors and actuators, a concept that we term “Penetrative AI”. The paper explores such an extension at two levels of LLMs’ ability to penetrate into the physical world via the processing of sensory signals. Our preliminary findings indicate that LLMs, with ChatGPT being the representative example in our exploration, have considerable and unique proficiency in employing the embedded world knowledge for interpreting IoT sensor data and reasoning over them about tasks in the physical realm. Not only does this open up new applications for LLMs beyond traditional text-based tasks, but it also enables new ways of incorporating human knowledge in cyber-physical systems.
- 2024.findings-acl.437
+ 2024.findings-acl.437
xu-etal-2024-penetrative
10.18653/v1/2024.findings-acl.437
@@ -12194,7 +12194,7 @@
Marius Mosbach, McGill University and Mila - Quebec Artificial Intelligence Institute
7342-7371
In-context learning is a popular inference strategy where large language models solve a task using only a few labeled demonstrations without needing any parameter updates. Although there have been extensive studies on English in-context learning, multilingual in-context learning remains under-explored, and we lack an in-depth understanding of the role of demonstrations in this context. To address this gap, we conduct a multidimensional analysis of multilingual in-context learning, experimenting with 5 models from different model families, 9 datasets covering classification and generation tasks, and 56 typologically diverse languages. Our results reveal that the effectiveness of demonstrations varies significantly across models, tasks, and languages. We also find that strong instruction-following models including Llama 2-Chat, GPT-3.5, and GPT-4 are largely insensitive to the quality of demonstrations. Instead, a carefully crafted template often eliminates the benefits of demonstrations for some tasks and languages altogether. These findings show that the importance of demonstrations might be overestimated. Our work highlights the need for granular evaluation across multiple axes towards a better understanding of in-context learning.
- 2024.findings-acl.438
+ 2024.findings-acl.438
zhang-etal-2024-impact
10.18653/v1/2024.findings-acl.438
@@ -12208,7 +12208,7 @@
Tingting He, Central China Normal University
7372-7383
Chinese Spell Checking (CSC) is a widely used technology, which plays a vital role in speech-to-text (STT) and optical character recognition (OCR). Most of the existing CSC approaches relying on BERT architecture achieve excellent performance. However, limited by the scale of the foundation model, BERT-based methods do not work well in few-shot scenarios, showing certain limitations in practical applications. In this paper, we explore using an in-context learning method named RS-LLM (**R**ich **S**emantic based LLMs) to introduce large language models (LLMs) as the foundation model. Besides, we study the impact of introducing various Chinese rich semantic information in our framework. We found that by introducing a small number of specific Chinese rich semantic structures, LLMs achieve better performance than most BERT-based models on the few-shot CSC task. Furthermore, we conduct experiments on multiple datasets, and the experimental results verify the superiority of our proposed framework.
- 2024.findings-acl.439
+ 2024.findings-acl.439
dong-etal-2024-rich
10.18653/v1/2024.findings-acl.439
@@ -12220,7 +12220,7 @@
Raj Dabre, National Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology
7384-7406
Recent interest has surged in employing Large Language Models (LLMs) for machine translation (MT) via in-context learning (ICL) (Vilar et al., 2023). Most prior studies primarily focus on optimizing translation quality, with limited attention to understanding the specific aspects of ICL that influence the said quality. To this end, we perform the first of its kind, exhaustive study of in-context learning for machine translation (MT). We first establish that ICL is primarily example-driven and not instruction-driven. Following this, we conduct an extensive exploration of various aspects of the examples to understand their influence on downstream performance. Our analysis includes factors such as quality and quantity of demonstrations, spatial proximity, and source versus target originality. Further, we also investigate challenging scenarios involving indirectness and misalignment of examples to understand the limits of ICL. While we establish the significance of the quality of the target distribution over the source distribution of demonstrations, we further observe that perturbations sometimes act as regularizers, resulting in performance improvements. Surprisingly, ICL does not necessitate examples from the same task, and a related task with the same target distribution proves sufficient. We hope that our study acts as a guiding resource for considerations in utilizing ICL for MT. Our code is available on https://github.com/PranjalChitale/in-context-mt-analysis.
- 2024.findings-acl.440
+ 2024.findings-acl.440
chitale-etal-2024-empirical
10.18653/v1/2024.findings-acl.440
@@ -12237,7 +12237,7 @@
Barbara Plank, Ludwig-Maximilians-Universität München and IT University of Copenhagen
7407-7416
The open-ended nature of language generation makes the evaluation of autoregressive large language models (LLMs) challenging. One common evaluation approach uses multiple-choice questions to limit the response space. The model is then evaluated by ranking the candidate answers by the log probability of the first token prediction. However, first tokens may not consistently reflect the final response output, due to the model’s diverse response styles such as starting with “Sure” or refusing to answer. Consequently, first-token evaluation is not indicative of model behaviour when interacting with users. But by how much? We evaluate how aligned first-token evaluation is with the text output along several dimensions, namely final option choice, refusal rate, choice distribution and robustness under prompt perturbation. Our results show that the two approaches are severely misaligned on all dimensions, reaching mismatch rates over 60%. Models heavily fine-tuned on conversational or safety data are especially impacted. Crucially, models remain misaligned even when we increasingly constrain prompts, i.e., force them to start with an option letter or example template. Our findings i) underscore the importance of inspecting the text output as well and ii) caution against relying solely on first-token evaluation.
- 2024.findings-acl.441
+ 2024.findings-acl.441
wang-etal-2024-answer-c
10.18653/v1/2024.findings-acl.441
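A sketch of the mismatch measurement described in the abstract above; the log-probability dictionary and the answer-parsing regex are stand-ins for a real LLM interface:

```python
# Compare the option chosen by first-token log-probabilities with the option
# recovered from the generated text. Both "model" inputs below are toy data.
import re

def first_token_choice(first_token_logprobs: dict) -> str:
    # e.g., {"A": -0.5, "B": -0.9, "Sure": -0.2} -> best *option* token only
    options = {k: v for k, v in first_token_logprobs.items() if k in "ABCD"}
    return max(options, key=options.get) if options else "none"

def text_output_choice(generated: str) -> str:
    match = re.search(r"\b([ABCD])\b", generated)
    return match.group(1) if match else "refusal/other"

# Toy example: first-token ranking says "A", but the full response picks "B".
logprobs = {"A": -0.5, "B": -0.9, "Sure": -0.2}
response = "Sure! Considering all options, the answer is B."
print(first_token_choice(logprobs), "vs", text_output_choice(response))
```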
@@ -12250,7 +12250,7 @@
Hiroshi Arakawa
7417-7431
The integration of Large Language Models (LLMs) and knowledge graphs (KGs) has achieved remarkable success in various natural language processing tasks. However, existing methodologies that integrate LLMs and KGs often navigate the task-solving process solely based on the LLM’s analysis of the question, overlooking the rich cognitive potential inherent in the vast knowledge encapsulated in KGs. To address this, we introduce Observation-Driven Agent (ODA), a novel AI agent framework tailored for tasks involving KGs. ODA incorporates KG reasoning abilities via global observation, which enhances reasoning capabilities through a cyclical paradigm of observation, action, and reflection. Confronting the exponential explosion of knowledge during observation, we innovatively design a recursive observation mechanism. Subsequently, we integrate the observed knowledge into the action and reflection modules. Through extensive experiments, ODA demonstrates state-of-the-art performance on several datasets, notably achieving accuracy improvements of 12.87% and 8.9%.
- 2024.findings-acl.442
+ 2024.findings-acl.442
sun-etal-2024-oda
10.18653/v1/2024.findings-acl.442
@@ -12264,7 +12264,7 @@
Stjepan Picek, Radboud University Nijmegen
7432-7449
Large Language Models (LLMs) have increasingly become central to generating content with potential societal impacts. Notably, these models have demonstrated capabilities for generating content that could be deemed harmful. To mitigate these risks, researchers have adopted safety training techniques to align model outputs with societal values to curb the generation of malicious content. However, the phenomenon of “jailbreaking” — where carefully crafted prompts elicit harmful responses from models — persists as a significant challenge. This research conducts a comprehensive analysis of existing studies on jailbreaking LLMs and their defense techniques. We meticulously investigate nine attack techniques and seven defense techniques applied across three distinct language models: Vicuna, Llama, and GPT-3.5 Turbo. We aim to evaluate the effectiveness of these attack and defense techniques. Our findings reveal that existing white-box attacks underperform compared to universal techniques and that including special tokens in the input significantly affects the likelihood of successful attacks. This research highlights the need to concentrate on the security facets of LLMs. Additionally, we contribute to the field by releasing our datasets and testing framework, aiming to foster further research into LLM security. We believe these contributions will facilitate the exploration of security measures within this domain.
- 2024.findings-acl.443
+ 2024.findings-acl.443
xu-etal-2024-comprehensive
10.18653/v1/2024.findings-acl.443
@@ -12278,7 +12278,7 @@
Ion Androutsopoulos, Athens University of Economics and Business
7450-7466
Diagnostic Captioning (DC) automatically generates a diagnostic text from one or more medical images (e.g., X-rays, MRIs) of a patient. Treated as a draft, the generated text may assist clinicians, by providing an initial estimation of the patient’s condition, speeding up and helping safeguard the diagnostic process. The accuracy of a diagnostic text, however, strongly depends on how well the key medical conditions depicted in the images are expressed. We propose a new *data-driven* guided decoding method that incorporates medical information, in the form of existing tags capturing key conditions of the image(s), into the beam search of the diagnostic text generation process. We evaluate the proposed method on two medical datasets using four DC systems that range from generic image-to-text systems with CNN encoders and RNN decoders to pre-trained Large Language Models. The latter can also be used in few- and zero-shot learning scenarios. In most cases, the proposed mechanism improves performance with respect to all evaluation measures. We provide an open-source implementation of the proposed method at https://github.com/nlpaueb/dmmcs.
- 2024.findings-acl.444
+ 2024.findings-acl.444
kaliosis-etal-2024-data
10.18653/v1/2024.findings-acl.444
@@ -12294,7 +12294,7 @@
Fei Tan, Sensetime Research
7467-7509
Aligned Large Language Models (LLMs) showcase remarkable versatility, capable of handling diverse real-world tasks. Meanwhile, aligned LLMs are also expected to exhibit speciality, excelling in specific applications. However, fine-tuning with extra data, a common practice to gain speciality, often leads to catastrophic forgetting (CF) of previously acquired versatility, hindering the model’s performance across diverse tasks. In response to this challenge, we propose CoFiTune, a coarse to fine framework in an attempt to strike the balance between speciality and versatility. At the coarse-grained level, an empirical tree-search algorithm is utilized to pinpoint and update specific modules that are crucial for speciality, while keeping other parameters frozen; at the fine-grained level, a soft-masking mechanism regulates the update to the LLMs, mitigating the CF issue without harming speciality. In an overall evaluation of both speciality and versatility, CoFiTune consistently outperforms baseline methods across diverse tasks and model scales. Compared to the full-parameter SFT, CoFiTune leads to about 14% versatility improvement and marginal speciality loss on a 13B model. Lastly, based on further analysis, we provide a speculative insight into the information forwarding process in LLMs, which helps explain the effectiveness of the proposed method. The code is available at https://github.com/rattlesnakey/CoFiTune.
- 2024.findings-acl.445
+ 2024.findings-acl.445
zhang-etal-2024-balancing
10.18653/v1/2024.findings-acl.445
@@ -12308,7 +12308,7 @@
Xinyu Dai, Nanjing University
7510-7527
Relation triplet extraction is a fundamental task in natural language processing that aims to identify semantic relationships between entities in text. It is particularly challenging in the zero-shot setting, i.e., zero-shot relation triplet extraction (ZeroRTE), where the relation sets between training and test are disjoint. Existing methods deal with this task by integrating relations into prompts, which may lack sufficient understanding of the unseen relations. To address these limitations, this paper presents a novel Two-Agent Game (TAG) approach to deliberate and debate the semantics of unseen relations. TAG consists of two agents, a generator and an extractor. They iteratively interact in three key steps: attempting, criticizing, and rectifying. This enables the agents to fully debate and understand the unseen relations. Experimental results demonstrate consistent improvement over ALBERT-Large, BART, and GPT-3.5, without incurring additional inference costs in all cases. Remarkably, our method outperforms strong baselines by a significant margin, achieving an impressive 6%-16% increase in F1 scores, particularly when dealing with FewRel with five unseen relations.
- 2024.findings-acl.446
+ 2024.findings-acl.446
xu-etal-2024-two
10.18653/v1/2024.findings-acl.446
@@ -12323,7 +12323,7 @@
Weiping Wang
7528-7541
Parameter-efficient fine-tuning (PEFT) has emerged as the predominant technique for fine-tuning in the era of large language models. However, existing PEFT methods still have inadequate training efficiency. Firstly, the utilization of large-scale foundation models during the training process is excessively redundant for certain fine-tuning tasks. Secondly, as the model size increases, the growth in trainable parameters of empirically added PEFT modules becomes non-negligible and redundant, leading to inefficiency. To achieve task-specific efficient fine-tuning, we propose the Light-PEFT framework, which includes two methods: Masked Early Pruning of the Foundation Model and Multi-Granularity Early Pruning of PEFT. The Light-PEFT framework allows for the simultaneous estimation of redundant parameters in both the foundation model and PEFT modules during the early stage of training. These parameters can then be pruned for more efficient fine-tuning. We validate our approach on GLUE, SuperGLUE, QA tasks, and various models. With Light-PEFT, parameters of the foundation model can be pruned by up to over 40%, while still controlling trainable parameters to be only 25% of the original PEFT method. Compared to utilizing the PEFT method directly, Light-PEFT achieves training and inference speedup, reduces memory usage, and maintains comparable performance and the plug-and-play feature of PEFT.
- 2024.findings-acl.447
+ 2024.findings-acl.447
gu-etal-2024-light
10.18653/v1/2024.findings-acl.447
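A sketch of the early-pruning idea described in the Light-PEFT abstract above; the |weight × grad| importance score and the keep ratio are illustrative assumptions, not the paper's exact estimators:

```python
# After a short warm-up, estimate parameter importance and mask the least
# important entries of a PEFT module, then continue fine-tuning the rest.
import torch

def prune_by_importance(weight: torch.Tensor, grad: torch.Tensor, keep_ratio: float = 0.25):
    importance = (weight * grad).abs().flatten()     # toy importance estimate
    k = max(1, int(keep_ratio * importance.numel()))
    threshold = importance.topk(k).values.min()
    mask = (importance >= threshold).reshape(weight.shape).float()
    return weight * mask, mask

w = torch.randn(8, 8, requires_grad=True)
loss = (w.sum() - 1).pow(2)   # toy objective standing in for early-stage training
loss.backward()
pruned, mask = prune_by_importance(w.detach(), w.grad)
print(f"kept {int(mask.sum().item())} of {mask.numel()} parameters")
```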
@@ -12335,7 +12335,7 @@
Anne Lauscher, Universität Hamburg
7542-7550
The translation of gender-neutral person-referring terms (e.g., the students) is often non-trivial. Translating from English into German poses an interesting case—in German, person-referring nouns are usually gender-specific, and if the gender of the referent(s) is unknown or diverse, the generic masculine (die Studenten (m.)) is commonly used. This solution, however, reduces the visibility of other genders, such as women and non-binary people. To counteract gender discrimination, a societal movement towards using gender-fair language exists (e.g., by adopting neosystems). However, gender-fair German is currently barely supported in machine translation (MT), requiring post-editing or manual translations. We address this research gap by studying gender-fair language in English-to-German MT. Concretely, we enrich a community-created gender-fair language dictionary and sample multi-sentence test instances from encyclopedic text and parliamentary speeches. Using these novel resources, we conduct the first benchmark study involving two commercial systems and six neural MT models for translating words in isolation and natural contexts across two domains. Our findings show that most systems produce mainly masculine forms and rarely gender-neutral variants, highlighting the need for future research. We release code and data at https://github.com/g8a9/building-bridges-gender-fair-german-mt.
- 2024.findings-acl.448
+ 2024.findings-acl.448
lardelli-etal-2024-building
10.18653/v1/2024.findings-acl.448
@@ -12349,7 +12349,7 @@
Pengfei Liu
7551-7558
Large language models (LLMs) have demonstrated the capacity to improve summary quality by mirroring a human-like iterative process of critique and refinement starting from the initial draft. Two strategies are designed to perform this iterative process: *Prompt Chaining* and *Stepwise Prompt*. Prompt chaining orchestrates the drafting, critiquing, and refining phases through a series of three discrete prompts, while the stepwise prompt integrates these phases within a single prompt. However, the relative effectiveness of the two methods has not been extensively studied. This paper is dedicated to examining and comparing these two methods in the context of text summarization to ascertain which method stands out as the most effective. Experimental results show that the prompt chaining method can produce a more favorable outcome. This might be because, according to our various experiments, the stepwise prompt tends to produce a simulated refinement process. Since refinement is adaptable to diverse tasks, our conclusions have the potential to be extrapolated to other applications, thereby offering insights that may contribute to the broader development of LLMs.
- 2024.findings-acl.449
+ 2024.findings-acl.449
sun-etal-2024-prompt
10.18653/v1/2024.findings-acl.449
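A sketch contrasting the two strategies compared in the abstract above; `llm` is a placeholder for any text-completion callable, and the prompts are invented for illustration:

```python
def llm(prompt: str) -> str:
    return f"<model output for: {prompt[:40]}...>"  # stand-in for a real API call

def prompt_chaining(document: str) -> str:
    # Three discrete prompts: draft, critique, refine.
    draft = llm(f"Summarize the following document:\n{document}")
    critique = llm(f"Critique this summary:\n{draft}")
    return llm(f"Rewrite the summary to address the critique.\n"
               f"Summary:\n{draft}\nCritique:\n{critique}")

def stepwise_prompt(document: str) -> str:
    # All three phases packed into a single prompt.
    return llm(
        "Step 1: draft a summary. Step 2: critique it. "
        f"Step 3: output the refined summary.\nDocument:\n{document}"
    )

doc = "..."
print(prompt_chaining(doc))
print(stepwise_prompt(doc))
```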
@@ -12363,7 +12363,7 @@
Bowen Zhou, Tsinghua University
7559-7569
Multi-modal entity linking (MEL) is a challenging task that requires accurate prediction of entities within extensive search spaces, utilizing multi-modal contexts. Existing generative approaches struggle with the knowledge gap between visual entity information and the intrinsic parametric knowledge of LLMs. To address this knowledge gap, we introduce a novel approach called GELR, which incorporates a knowledge retriever to enhance visual entity information by leveraging external sources. Additionally, we devise a prioritization scheme that effectively handles noisy retrieval results and manages conflicts arising from the integration of external and internal knowledge. Moreover, we propose a noise-aware instruction tuning technique during training to finely adjust the model’s ability to leverage retrieved information effectively. Through extensive experiments conducted on three benchmarks, our approach showcases remarkable improvements, ranging from 3.0% to 6.5%, across all evaluation metrics compared to strong baselines. These results demonstrate the effectiveness and superiority of our proposed method in tackling the complexities of multi-modal entity linking.
- 2024.findings-acl.450
+ 2024.findings-acl.450
long-etal-2024-trust
10.18653/v1/2024.findings-acl.450
@@ -12373,7 +12373,7 @@
Danushka Bollegala, Amazon and University of Liverpool
7570-7584
Detecting temporal semantic changes of words is an important task for various NLP applications that must make time-sensitive predictions. The Lexical Semantic Change Detection (SCD) task involves predicting whether a given target word, w, changes its meaning between two different text corpora, C_1 and C_2. For this purpose, we propose a supervised two-stage SCD method that uses existing Word-in-Context (WiC) datasets. In the first stage, for a target word w, we learn two sense-aware encoders that represent the meaning of w in a given sentence selected from a corpus. Next, in the second stage, we learn a sense-aware distance metric that compares the semantic representations of a target word across all of its occurrences in C_1 and C_2. Experimental results on multiple benchmark datasets for SCD show that our proposed method achieves strong performance in multiple languages. Additionally, our method achieves significant improvements on WiC benchmarks compared to a sense-aware encoder with conventional distance functions.
- 2024.findings-acl.451
+ 2024.findings-acl.451
aida-bollegala-2024-semantic
10.18653/v1/2024.findings-acl.451
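A sketch of the second-stage comparison described in the SCD abstract above, with random vectors standing in for the learned sense-aware encoders and a plain cosine standing in for the learned metric:

```python
import numpy as np

rng = np.random.default_rng(0)

def embed_occurrences(num_occurrences: int, dim: int = 32) -> np.ndarray:
    # Placeholder for sense-aware encoder output over a word's occurrences.
    return rng.normal(size=(num_occurrences, dim))

def semantic_change_score(emb_c1: np.ndarray, emb_c2: np.ndarray) -> float:
    m1, m2 = emb_c1.mean(axis=0), emb_c2.mean(axis=0)
    cos = m1 @ m2 / (np.linalg.norm(m1) * np.linalg.norm(m2))
    return 1.0 - float(cos)  # larger -> more semantic change between C_1 and C_2

score = semantic_change_score(embed_occurrences(10), embed_occurrences(12))
print(f"change score for target word w: {score:.3f}")
```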
@@ -12387,7 +12387,7 @@
Yue Zhang, Westlake University
7585-7606
Recent advances have made non-autoregressive (NAT) translation comparable to autoregressive methods (AT). However, their evaluation using BLEU has been shown to weakly correlate with human annotations. Limited research compares non-autoregressive translation and autoregressive translation comprehensively, leaving uncertainty about the true proximity of NAT to AT. To address this gap, we systematically evaluate four representative NAT methods across various dimensions, including human evaluation. Our empirical results demonstrate that despite narrowing the performance gap, state-of-the-art NAT still underperforms AT under more reliable evaluation metrics. Furthermore, we discover that explicitly modeling dependencies is crucial for generating natural language and generalizing to out-of-distribution sequences.
- 2024.findings-acl.452
+ 2024.findings-acl.452
li-etal-2024-achieved
10.18653/v1/2024.findings-acl.452
@@ -12401,7 +12401,7 @@
Yedid Hoshen, Google and Hebrew University of Jerusalem
7607-7617
When first deploying an anomaly detection system, e.g., to detect out-of-scope queries in chatbots, there are no observed data, making data-driven approaches ineffective. Zero-shot anomaly detection methods offer a solution to such “cold-start” cases, but unfortunately they are often not accurate enough. This paper studies the realistic but underexplored *cold-start* setting where an anomaly detection model is initialized using zero-shot guidance, but subsequently receives a small number of contaminated observations (namely, that may include anomalies). The goal is to make efficient use of both the zero-shot guidance and the observations. We propose ColdFusion, a method that effectively adapts the zero-shot anomaly detector to contaminated observations. To support future development of this new setting, we propose an evaluation suite consisting of evaluation protocols and metrics.
- 2024.findings-acl.453
+ 2024.findings-acl.453
reiss-etal-2024-zero
10.18653/v1/2024.findings-acl.453
@@ -12417,7 +12417,7 @@
Lin Gui, King’s College London, University of London
7618-7638
Existing datasets for narrative understanding often fail to represent the complexity and uncertainty of relationships in real-life social scenarios. To address this gap, we introduce a new benchmark, Conan, designed for extracting and analysing intricate character relation graphs from detective narratives. Specifically, we designed hierarchical relationship categories and manually extracted and annotated role-oriented relationships from the perspectives of various characters, incorporating both public relationships known to most characters and secret ones known to only a few. Our experiments with advanced Large Language Models (LLMs) like GPT-3.5, GPT-4, and Llama2 reveal their limitations in inferencing complex relationships and handling longer narratives. The combination of the Conan dataset and our pipeline strategy is geared towards understanding the ability of LLMs to comprehend nuanced relational dynamics in narrative contexts.
- 2024.findings-acl.454
+ 2024.findings-acl.454
zhao-etal-2024-large
10.18653/v1/2024.findings-acl.454
@@ -12429,7 +12429,7 @@
Seung-Hoon Na, Chonbuk National University
7639-7654
Among the recently emerged knowledge editing methods, in-context knowledge editing (IKE) has shown respectable abilities on knowledge editing in terms of generalization and specificity. Noting the promising advantages but unexplored issues of IKE, we propose **DistillMIKE** as a novel extension of IKE, i.e., editing **distill**ation of “**M**assive” **I**n-context **K**nowledge **E**diting in large language models (LLMs), mainly consisting of two expansions: 1) *Massive in-context knowledge editing (MIKE)*, which extends IKE to a massive editing task, aiming to inject not a single edit but a set of massive edits into LLMs; to preserve specificity, our key novel extension is a “selective” retrieval augmentation, where the retrieval-augmented IKE is only applied to “in-scope” examples, whereas the unedited model without IKE is employed for “out-of-scope” ones. 2) *Editing distillation* of MIKE using low-rank adaptation (LoRA), which distills the editing abilities of MIKE into the parameters of LLMs, eliminating the need for lengthy in-context demonstrations and thus removing the computational overhead encountered at inference time. Experimental results on the zsRE and CounterFact datasets demonstrate that MIKE achieves state-of-the-art performance and DistillMIKE achieves performance comparable to MIKE. Our code is available at https://github.com/JoveReCode/DistillMIKE.git.
- 2024.findings-acl.455
+ 2024.findings-acl.455
qiao-etal-2024-distillmike
10.18653/v1/2024.findings-acl.455
@@ -12447,7 +12447,7 @@
Zhifang Sui, Peking University
7655-7671
To mitigate the high inference latency stemming from autoregressive decoding in Large Language Models (LLMs), Speculative Decoding has emerged as a novel decoding paradigm for LLM inference. In each decoding step, this method first drafts several future tokens efficiently and then verifies them in parallel. Unlike autoregressive decoding, Speculative Decoding facilitates the simultaneous decoding of multiple tokens per step, thereby accelerating inference. This paper presents a comprehensive overview and analysis of this promising decoding paradigm. We begin by providing a formal definition and formulation of Speculative Decoding. Then, we organize in-depth discussions on its key facets, such as drafter selection and verification strategies. Furthermore, we present a comparative analysis of leading methods under third-party testing environments. We aim for this work to serve as a catalyst for further research on Speculative Decoding, ultimately contributing to more efficient LLM inference.
- 2024.findings-acl.456
+ 2024.findings-acl.456
xia-etal-2024-unlocking
10.18653/v1/2024.findings-acl.456
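A sketch of the draft-then-verify loop that the survey above formalizes, with toy deterministic functions standing in for the drafter and the target model (real systems verify the draft tokens in a single parallel forward pass):

```python
def draft_model(prefix, k=4):
    return [(prefix[-1] + i + 1) % 100 for i in range(k)]  # k cheap draft tokens

def target_model_next(prefix):
    return (prefix[-1] + 1) % 100  # greedy next token of the (toy) target model

def speculative_decode(prompt, steps=3, k=4):
    tokens = list(prompt)
    for _ in range(steps):
        draft = draft_model(tokens, k)
        accepted = []
        for tok in draft:  # verify draft tokens left to right
            if target_model_next(tokens + accepted) == tok:
                accepted.append(tok)
            else:
                break
        # keep the accepted prefix, then append the target model's own next token
        tokens += accepted + [target_model_next(tokens + accepted)]
    return tokens

print(speculative_decode([1, 2, 3]))
```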
@@ -12458,7 +12458,7 @@
Heung-Seon Oh, Korea University of Technology and Education
7672-7682
Hierarchical text classification (HTC) is a challenging problem with two key issues: utilizing structural information and mitigating label imbalance. Recently, the unit-based approach generating unit-based feature representations has outperformed the global approach focusing on a global feature representation. Nevertheless, unit-based models using BCE and ZLPR losses still face static thresholding and label imbalance challenges. Those challenges become more critical in large-scale hierarchies. This paper introduces a novel hierarchy-aware loss function for unit-based HTC models: Hierarchy-aware Biased Bound Margin (HBM) loss. HBM integrates learnable bounds, biases, and a margin to address static thresholding and mitigate label imbalance adaptively. Experimental results on benchmark datasets demonstrate the superior performance of HBM compared to competitive HTC models.
- 2024.findings-acl.457
+ 2024.findings-acl.457
kim-etal-2024-hierarchy
10.18653/v1/2024.findings-acl.457
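Since the abstract names learnable bounds, biases, and a margin as the ingredients of HBM, here is a hedged numpy sketch of one plausible reading: a ZLPR-style multi-label loss whose fixed zero threshold is replaced by learnable per-label terms. The exact HBM formulation is in the paper; everything below is illustrative.

import numpy as np

def zlpr_learnable_bound(scores, labels, bound, bias, margin=1.0):
    # Shift logits by learnable per-label bound and bias (assumption:
    # this is one way to replace ZLPR's fixed zero threshold).
    s = scores - bound + bias
    pos, neg = s[labels == 1], s[labels == 0]
    # Positive labels should score above the bound by `margin`,
    # negative labels below it by `margin`.
    return (np.log1p(np.sum(np.exp(-pos + margin)))
            + np.log1p(np.sum(np.exp(neg + margin))))

scores = np.array([2.0, -1.0, 0.5, -3.0])   # one example, 4 labels
labels = np.array([1, 0, 1, 0])
print(zlpr_learnable_bound(scores, labels,
                           bound=np.zeros(4), bias=np.zeros(4)))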
@@ -12473,7 +12473,7 @@
KeweiTuShanghaiTech University
7683-7694
In the era of large language models, applying techniques such as Retrieval Augmented Generation can better address Open-Domain Question-Answering problems. Due to constraints including model sizes and computing resources, the length of context is often limited, and it becomes challenging to empower the model to cover overlong contexts while answering questions from open domains. This paper proposes a general and convenient method to cover longer contexts in Open-Domain Question-Answering tasks. It leverages a small encoder and a cross-attention mechanism to effectively encode contexts. With our method, the original language models can cover several times longer contexts while keeping the computing requirements close to the baseline. Our experiments demonstrate that after fine-tuning, there is improved performance across two held-in datasets, four held-out datasets, and also in two In Context Learning settings. Our code will be released at https://github.com/Alibaba-NLP/Vec-RA-ODQA.
- 2024.findings-acl.458
+ 2024.findings-acl.458
chen-etal-2024-improving-retrieval
10.18653/v1/2024.findings-acl.458
@@ -12486,7 +12486,7 @@
TonyLindgrenDepartment of Computer and Systems Sciences
7695-7715
Contaminated or adulterated food poses a substantial risk to human health. Given sets of labeled web texts for training, Machine Learning and Natural Language Processing can be applied to automatically detect such risks. We publish a dataset of 7,546 short texts describing public food recall announcements. Each text is manually labeled, on two granularity levels (coarse and fine), for food products and hazards that the recall corresponds to. We describe the dataset and benchmark naive, traditional, and Transformer models. Based on our analysis, Logistic Regression based on a TF-IDF representation outperforms RoBERTa and XLM-R on classes with low support. Finally, we discuss different prompting strategies and present an LLM-in-the-loop framework, based on Conformal Prediction, which boosts the performance of the base classifier while reducing energy consumption compared to normal prompting.
- 2024.findings-acl.459
+ 2024.findings-acl.459
randl-etal-2024-cicle
10.18653/v1/2024.findings-acl.459
@@ -12504,7 +12504,7 @@
ChunYuanTsinghua University, Tsinghua University
7716-7741
Large language models (LLMs) excel in natural language processing but demand intensive computation. To mitigate this, various quantization methods have been explored, yet they compromise LLM performance. This paper unveils a previously overlooked type of outliers in LLMs. Such outliers are found to allocate most of the attention scores on initial tokens of input, termed as pivot tokens, which are crucial to the performance of quantized LLMs. Given that, we propose IntactKV to generate the KV cache of pivot tokens losslessly from the full-precision model. The approach is simple and easy to combine with existing quantization solutions with no extra inference overhead. Besides, IntactKV can be calibrated as additional LLM parameters to boost the quantized LLMs further with minimal training costs. Mathematical analysis also proves that IntactKV effectively reduces the upper bound of quantization error. Empirical results show that IntactKV brings consistent improvement over various quantization methods across different LLMs and downstream tasks, leading to the new state-of-the-art for LLM quantization. The codes are available at https://github.com/ruikangliu/IntactKV.
- 2024.findings-acl.460
+ 2024.findings-acl.460
liu-etal-2024-intactkv
10.18653/v1/2024.findings-acl.460
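The mechanism described above lends itself to a small numpy sketch: generate the KV cache of the first few pivot (attention-sink) tokens with full-precision weights and use quantized weights only for the rest. The round-to-nearest quantizer, shapes, and pivot count are toy assumptions, not the released implementation.

import numpy as np

rng = np.random.default_rng(0)

def fake_quantize(w, bits=4):
    # Toy symmetric round-to-nearest weight quantization.
    scale = np.abs(w).max() / (2 ** (bits - 1) - 1)
    return np.round(w / scale) * scale

def kv_cache_intact_pivots(x, w_k, w_v, n_pivot=2, bits=4):
    # Pivot tokens keep a lossless (full-precision) KV cache; the
    # remaining tokens use the quantized projections.
    wq_k, wq_v = fake_quantize(w_k, bits), fake_quantize(w_v, bits)
    k = np.vstack([x[:n_pivot] @ w_k, x[n_pivot:] @ wq_k])
    v = np.vstack([x[:n_pivot] @ w_v, x[n_pivot:] @ wq_v])
    return k, v

x = rng.standard_normal((6, 8))        # 6 tokens, hidden size 8
w_k, w_v = rng.standard_normal((8, 8)), rng.standard_normal((8, 8))
k, v = kv_cache_intact_pivots(x, w_k, w_v)
print(k.shape, v.shape)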
@@ -12516,7 +12516,7 @@
IchiroKobayashiOchanomizu University
7742-7752
For humans and robots to collaborate more in the real world, robots need to understand human intentions from the differing manners of their behaviors. In our study, we focus on the meaning of adverbs which describe human motions. We propose a topic model, Hierarchical Dirichlet Process-Spectral Mixture Latent Dirichlet Allocation, which concurrently learns the relationship between human motions and the adverbs that describe them by capturing the frequency kernels that represent motion characteristics and the shared topics of adverbs that depict such motions. We trained the model on datasets we made from movies about “walking” and “dancing”, and found that our model outperforms representative neural network models in terms of perplexity score. We also demonstrate our model’s ability to determine the adverbs for a given motion and confirmed that the model predicts more appropriate adverbs.
- 2024.findings-acl.461
+ 2024.findings-acl.461
taniguchi-etal-2024-learning
10.18653/v1/2024.findings-acl.461
@@ -12536,7 +12536,7 @@
MinYangShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences
7753-7774
The rapid development of Large Language Models (LLMs) has led to their increasing utilization in Chinese K-12 education. Despite the growing integration of LLMs and education, the absence of a dedicated benchmark for evaluating LLMs within this domain presents a pressing concern. Consequently, there is an urgent need for a comprehensive natural language processing benchmark to precisely assess the capabilities of various LLMs in Chinese K-12 education. In response, we introduce E-EVAL, the first comprehensive evaluation benchmark specifically tailored for Chinese K-12 education. E-EVAL comprises 4,351 multiple-choice questions spanning primary, middle, and high school levels, covering a diverse array of subjects. Through meticulous evaluation, we find that Chinese-dominant models often outperform English-dominant ones, with many exceeding GPT 4.0. However, most struggle with complex subjects like mathematics. Additionally, our analysis indicates that most Chinese-dominant LLMs do not achieve higher scores at the primary school level compared to the middle school level, highlighting the nuanced relationship between proficiency in higher-order and lower-order knowledge domains. Furthermore, experimental results highlight the effectiveness of the Chain of Thought (CoT) technique in scientific subjects and Few-shot prompting in liberal arts. Through E-EVAL, we aim to conduct a rigorous analysis delineating the strengths and limitations of LLMs in educational applications, thereby contributing significantly to the advancement of Chinese K-12 education and LLMs.
- 2024.findings-acl.462
+ 2024.findings-acl.462
hou-etal-2024-e
10.18653/v1/2024.findings-acl.462
@@ -12552,7 +12552,7 @@
PingLuoThe University of Hong Kong
7775-7803
Charts play a vital role in data visualization, understanding data patterns, and informed decision-making. However, their unique combination of graphical elements (e.g., bars, lines) and textual components (e.g., labels, legends) poses challenges for general-purpose multimodal models. While vision-language models trained on chart data excel in comprehension, they struggle with generalization. To address these challenges, we propose ChartAssistant, a chart-based vision-language model for universal chart comprehension and reasoning. ChartAssistant leverages ChartSFT, a comprehensive dataset covering diverse chart-related tasks with basic (e.g., bars and pies) and specialized (e.g., radars and bubbles) chart types. It undergoes a two-stage training process, starting with pre-training on chart-to-table parsing to align chart and text, followed by multitask instruction-following fine-tuning. This approach enables ChartAssistant to achieve competitive performance across various chart tasks. Experimental results demonstrate significant performance gains over the state-of-the-art UniChart and ChartLlama methods, especially outperforming them on real-world chart data in the zero-shot setting. The code and data are available at https://github.com/OpenGVLab/ChartAst.
- 2024.findings-acl.463
+ 2024.findings-acl.463
meng-etal-2024-chartassistant
10.18653/v1/2024.findings-acl.463
@@ -12567,7 +12567,7 @@
JunZhaoInstitute of automation, Chinese academy of science
7804-7816
Large Language Models (LLMs) can teach small language models (SLMs) to solve complex reasoning tasks (e.g., mathematical question answering) by Chain-of-thought Distillation (CoTD). Specifically, CoTD fine-tunes SLMs by utilizing rationales generated from LLMs such as ChatGPT. However, CoTD has certain limitations that make it unsuitable for knowledge-intensive multi-hop question answering: 1) SLMs have a very limited capacity in memorizing required knowledge compared to LLMs. 2) SLMs do not possess the same powerful integrated abilities in question understanding and knowledge reasoning as LLMs. To address the above limitations, we introduce Decompose-and-Response Distillation (D&R Distillation), which distills two student models, namely Decomposer and Responser separately. The two models solve a knowledge-intensive multi-hop question through an interactive process of asking and answering subquestions. Our method offers two advantages: 1) SLMs have the capability to access external knowledge to address subquestions, which provides more comprehensive knowledge for multi-hop questions. 2) By employing simpler subquestions instead of complex CoT reasoning, SLMs effectively mitigate task complexity and decrease data prerequisites. Experimental results on three knowledge-intensive multi-hop question answering datasets demonstrate that D&R Distillation can surpass previous CoTD methods, even with much less training data.
- 2024.findings-acl.464
+ 2024.findings-acl.464
li-etal-2024-teaching
10.18653/v1/2024.findings-acl.464
@@ -12581,7 +12581,7 @@
ZhongyuWeiFudan University
7817-7831
We introduce ALaRM, the first framework modeling hierarchical rewards in reinforcement learning from human feedback (RLHF), which is designed to enhance the alignment of large language models (LLMs) with human preferences. The framework addresses the limitations of current alignment approaches, which often struggle with the inconsistency and sparsity of human supervision signals, by integrating holistic rewards with aspect-specific rewards. This integration enables more precise and consistent guidance of language models towards desired outcomes, particularly in complex and open text generation tasks. By employing a methodology that filters and combines multiple rewards based on their consistency, the framework provides a reliable mechanism for improving model alignment. We validate our approach through applications in long-form question answering and machine translation tasks, employing gpt-3.5-turbo for pairwise comparisons, and demonstrate improvements over existing baselines. Our work underscores the effectiveness of hierarchical rewards modeling in refining LLM training processes for better human preference alignment. We release our code at https://ALaRM-fdu.github.io.
- 2024.findings-acl.465
+ 2024.findings-acl.465
lai-etal-2024-alarm
10.18653/v1/2024.findings-acl.465
@@ -12595,7 +12595,7 @@
B. AdityaPrakashGeorgia Institute of Technology
7832-7840
Time-series forecasting (TSF) finds broad applications in real-world scenarios. Prompting off-the-shelf Large Language Models (LLMs) demonstrates strong zero-shot TSF capabilities while preserving computational efficiency. However, existing prompting methods oversimplify TSF as language next-token prediction, overlooking its dynamic nature, and lack integration with state-of-the-art prompt strategies such as Chain-of-Thought. Thus, we propose LSTPrompt, a novel approach for prompting LLMs in zero-shot TSF tasks. LSTPrompt decomposes TSF into short-term and long-term forecasting sub-tasks, tailoring prompts to each. LSTPrompt guides LLMs to regularly reassess forecasting mechanisms to enhance adaptability. Extensive evaluations demonstrate consistently better performance of LSTPrompt than existing prompting methods, and competitive results compared to foundation TSF models.
- 2024.findings-acl.466
+ 2024.findings-acl.466
liu-etal-2024-lstprompt
10.18653/v1/2024.findings-acl.466
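The decomposition idea above can be illustrated with a short prompt-building sketch: one prompt for the short-term sub-task and one for the long-term sub-task, each asking the model to restate its forecasting mechanism. The wording and the split point are hypothetical, not the paper's prompts.

def lst_prompts(history, horizon, split=4):
    # Build separate prompts for the two forecasting sub-tasks.
    series = ", ".join(f"{v:.2f}" for v in history)
    short = (f"Series: {series}\n"
             "First, describe the local trend of the most recent points.\n"
             f"Then forecast the next {split} values.")
    long = (f"Series: {series}\n"
            "First, describe the seasonality and global trend.\n"
            f"Then forecast values {split + 1} through {horizon}.")
    return short, long

s, l = lst_prompts([1.0, 1.2, 1.1, 1.4, 1.3, 1.6], horizon=12)
print(s, "\n---\n", l)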
@@ -12611,7 +12611,7 @@
DangyangChenPingan Technology
7841-7864
Text classification is a crucial task encountered frequently in practical scenarios, yet it is still under-explored in the era of large language models (LLMs). This study shows that LLMs are vulnerable to changes in the number and arrangement of options in text classification. Our extensive empirical analyses reveal that the key bottleneck arises from ambiguous decision boundaries and inherent biases towards specific tokens and positions. To mitigate these issues, we make the first attempt and propose a novel two-stage classification framework for LLMs. Our approach is grounded in the empirical observation that pairwise comparisons can effectively alleviate boundary ambiguity and inherent bias. Specifically, we begin with a self-reduction technique to efficiently narrow down numerous options, which contributes to a reduced decision space and a faster comparison process. Subsequently, pairwise contrastive comparisons are employed in a chain-of-thought manner to draw out nuances and distinguish confusable options, thus refining the ambiguous decision boundary. Extensive experiments on four datasets (Banking77, HWU64, LIU54, and Clinic150) verify the effectiveness of our framework. Furthermore, benefiting from our framework, various LLMs can achieve consistent improvements. Our code and data are available at https://github.com/Chuge0335/PC-CoT.
- 2024.findings-acl.467
+ 2024.findings-acl.467
lu-etal-2024-mitigating
10.18653/v1/2024.findings-acl.467
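A hedged sketch of the two-stage framework described above: self-reduction first shortlists options by a cheap pointwise score, then pairwise comparisons pick a winner among the survivors. score_fn and compare_fn stand in for LLM calls and are hypothetical.

def two_stage_classify(query, options, score_fn, compare_fn, keep=3):
    # Stage 1: self-reduction shrinks the decision space.
    shortlist = sorted(options, key=lambda o: score_fn(query, o),
                       reverse=True)[:keep]
    # Stage 2: pairwise contrastive comparisons refine the boundary.
    winner = shortlist[0]
    for challenger in shortlist[1:]:
        winner = compare_fn(query, winner, challenger)
    return winner

# Toy stand-ins for the LLM calls:
score = lambda q, o: sum(w in q for w in o.split("_"))
compare = lambda q, a, b: a if score(q, a) >= score(q, b) else b
labels = ["transfer", "balance_inquiry", "card_lost"]
print(two_stage_classify("transfer money to savings", labels, score, compare))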
@@ -12625,7 +12625,7 @@
GongshenLiuShanghai Jiao Tong University
7865-7877
Task-agnostic and transferable backdoors implanted in pre-trained language models (PLMs) pose a severe security threat as they can be inherited to any downstream task. However, existing methods rely on manual selection of triggers and backdoor representations, hindering their effectiveness and universality across different PLMs or usage paradigms. In this paper, we propose a new backdoor attack method called UOR, which overcomes these limitations by turning manual selection into automatic optimization. Specifically, we design poisoned supervised contrastive learning, which can automatically learn more uniform and universal backdoor representations. This allows for more even coverage of the output space, thus hitting more labels in downstream tasks after fine-tuning. Furthermore, we utilize gradient search to select appropriate trigger words that can be adapted to different PLMs and vocabularies. Experiments show that UOR achieves better attack performance on various text classification tasks compared to manual methods. Moreover, we test on PLMs with different architectures, usage paradigms, and more challenging tasks, achieving higher scores for universality.
- 2024.findings-acl.468
+ 2024.findings-acl.468
du-etal-2024-uor
10.18653/v1/2024.findings-acl.468
@@ -12637,7 +12637,7 @@
LenaJägerUniversity of Zurich and Universität Potsdam
7878-7892
To date, most investigations on surprisal and entropy effects in reading have been conducted on the group level, disregarding individual differences. In this work, we revisit the predictive power (PP) of different LMs’ surprisal and entropy measures on data of human reading times as a measure of processing effort by incorporating information of language users’ cognitive capacities. To do so, we assess the PP of surprisal and entropy estimated from generative language models (LMs) on reading data obtained from individuals who also completed a wide range of psychometric tests. Specifically, we investigate if modulating surprisal and entropy relative to cognitive scores increases prediction accuracy of reading times, and we examine whether LMs exhibit systematic biases in the prediction of reading times for cognitively high- or low-performing groups, revealing what type of psycholinguistic subjects a given LM emulates. Our study finds that in most cases, incorporating cognitive capacities increases predictive power of surprisal and entropy on reading times, and that generally, high performance in the psychometric tests is associated with lower sensitivity to predictability effects. Finally, our results suggest that the analyzed LMs emulate readers with lower verbal intelligence, suggesting that for a given target group (i.e., individuals with high verbal intelligence), these LMs provide less accurate predictability effect estimates.
- 2024.findings-acl.469
+ 2024.findings-acl.469
haller-etal-2024-language
10.18653/v1/2024.findings-acl.469
@@ -12648,7 +12648,7 @@
BrendanO’ConnorUniversity of Massachusetts, Amherst
7893-7906
Relation extraction (RE) extracts structured tuples of relationships (e.g. friend, enemy) between entities (e.g. Sherlock Holmes, John Watson) from text, with exciting potential applications. Hundreds of RE papers have been published in recent years; do their evaluation practices inform these goals? We review recent surveys and a sample of recent RE methods papers, compiling 38 datasets currently being used. Unfortunately, many have frequent label errors, and ones with known problems continue to be used. Many datasets focus on producing labels for a large number of relation types, often through error-prone annotation methods (e.g. distant supervision or crowdsourcing), and many recent papers rely exclusively on such datasets. We draw attention to a promising alternative: datasets with a small number of relations, often in specific domains like chemistry, finance, or biomedicine, where it is possible to obtain high quality expert annotations; such data can more realistically evaluate RE performance. The research community should consider more often using such resources.
- 2024.findings-acl.470
+ 2024.findings-acl.470
cai-oconnor-2024-state
10.18653/v1/2024.findings-acl.470
@@ -12664,7 +12664,7 @@
JieTangTsinghua University, Tsinghua University
7907-7928
Large language models (LLMs) have manifested a strong ability to generate code for productive activities. However, current benchmarks for code synthesis, such as HumanEval, MBPP, and DS-1000, are predominantly oriented towards introductory tasks on algorithms and data science, insufficiently satisfying challenging requirements prevalent in real-world coding. To fill this gap, we propose NaturalCodeBench (NCB), a challenging code benchmark designed to mirror the complexity and variety of scenarios in real coding tasks. NCB comprises 402 high-quality problems in Python and Java, meticulously selected from natural user queries from online coding services, covering 6 different domains. Noting the extraordinary difficulty in creating test cases for real-world queries, we also introduce a semi-automated pipeline to enhance the efficiency of test case construction. Compared with manual solutions, it achieves an efficiency increase of more than 4 times. Our systematic experiments on 39 LLMs find that performance gaps on NCB between models with close HumanEval scores can still be significant, indicating a lack of focus on practical code synthesis scenarios or over-specified optimization on HumanEval. On the other hand, even the best-performing GPT-4 still falls far short of satisfactory performance on NCB. The evaluation toolkit and development set are available at https://github.com/THUDM/NaturalCodeBench.
- 2024.findings-acl.471
+ 2024.findings-acl.471
zhang-etal-2024-naturalcodebench
10.18653/v1/2024.findings-acl.471
@@ -12676,7 +12676,7 @@
MatthiasGalléCohere
7929-7960
Humans follow criteria when they execute tasks, and these criteria are directly used to assess the quality of task completion. Therefore, having models learn to use criteria to provide feedback can help humans or models to perform tasks better. However, current research in this area tends to consider only a limited number of criteria, or only a limited number of quality assessment aspects. To fill this gap, we propose a general framework that enables large language models (LLMs) to use comprehensive criteria for a task in delivering natural language feedback on task execution. In particular, we present a model-in-the-loop framework that semi-automatically derives criteria from collected guidelines for different writing tasks and constructs in-context demonstrations for each criterion. We choose three tasks from real-world scenarios to operationalize this idea: paper introduction writing, Python code writing, and Reddit post writing, and evaluate our feedback generation framework using different LLMs. The results reveal the fine-grained effects of adding criteria and demonstrations and provide valuable guidance on how to teach LLMs to use criteria more effectively.
- 2024.findings-acl.472
+ 2024.findings-acl.472
yuan-etal-2024-llmcrit
10.18653/v1/2024.findings-acl.472
@@ -12688,7 +12688,7 @@
AndreFreitasIdiap Research Institute and University of Manchester
7961-7973
The language ability of Large Language Models (LLMs) is often unbalanced towards English because of the imbalance in the distribution of the pre-training data. This disparity carries over into further fine-tuning and affects the cross-lingual abilities of LLMs. In this paper, we propose to empower Instruction-tuned LLMs (It-LLMs) in languages other than English by building semantic alignment between them. Hence, we propose CrossAlpaca, an It-LLM with cross-lingual Instruction-following and Translation-following demonstrations to improve semantic alignment between languages. We validate our approach on the multilingual Question Answering (QA) benchmarks XQUAD and MLQA and adapted versions of MMLU and BBH. Our models, tested over six different languages, outperform the It-LLMs tuned on monolingual data. The final results show that instruction tuning on non-English data is not enough and that semantic alignment can be further improved by Translation-following demonstrations.
- 2024.findings-acl.473
+ 2024.findings-acl.473
ranaldi-etal-2024-empowering
10.18653/v1/2024.findings-acl.473
@@ -12699,7 +12699,7 @@
StevenSchockaertCardiff University
7974-7989
Conceptual spaces represent entities in terms of their primitive semantic features. Such representations are highly valuable but they are notoriously difficult to learn, especially when it comes to modelling perceptual and subjective features. Distilling conceptual spaces from Large Language Models (LLMs) has recently emerged as a promising strategy, but existing work has been limited to probing pre-trained LLMs using relatively simple zero-shot strategies. We focus in particular on the task of ranking entities according to a given conceptual space dimension. Unfortunately, we cannot directly fine-tune LLMs on this task, because ground truth rankings for conceptual space dimensions are rare. We therefore use more readily available features as training data and analyse whether the ranking capabilities of the resulting models transfer to perceptual and subjective features. We find that this is indeed the case, to some extent, but having at least some perceptual and subjective features in the training data seems essential for achieving the best results.
- 2024.findings-acl.474
+ 2024.findings-acl.474
kumar-etal-2024-ranking
10.18653/v1/2024.findings-acl.474
@@ -12715,7 +12715,7 @@
JinsongSuXiamen University
7990-8001
To achieve non-parametric NMT domain adaptation, k-Nearest-Neighbor Machine Translation (kNN-MT) constructs an external datastore to store domain-specific translation knowledge, from which it derives a kNN distribution to interpolate the prediction distribution of the NMT model via a linear interpolation coefficient \lambda. Despite its success, kNN retrieval at each timestep leads to substantial time overhead. To address this issue, dominant studies resort to kNN-MT with adaptive retrieval (kNN-MT-AR), which dynamically estimates \lambda and skips kNN retrieval if \lambda is less than a fixed threshold. Unfortunately, kNN-MT-AR does not yield satisfactory results. In this paper, we first conduct a preliminary study to reveal two key limitations of kNN-MT-AR: 1) the optimization gap leads to inaccurate estimation of \lambda for determining kNN retrieval skipping, and 2) using a fixed threshold fails to accommodate the dynamic demands for kNN retrieval at different timesteps. To mitigate these limitations, we then propose kNN-MT with dynamic retrieval (kNN-MT-DR), which significantly extends vanilla kNN-MT in two aspects. Firstly, we equip kNN-MT with an MLP-based classifier for determining whether to skip kNN retrieval at each timestep. Particularly, we explore several carefully designed scalar features to fully exploit the potential of the classifier. Secondly, we propose a timestep-aware threshold adjustment method to dynamically generate the threshold, which further improves the efficiency of our model. Experimental results on the widely-used datasets demonstrate the effectiveness and generality of our model.
- 2024.findings-acl.475
+ 2024.findings-acl.475
gao-etal-2024-efficient
10.18653/v1/2024.findings-acl.475
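The interpolation and skipping logic described above fits in a few lines of numpy; the classifier, datastore lookup, and timestep-aware thresholds below are toy stand-ins, not the paper's implementation.

import numpy as np

def knn_mt_dr_step(p_nmt, feats, classifier, knn_lookup, thresholds, t):
    lam = classifier(feats)          # estimated interpolation coefficient
    if lam < thresholds[t]:          # dynamic retrieval: skip kNN this step
        return p_nmt
    p_knn = knn_lookup()             # kNN distribution from the datastore
    return lam * p_knn + (1.0 - lam) * p_nmt

vocab = 5
p_nmt = np.full(vocab, 1.0 / vocab)
p = knn_mt_dr_step(p_nmt,
                   feats=np.array([0.9, 0.1]),
                   classifier=lambda f: float(f[0]),     # toy "MLP"
                   knn_lookup=lambda: np.eye(vocab)[2],  # toy datastore hit
                   thresholds=[0.5] * 10, t=0)
print(p, p.sum())  # a valid distribution, biased toward the kNN hit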
@@ -12727,7 +12727,7 @@
GeorgRehmHumboldt Universität Berlin and Deutsches Forschungszentrum für Künstliche Intelligenz
8002-8011
Initially introduced as a machine translation model, the Transformer architecture has now become the foundation for modern deep learning architectures, with applications in a wide range of fields, from computer vision to natural language processing. Nowadays, to tackle increasingly complex tasks, Transformer-based models are stretched to enormous sizes, requiring increasingly larger training datasets and unsustainable amounts of compute resources. The ubiquitous nature of the Transformer and of its core component, the attention mechanism, thus makes them prime targets for efficiency research. In this work, we propose an alternative compatibility function for the self-attention mechanism introduced by the Transformer architecture. This compatibility function exploits an overlap in the learned representation of the traditional scaled dot-product attention, leading to a symmetric dot-product attention with pairwise coefficients. When applied to the pre-training of BERT-like models, this new symmetric attention mechanism reaches a score of 79.36 on the GLUE benchmark against 78.74 for the traditional implementation, leads to a reduction of 6% in the number of trainable parameters, and reduces the number of training steps required before convergence by half.
- 2024.findings-acl.476
+ 2024.findings-acl.476
courtois-etal-2024-symmetric
10.18653/v1/2024.findings-acl.476
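One plausible construction of a symmetric dot-product attention with pairwise coefficients, sketched in numpy: queries and keys share a single projection (which is where the parameter savings come from), and a learned pairwise coefficient matrix restores some expressivity. This is an illustrative reading, not necessarily the paper's exact parameterization.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def symmetric_attention(x, w_shared, w_v, coeff):
    q = k = x @ w_shared                        # tied projection: QK^T symmetric
    scores = (q @ k.T) / np.sqrt(q.shape[-1])   # symmetric compatibility
    return softmax(scores * coeff) @ (x @ w_v)  # learned pairwise coefficients

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 8))                 # 4 tokens, dim 8
out = symmetric_attention(x,
                          rng.standard_normal((8, 8)),
                          rng.standard_normal((8, 8)),
                          coeff=rng.standard_normal((4, 4)))
print(out.shape)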
@@ -12740,7 +12740,7 @@
SrinivasanSengameduAmazon
8012-8026
In this study, we tackle the challenge of inadequate and costly training data that has hindered the development of conversational question answering (ConvQA) systems. Enterprises have a large corpus of diverse internal documents. Instead of relying on a search engine, a more compelling approach for people to comprehend these documents is to create a dialogue system. In this paper, we propose a robust dialogue synthesis method. We learn the segmentation of data for the dialogue task instead of segmenting at sentence boundaries. The synthetic dataset generated by our proposed method achieves superior quality when compared to WikiDialog, as assessed through machine and human evaluations. By employing our inpainted data for ConvQA retrieval system pre-training, we observed a notable improvement in performance across OR-QuAC benchmarks.
- 2024.findings-acl.477
+ 2024.findings-acl.477
wu-etal-2024-synthesizing
10.18653/v1/2024.findings-acl.477
@@ -12750,7 +12750,7 @@
ShayCohenUniversity of Edinburgh
8027-8042
Although large language models (LLMs) exhibit remarkable capacity to leverage in-context demonstrations, it is still unclear to what extent they can learn new facts or concept definitions via prompts. To address this question, we examine the capacity of instruction-tuned LLMs to follow in-context concept annotation guidelines for zero-shot sentence labeling tasks. We design guidelines that present different types of factual and counterfactual concept definitions, which are used as prompts for zero-shot sentence classification tasks. Our results show that although concept definitions consistently help in task performance, only the larger models (with 70B parameters or more) have limited ability to work under counterfactual contexts. Importantly, only proprietary models such as GPT-3.5 can recognize nonsensical guidelines, which we hypothesize is due to more sophisticated alignment methods. Finally, we find that Falcon-180B-chat is outperformed by Llama-2-70B-chat in most cases, which indicates that increasing model scale does not guarantee better adherence to guidelines. Altogether, our simple evaluation method reveals significant gaps in concept understanding between the most capable open-source language models and the leading proprietary APIs.
- 2024.findings-acl.478
+ 2024.findings-acl.478
fonseca-cohen-2024-large
10.18653/v1/2024.findings-acl.478
@@ -12760,7 +12760,7 @@
TaroWatanabeNara Institute of Science and Technology, Japan
8043-8054
Japanese input method editors (IMEs) are essential tools for inputting Japanese text using a limited set of characters such as the kana syllabary. However, despite their importance, the potential of newer attention-based encoder-decoder neural networks, such as Transformer, has not yet been fully explored for IMEs due to their high computational cost and low-quality intermediate output in simultaneous settings, leading to high latencies. In this work, we propose a simple decoding policy to enable the use of attention-based encoder-decoder networks for simultaneous kana-kanji conversion in the context of Japanese IMEs inspired by simultaneous machine translation (SimulMT). We demonstrate that simply decoding by explicitly considering the word boundaries achieves a fairly strong quality-latency trade-off, as it can be seen as equivalent to performing decoding on aligned prefixes and thus achieving an incremental anticipation-free conversion. We further show how such a policy can be applied in practice to achieve high-quality conversions with minimal computational overhead. Our experiments show that our approach can achieve a noticeably better quality-latency trade-off compared to the baselines, while also being a more practical approach due to its ability to directly handle streaming input. Our code is available at https://anonymous.4open.science/r/transformer_ime-D327.
- 2024.findings-acl.479
+ 2024.findings-acl.479
sarhangzadeh-watanabe-2024-alignment
10.18653/v1/2024.findings-acl.479
@@ -12778,7 +12778,7 @@
RenhongCheng
8055-8074
The demand for understanding and expressing emotions in the field of natural language processing is growing rapidly. Knowledge graphs, as an important form of knowledge representation, have been widely utilized in various emotion-related tasks. However, existing knowledge graphs mainly focus on the representation and reasoning of general factual knowledge, while there are still significant deficiencies in the understanding and reasoning of emotional knowledge. In this work, we construct a comprehensive and accurate emotional commonsense knowledge graph, ECoK. We integrate cutting-edge theories from multiple disciplines such as psychology, cognitive science, and linguistics, and combine techniques such as large language models and natural language processing. By mining a large amount of text, dialogue, and sentiment analysis data, we construct rich emotional knowledge and establish the knowledge generation model COMET-ECoK. Experimental results show that ECoK contains high-quality emotional reasoning knowledge, and the performance of our knowledge generation model surpasses GPT-4-Turbo, which can help downstream tasks better understand and reason about emotions. Our data and code are available from https://github.com/ZornWang/ECoK.
- 2024.findings-acl.480
+ 2024.findings-acl.480
wang-etal-2024-ecok
10.18653/v1/2024.findings-acl.480
@@ -12790,7 +12790,7 @@
YuhangGuo
8075-8089
Data augmentation is an effective way to diversify corpora in machine translation, but previous methods may introduce semantic inconsistency between original and augmented data because of irreversible operations and random subword sampling procedures. To generate both symbolically diverse and semantically consistent augmentation data, we propose Deterministic Reversible Data Augmentation (DRDA), a simple but effective data augmentation method for neural machine translation. DRDA adopts deterministic segmentations and reversible operations to generate multi-granularity subword representations and pulls them closer together with multi-view techniques. With no extra corpora or model changes required, DRDA outperforms strong baselines on several translation tasks with a clear margin (up to 4.3 BLEU gain over Transformer) and exhibits good robustness in noisy, low-resource, and cross-domain datasets.
- 2024.findings-acl.481
+ 2024.findings-acl.481
yao-etal-2024-deterministic
10.18653/v1/2024.findings-acl.481
@@ -12804,7 +12804,7 @@
JunXiaoZhejiang University
8090-8101
The growing interest in leveraging large language models is driven by their exceptional imitation and reasoning capabilities. In-context learning (ICL), a streamlined method, has shown potential in boosting these models’ performance without modifying their underlying parameters, especially when supplied with suitable demonstrations. However, existing methods mainly choose demonstrations by comparing surface-level semantic similarities (e.g., based on embedding) and fall short of identifying the most fitting ones. This paper introduces the concept of a “latent learningscape”, a more nuanced representation that describes the characteristics of the demonstrations. Building on this concept, we develop a results-driven approach to characterize the latent learningscape features of demonstrations, which then inform the creation of more effective prompts. Through comprehensive testing across datasets in arithmetic, commonsense, and symbolic reasoning tasks, our approach outperforms leading models, showing an average increase in scores of 7.4 percentage points.
- 2024.findings-acl.482
+ 2024.findings-acl.482
zhou-etal-2024-latent
10.18653/v1/2024.findings-acl.482
@@ -12820,7 +12820,7 @@
BowenZhouTsinghua University
8102-8116
Despite the promising performance of state space models (SSMs) in long sequence modeling, limitations still exist. While advanced SSMs like S5 and S6 (Mamba) are effective in addressing non-uniform sampling, their recursive structures impede efficient SSM computation via convolution. To overcome compatibility limitations in parallel convolutional computation, this paper proposes a novel non-recursive non-uniform sample processing strategy. Theoretical analysis of SSMs through the lens of Event-Triggered Control (ETC) theory reveals the Non-Stable State (NSS) problem, where deviations from sampling point requirements lead to error transmission and accumulation, causing the divergence of the SSM’s hidden state. Our analysis further reveals that adjustments of input sequences with early memories can mitigate the NSS problem, achieving Sampling Step Adaptation (SSA). Building on this insight, we introduce a simple yet effective plug-and-play mechanism, State Memory Replay (SMR), which utilizes learnable memories to adjust the current state with multi-step information for generalization at sampling points different from those in the training data. This enables SSMs to stably model varying sampling points. Experiments on long-range modeling tasks in autoregressive language modeling and Long Range Arena demonstrate the general effectiveness of the SMR mechanism for a series of SSM models.
- 2024.findings-acl.483
+ 2024.findings-acl.483
qi-etal-2024-smr
10.18653/v1/2024.findings-acl.483
@@ -12834,7 +12834,7 @@
EstevamHruschkaMegagon Labs and Carnegie Mellon University
8117-8139
Large language models (LLMs) are proficient at generating fluent text with minimal task-specific supervision. However, their ability to generate rationales for knowledge-intensive tasks (KITs) remains under-explored. Generating rationales for KIT solutions, such as commonsense multiple-choice QA, requires external knowledge to support predictions and refute alternate options. In this work, we consider the task of generating retrieval-augmented rationalization of KIT model predictions via external knowledge guidance within a few-shot setting. Surprisingly, crowd-workers preferred LLM-generated rationales over existing crowd-sourced rationales, generated in a similar knowledge-guided setting, on aspects such as factuality, sufficiency, and convincingness. However, fine-grained evaluation of such rationales highlights the need for further improvements in conciseness, novelty, and domain invariance. Additionally, through an expert-sourced study evaluating the reliability of the rationales, we demonstrate that humans’ trust in LLM-generated rationales erodes when communicated faithfully, i.e., without taking model prediction accuracy into account. We find that even instrumenting simple guardrails can be effective for reliable rationalization.
- 2024.findings-acl.484
+ 2024.findings-acl.484
mishra-etal-2024-characterizing
10.18653/v1/2024.findings-acl.484
@@ -12848,7 +12848,7 @@
FeiXiaUniversity of Washington, Seattle
8140-8162
Recent progress in large language models (LLMs) has marked a notable milestone in the field of artificial intelligence. The conventional evaluation of LLMs primarily relies on existing tasks and benchmarks, raising concerns about test set contamination and the genuine comprehension abilities of LLMs. To address these concerns, we propose to evaluate LLMs by designing new tasks, automatically generating evaluation datasets for the tasks, and conducting detailed error analyses to scrutinize LLMs’ adaptability to new tasks, their sensitivity to prompt variations, and their error tendencies. We investigate the capacity of LLMs to adapt to new but simple tasks, especially when they diverge from the models’ pre-existing knowledge. Our methodology emphasizes the creation of straightforward tasks, facilitating a precise error analysis to uncover the underlying causes of LLM failures. This strategic approach also aims to uncover effective strategies for enhancing LLM performance based on the detailed error analysis of system output.
- 2024.findings-acl.485
+ 2024.findings-acl.485
li-etal-2024-challenging
10.18653/v1/2024.findings-acl.485
@@ -12860,7 +12860,7 @@
JohnBohannon
8163-8171
Semantics of a sentence is defined with much less ambiguity than semantics of a single word, and we assume that it should be better preserved by translation to another language. If multilingual sentence embeddings intend to represent sentence semantics, then the similarity between embeddings of any two sentences must be invariant with respect to translation. Based on this suggestion, we consider a simple linear cross-lingual mapping as a possible improvement of the multilingual embeddings. We also consider deviation from orthogonality conditions as a measure of deficiency of the embeddings.
- 2024.findings-acl.486
+ 2024.findings-acl.486
vasilyev-etal-2024-linear
10.18653/v1/2024.findings-acl.486
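The suggestion above is easy to test on toy data: fit a least-squares linear map between embeddings of parallel sentences and measure how far the fitted map is from orthogonal. The synthetic embeddings and the Frobenius-norm deficiency measure are assumptions for illustration.

import numpy as np

rng = np.random.default_rng(0)

# Toy "multilingual sentence embeddings": 50 parallel pairs, dim 16.
src = rng.standard_normal((50, 16))
tgt = src @ rng.standard_normal((16, 16)) + 0.1 * rng.standard_normal((50, 16))

# Least-squares linear cross-lingual mapping.
W, *_ = np.linalg.lstsq(src, tgt, rcond=None)

print("mapping residual:", np.linalg.norm(src @ W - tgt))
print("deviation from orthogonality:", np.linalg.norm(W @ W.T - np.eye(16)))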
@@ -12874,7 +12874,7 @@
AlakanandaVempalaBloomberg
8172-8185
Structural extraction of events within discourse is critical since it affords a deeper understanding of communication patterns and behavior trends. Event argument extraction (EAE), at the core of event-centric understanding, is the task of identifying role-specific text spans (i.e., arguments) for a given event. Document-level EAE (DocEAE) focuses on arguments that are scattered across an entire document. In this work, we explore open-source Large Language Models (LLMs) for DocEAE, and propose ULTRA, a hierarchical framework that extracts event arguments more cost-effectively. Further, it alleviates the positional bias issue intrinsic to LLMs. ULTRA sequentially reads text chunks of a document to generate a candidate argument set, upon which non-pertinent candidates are dropped through self-refinement. We introduce LEAFER to address the challenge LLMs face in locating the exact boundary of an argument. ULTRA outperforms strong baselines, including strong supervised models and ChatGPT, by 9.8% when evaluated by Exact Match (EM).
- 2024.findings-acl.487
+ 2024.findings-acl.487
zhang-etal-2024-ultra
10.18653/v1/2024.findings-acl.487
@@ -12886,7 +12886,7 @@
AlexanderFraserTechnical University of Munich
8186-8213
To democratize large language models (LLMs) to most natural languages, it is imperative to make these models capable of understanding and generating texts in many languages, in particular low-resource ones. While recent multilingual LLMs demonstrate remarkable performance in such capabilities, these LLMs still support a limited number of human languages due to the lack of training data for low-resource languages. Moreover, these LLMs are not yet aligned with human preference for downstream tasks, which is crucial for the success of LLMs in English. In this paper, we introduce xLLaMA-100 and xBLOOM-100 (collectively xLLMs-100), which scale the multilingual capabilities of LLaMA and BLOOM to 100 languages. To do so, we construct two datasets: a multilingual instruction dataset including 100 languages, which represents the largest language coverage to date, and a cross-lingual human feedback dataset encompassing 30 languages. We perform multilingual instruction tuning on the constructed instruction data and further align the LLMs with human feedback using the DPO algorithm on our cross-lingual human feedback dataset. We evaluate the multilingual understanding and generating capabilities of xLLMs-100 on five multilingual benchmarks. Experimental results show that xLLMs-100 consistently outperforms its peers across the benchmarks by considerable margins, defining a new state-of-the-art multilingual LLM that supports 100 languages.
- 2024.findings-acl.488
+ 2024.findings-acl.488
lai-etal-2024-llms
10.18653/v1/2024.findings-acl.488
@@ -12904,7 +12904,7 @@
AnoopDeorasAmazon
8214-8224
Speculative decoding has emerged as a powerful method to improve latency and throughput in hosting large language models. However, most existing implementations focus on generating a single sequence. Real-world generative AI applications often require multiple responses and how to perform speculative decoding in a batched setting while preserving its latency benefits poses non-trivial challenges. This paper describes a system of batched speculative decoding that sets a new state of the art in multi-sequence generation latency and that demonstrates superior GPU utilization as well as quality of generations within a time budget. For example, for a 7.8B-size model on a single A100 GPU and with a batch size of 8, each sequence is generated at an average speed of 5.8ms per token, the overall throughput being 1.1K tokens per second. These results represent state-of-the-art latency and a 2.15\times speed-up over optimized regular decoding. Within a time budget that regular decoding does not finish, our system is able to generate sequences with HumanEval Pass@First of 43% and Pass@All of 61%, far exceeding what’s feasible with single-sequence speculative decoding. Our peak GPU utilization during decoding reaches as high as 15.8%, more than 3\times the highest of that of regular decoding and around 10\times of single-sequence speculative decoding.
- 2024.findings-acl.489
+ 2024.findings-acl.489
qian-etal-2024-bass
10.18653/v1/2024.findings-acl.489
@@ -12917,7 +12917,7 @@
BangLiuUniversity of Montreal
8225-8291
In this study, we explore the application of Large Language Models (LLMs) in Jubensha, a Chinese detective role-playing game and a novel area in Artificial Intelligence (AI) driven gaming. We introduce the first dataset specifically for Jubensha, including character scripts and game rules, to foster AI agent development in this complex narrative environment. Our work also presents a unique multi-agent interaction framework using LLMs, allowing AI agents to autonomously engage in Jubensha games. To evaluate the gaming performance of these AI agents, we developed novel methods measuring their mastery of case information and reasoning skills. Furthermore, we incorporated the latest advancements in prompt engineering to enhance the agents’ performance in information gathering, murderer identification, and logical reasoning. The experimental results validate the effectiveness of our proposed methods. This work aims to offer a novel perspective on understanding LLM capabilities and establish a new benchmark for evaluating large language model-based agents.
- 2024.findings-acl.490
+ 2024.findings-acl.490
wu-etal-2024-deciphering
10.18653/v1/2024.findings-acl.490
@@ -12926,10 +12926,10 @@
It Is Not About What You Say, It Is About How You Say It: A Surprisingly Simple Approach for Improving Reading Comprehension
SagiShaier
LawrenceHunterUniversity of Colorado at Denver
- Katharinavon der WenseJohannes-Gutenberg Universität Mainz, University of Colorado, Boulder and New York University
+ KatharinaWenseJohannes-Gutenberg Universität Mainz, University of Colorado, Boulder and New York University
8292-8305
Natural language processing has seen rapid progress over the past decade. Due to the speed of developments, some practices get established without proper evaluation. Considering one such case and focusing on reading comprehension, we ask our first research question: 1) How does the order of inputs – i.e., question and context – affect model performance? Additionally, given recent advancements in input emphasis, we ask a second research question: 2) Does emphasizing either the question, the context, or both enhance performance? Experimenting with 9 large language models across 3 datasets, we find that presenting the context before the question improves model performance, with an accuracy increase of up to 31%. Furthermore, emphasizing the context yields superior results compared to question emphasis, and in general, emphasizing parts of the input is particularly effective for addressing questions that models lack the parametric knowledge to answer. Experimenting with both prompt-based and attention-based emphasis methods, we additionally find that the best method is surprisingly simple: it only requires concatenating a few tokens to the input and results in an accuracy improvement of up to 36%, allowing smaller models to outperform their significantly larger counterparts.
- 2024.findings-acl.491
+ 2024.findings-acl.491
shaier-etal-2024-say
10.18653/v1/2024.findings-acl.491
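The two findings above, context before question and a few concatenated emphasis tokens, reduce to a one-function sketch; the marker strings here are hypothetical placeholders, not the tokens used in the paper.

def build_prompt(context, question, emphasize_context=True,
                 emph_open="<<", emph_close=">>"):
    # Context first, optionally wrapped in emphasis marker tokens.
    ctx = f"{emph_open} {context} {emph_close}" if emphasize_context else context
    return f"{ctx}\n{question}"

print(build_prompt("Marie Curie won two Nobel Prizes.",
                   "How many Nobel Prizes did Marie Curie win?"))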
@@ -12941,7 +12941,7 @@
ShayCohenUniversity of Edinburgh
8306-8323
Advances in model editing through neuron pruning hold promise for removing undesirable concepts from large language models. However, it remains unclear whether models have the capacity to reacquire pruned concepts after editing. To investigate this, we evaluate concept relearning in models by tracking concept saliency and similarity in pruned neurons during retraining for named entity recognition tasks. Our findings reveal that models can quickly regain performance post-pruning by relocating advanced concepts to earlier layers and reallocating pruned concepts to primed neurons with similar semantics. This suggests that models exhibit polysemantic capacities and can blend old and new concepts in individual neurons. While neuron pruning provides interpretability into model concepts, our results highlight the challenges of permanent concept removal for improved model *safety*. Monitoring concept reemergence and developing techniques to mitigate relearning of unsafe concepts will be important directions for more robust model editing. Overall, our work strongly demonstrates the resilience and fluidity of concept representations in LLMs post concept removal.
- 2024.findings-acl.492
+ 2024.findings-acl.492
lo-etal-2024-large
10.18653/v1/2024.findings-acl.492
@@ -12954,7 +12954,7 @@
YulanHeKing’s College London, University of London
8324-8340
Task embedding, a meta-learning technique that captures task-specific information, has gained popularity, especially in areas such as multi-task learning, model editing, and interpretability. However, it faces challenges with the emergence of prompt-guided Large Language Models (LLMs) operating in a gradient-free manner. Existing task embedding methods rely on fine-tuned, task-specific language models, which hinders the adaptability of task embeddings across diverse models, especially prompt-based LLMs. To harness the potential of task embeddings in the era of LLMs, we propose a framework for unified task embeddings (FUTE), harmonizing task embeddings from various models, including smaller language models and LLMs with varied prompts, within a single vector space. Such uniformity enables comparison and analysis of similarities amongst different models, broadening the scope and utility of existing task embedding methods in multi-model scenarios, while maintaining their performance comparable to architecture-specific methods.
- 2024.findings-acl.493
+ 2024.findings-acl.493
wang-etal-2024-towards-unified
10.18653/v1/2024.findings-acl.493
@@ -12967,7 +12967,7 @@
NigelCollierUniversity of Cambridge
8341-8356
In light of recent advances in large language models (LLMs), the expectations for the next generation of virtual assistants include enhanced naturalness and adaptability across diverse usage scenarios. However, the creation of high-quality annotated data for Task-Oriented Dialog (TOD) is recognized to be slow and costly. To address these challenges, we introduce Task-Oriented Automatic Dialogs (TOAD), a novel and scalable TOD dataset along with its automatic generation pipeline. The TOAD dataset simulates realistic app context interaction and provides a variety of system response style options. Two aspects of system response styles are considered: verbosity level and users’ expression mirroring. We benchmark TOAD on two response generation tasks, and the results show that modeling more verbose responses or responses without user expression mirroring is more challenging.
- 2024.findings-acl.494
+ 2024.findings-acl.494
liu-etal-2024-toad
10.18653/v1/2024.findings-acl.494
@@ -12979,7 +12979,7 @@
BryanPlummerBoston University
8357-8371
Machine-Generated Text (MGT) detection aims to identify a piece of text as machine or human written. Prior work has primarily formulated MGT detection as a binary classification task over an entire document, with limited work exploring cases where only part of a document is machine generated. This paper provides the first in-depth study of MGT that localizes the portions of a document that were machine generated. Thus, if a bad actor were to change a key portion of a news article to spread misinformation, whole-document MGT detection may fail since the vast majority is human written, but our approach can succeed due to its granular approach. A key challenge in our MGT localization task is that short spans of text, *e.g.*, a single sentence, provide little information indicating whether they are machine generated, due to their short length. To address this, we leverage contextual information, where we predict whether multiple sentences are machine or human written at once. This enables our approach to identify changes in style or content to boost performance. A gain of 4-13% mean Average Precision (mAP) over prior work demonstrates the effectiveness of our approach on five diverse datasets: GoodNews, VisualNews, WikiText, Essay, and WP. We release our implementation at https://github.com/Zhongping-Zhang/MGT_Localization.
- 2024.findings-acl.495
+ 2024.findings-acl.495
zhang-etal-2024-machine
10.18653/v1/2024.findings-acl.495
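A hedged sketch of the contextual idea above: score overlapping windows of sentences, so that each short span inherits evidence from its neighbors, then threshold the averaged scores. window_scorer stands in for a trained classifier and is hypothetical.

def localize_mgt(sentences, window_scorer, window=2, threshold=0.5):
    n = len(sentences)
    totals, counts = [0.0] * n, [0] * n
    for i in range(max(1, n - window + 1)):
        score = window_scorer(sentences[i:i + window])  # P(machine-written)
        for j in range(i, min(i + window, n)):
            totals[j] += score
            counts[j] += 1
    return [totals[j] / counts[j] > threshold for j in range(n)]

toy_scorer = lambda sents: 0.9 if any("utilize" in s for s in sents) else 0.1
doc = ["The mayor spoke downtown.",
       "We utilize synergies to optimize stakeholder outcomes.",
       "Residents asked about potholes.",
       "The meeting ended at nine."]
print(localize_mgt(doc, toy_scorer))  # [True, True, False, False]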
@@ -12990,7 +12990,7 @@
PhilippeLanglaisUniversité de Montréal
8372-8394
Open Information Extraction (OIE) is a field of natural language processing that aims to present textual information in a format that allows it to be organized, analyzed and reflected upon. Numerous OIE systems have been developed, claiming ever-increasing performance, marking the need for objective benchmarks. BenchIE is the latest reference we know of. Although it is very well thought out, we noticed a number of issues we believe are limiting. Therefore, we propose BenchIE^FL, a new OIE benchmark which fully enforces the principles of BenchIE while containing fewer errors, omissions and shortcomings when candidate facts are matched against reference ones. BenchIE^FL allows insightful conclusions to be drawn on the actual performance of OIE extractors.
- 2024.findings-acl.496
+ 2024.findings-acl.496
lamarche-langlais-2024-benchie
10.18653/v1/2024.findings-acl.496
@@ -13006,7 +13006,7 @@
BernhardSchölkopfELLIS Institute and Max Planck Institute for Intelligent Systems, Max-Planck Institute
8395-8410
Citation count of a paper is a commonly used proxy for evaluating the significance of a paper in the scientific community. Yet citation measures are widely criticized for failing to accurately reflect the true impact of a paper. Thus, we propose CausalCite, a new way to measure the significance of a paper by assessing the causal impact of the paper on its follow-up papers. CausalCite is based on a novel causal inference method, TextMatch, which adapts the traditional matching framework to high-dimensional text embeddings. TextMatch encodes each paper using text embeddings from large language models (LLMs), extracts similar samples by cosine similarity, and synthesizes a counterfactual sample as the weighted average of similar papers according to their similarity values. We demonstrate the effectiveness of CausalCite on various criteria, such as high correlation with paper impact as reported by scientific experts on a previous dataset of 1K papers, (test-of-time) awards for past papers, and its stability across various subfields of AI. We also provide a set of findings that can serve as suggested ways for future researchers to use our metric for a better understanding of the quality of a paper. Our code is available at https://github.com/causalNLP/causal-cite.
- 2024.findings-acl.497
+ 2024.findings-acl.497
agrawal-etal-2024-causalcite
10.18653/v1/2024.findings-acl.497
@@ -13020,7 +13020,7 @@
AlexandraBirchUniversity of Edinburgh
8411-8423
Large language models show compelling performance on reasoning tasks but they tend to perform much worse in languages other than English. This is unsurprising given that their training data largely consists of English text and instructions. A typical solution is to translate instruction data into all languages of interest, and then train on the resulting multilingual data, which is called translate-training. This approach not only incurs high cost, but also results in poorly translated data due to the non-standard formatting of mathematical chain-of-thought. In this paper, we explore the benefits of question alignment, where we train the model to translate reasoning questions into English by finetuning on X-English parallel question data. In this way we perform targeted, in-domain language alignment which makes best use of English instruction data to unlock the LLMs’ multilingual reasoning abilities. Experimental results on LLaMA2-13B show that question alignment leads to consistent improvements over the translate-training approach: an average improvement of 11.3% and 16.1% accuracy across ten languages on the MGSM and MSVAMP multilingual reasoning benchmarks.
- 2024.findings-acl.498
+ 2024.findings-acl.498
zhu-etal-2024-question
10.18653/v1/2024.findings-acl.498
@@ -13037,7 +13037,7 @@
DongYuTencent AI Lab
8424-8436
This work studies mitigating fact-conflicting hallucinations for large language models (LLMs) at inference time. Particularly, we propose a self-endorsement framework that leverages the fine-grained fact-level comparisons across multiple sampled responses. Compared with prior ensemble methods (e.g., self-consistency) that perform response-level selection, our approach can better alleviate hallucinations for knowledge-intensive tasks. Our approach can broadly benefit smaller and open-source LLMs as it mainly conducts simple content-based comparisons. Experiments on Biographies show that our method can effectively improve the factuality of generations with simple and intuitive prompts across different scales of LLMs. Besides, comprehensive analyses on TriviaQA and GSM8K demonstrate the potential of self-endorsement for broader application.
- 2024.findings-acl.499
+ 2024.findings-acl.499
wang-etal-2024-improving
10.18653/v1/2024.findings-acl.499
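The fact-level comparison idea above reads roughly like the following toy sketch: sample several responses, decompose each into atomic facts, and keep only facts endorsed by enough samples. The sentence-level fact extractor is a stub standing in for the paper's decomposition step.

```python
# Toy sketch of fact-level self-endorsement via majority voting.
from collections import Counter

def extract_facts(response: str) -> set[str]:
    # stub: treat each sentence as one "fact"
    return {s.strip() for s in response.split(".") if s.strip()}

def endorsed_facts(responses: list[str], min_votes: int) -> set[str]:
    votes = Counter()
    for r in responses:
        votes.update(extract_facts(r))
    return {fact for fact, n in votes.items() if n >= min_votes}

samples = [
    "Ada Lovelace was born in 1815. She worked with Babbage.",
    "Ada Lovelace was born in 1815. She was born in London.",
    "Ada Lovelace was born in 1815. She worked with Babbage.",
]
print(endorsed_facts(samples, min_votes=2))
```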
@@ -13049,7 +13049,7 @@
Bonnie Webber, Edinburgh University, University of Edinburgh
8437-8451
Discourse relations play a pivotal role in establishing coherence within textual content, uniting sentences and clauses into a cohesive narrative. The Penn Discourse Treebank (PDTB) stands as one of the most extensively utilized datasets in this domain. In PDTB-3, annotators can assign multiple labels to an example when they believe multiple relations are simultaneously present. Prior research in discourse relation recognition has treated these instances as separate examples during training, with a prediction matching any one of the gold labels considered correct at test time. However, this approach is inadequate, as it fails to account for the interdependence of labels in real-world contexts and to distinguish between cases where only one sense relation holds and cases where multiple relations hold simultaneously. In our work, we address this challenge by exploring various multi-label classification frameworks to handle implicit discourse relation recognition. We show that the multi-label prediction methods do not degrade performance on single-label prediction. Additionally, we give a comprehensive analysis of the results and data. Our work contributes to advancing the understanding and application of discourse relations and provides a foundation for future study.
- 2024.findings-acl.500
+ 2024.findings-acl.500
long-etal-2024-multi
10.18653/v1/2024.findings-acl.500
@@ -13063,7 +13063,7 @@
Carolyn Anderson, Wellesley College
8452-8474
Code LLMs have the potential to make it easier for non-experts to understand and write code. However, current Code LLM benchmarks rely on a single expert-written prompt per problem, making it hard to generalize their success to non-expert users. In this paper, we present a new natural-language-to-code benchmark of prompts written by a key population of non-experts: beginning programmers. StudentEval contains 1,749 prompts written by 80 students who have only completed one introductory Python course. StudentEval contains numerous non-expert prompts describing the same problem, enabling exploration of key factors in prompt success. We use StudentEval to evaluate 12 Code LLMs and find that StudentEval is a better discriminator of model performance than existing benchmarks. Our analysis of student prompting strategies reveals that nondeterministic LLM sampling can mislead students about the quality of their descriptions, a finding with key implications for Code LLMs in education.
- 2024.findings-acl.501
+ 2024.findings-acl.501
babe-etal-2024-studenteval
10.18653/v1/2024.findings-acl.501
@@ -13075,7 +13075,7 @@
Zhou Yu, Columbia University
8475-8493
Lexical Substitution discovers appropriate substitutes for a given target word in a context sentence. However, the task fails to consider substitutes that are of equal or higher proficiency than the target, an aspect that could be beneficial for language learners looking to improve their writing. To bridge this gap, we propose a new task — language proficiency-oriented lexical substitution. We also introduce ProLex, a novel benchmark designed to assess systems’ ability to generate not only appropriate substitutes but also substitutes that demonstrate better language proficiency. Besides the benchmark, we propose models that can automatically perform the new task. We show that our best model, a Llama2-13B model fine-tuned with task-specific synthetic data, outperforms ChatGPT by an average of 3.2% in F-score and achieves comparable results with GPT-4 on ProLex.
- 2024.findings-acl.502
+ 2024.findings-acl.502
zhang-etal-2024-prolex
10.18653/v1/2024.findings-acl.502
@@ -13088,7 +13088,7 @@
Peinan Zhang, CyberAgent AI Lab
8494-8525
One of the most important challenges in text generation systems is to produce outputs that are not only correct but also diverse. Recently, Minimum Bayes-Risk (MBR) decoding has gained prominence for generating sentences of the highest quality among decoding algorithms. However, existing algorithms for generating diverse outputs are predominantly based on beam search or random sampling, so their output quality is capped by these underlying decoding algorithms. In this paper, we investigate an alternative approach: we develop diversity-promoting decoding algorithms by adding diversity objectives to MBR decoding. We propose two variants of MBR: (i) Diverse MBR (DMBR), which adds a diversity penalty to the decoding objective, and (ii) k-medoids MBR (KMBR), which reformulates the decoding task as a clustering problem. We evaluate DMBR and KMBR on a variety of directed text generation tasks using encoder-decoder models and a language model with prompting. The experimental results show that the proposed methods achieve a better trade-off than diverse beam search and sampling algorithms overall.
- 2024.findings-acl.503
+ 2024.findings-acl.503
jinnai-etal-2024-generating
10.18653/v1/2024.findings-acl.503
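A minimal sketch of MBR decoding with a diversity penalty, in the spirit of the DMBR objective described above. The word-overlap utility is a stand-in, and the exhaustive subset search is only workable for toy candidate lists; the paper's exact objective and optimization differ.

```python
# MBR with a diversity penalty: pick the size-k candidate subset that
# maximizes summed expected utility minus a pairwise-similarity penalty.
import itertools
import numpy as np

def mbr_scores(candidates, utility):
    # expected utility of each candidate, using the candidates themselves
    # as pseudo-references (the usual sample-based MBR estimate)
    U = np.array([[utility(c, r) for r in candidates] for c in candidates])
    return U.mean(axis=1), U

def diverse_mbr(candidates, utility, k=2, penalty=0.5):
    scores, U = mbr_scores(candidates, utility)
    best, best_val = None, -np.inf
    for subset in itertools.combinations(range(len(candidates)), k):
        val = scores[list(subset)].sum()
        val -= penalty * sum(U[i, j] for i, j in itertools.combinations(subset, 2))
        if val > best_val:
            best, best_val = subset, val
    return [candidates[i] for i in best]

overlap = lambda a, b: len(set(a.split()) & set(b.split())) / max(len(a.split()), len(b.split()))
cands = ["the cat sat", "the cat sat down", "a dog ran", "the dog ran"]
print(diverse_mbr(cands, overlap, k=2))
```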
@@ -13101,7 +13101,7 @@
Vishal Chowdhary
8526-8546
Neural Machine Translation (NMT) continues to improve in quality and adoption, yet the inadvertent perpetuation of gender bias remains a significant concern. Despite numerous studies on gender bias in translations into English from weakly gendered languages, there are no benchmarks for evaluating this phenomenon or for assessing mitigation strategies. To address this gap, we introduce GATE X-E, an extension of the GATE corpus (Rarrick et al., 2023) that consists of human translations from Turkish, Hungarian, Finnish, and Persian into English. Each translation is accompanied by feminine, masculine, and neutral variants. The dataset, which contains between 1250 and 1850 instances for each of the four language pairs, features natural sentences with a wide range of sentence lengths and domains, challenging translation rewriters on various linguistic phenomena. Additionally, we present a translation gender rewriting solution built with GPT-4 and use GATE X-E to evaluate it. We open-source our contributions to encourage further research on gender debiasing.
- 2024.findings-acl.504
+ 2024.findings-acl.504
rarrick-etal-2024-gate
10.18653/v1/2024.findings-acl.504
@@ -13112,7 +13112,7 @@
Kaito Ariu, CyberAgent, Inc.
8547-8566
Minimum Bayes-Risk (MBR) decoding is shown to be a powerful alternative to beam search decoding for a wide range of text generation tasks. However, MBR requires a huge amount of inference time to compute the MBR objective, which makes the method infeasible in many situations where response time is critical. Confidence-based pruning (CBP) (Cheng and Vlachos, 2023) has recently been proposed to reduce the inference time in machine translation tasks. Although it is shown to significantly reduce the amount of computation, it requires hyperparameter tuning using a development set to be effective. To this end, we propose Adaptive Minimum Bayes-Risk (AMBR) decoding, a hyperparameter-free method to run MBR decoding efficiently. AMBR is derived from the observation that computing the sample-based MBR objective is the medoid identification problem. AMBR uses the Correlated Sequential Halving (CSH) algorithm (Baharav and Tse, 2019), the algorithm with the best performance guarantee to date for the medoid identification problem, to compute the sample-based MBR objective. We evaluate AMBR on machine translation, text summarization, and image captioning tasks. The results show that AMBR performs on par with CBP, with CBP selecting hyperparameters through an oracle for each given computation budget.
- 2024.findings-acl.505
+ 2024.findings-acl.505
jinnai-ariu-2024-hyperparameter
10.18653/v1/2024.findings-acl.505
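The key observation behind AMBR, that maximizing the sample-based MBR objective is exactly medoid identification, can be seen in a brute-force version; AMBR replaces this quadratic computation with Correlated Sequential Halving. The toy utility below is invented for illustration.

```python
# Brute-force medoid identification: the candidate with the highest
# average utility against the other samples is the MBR output.
import numpy as np

def medoid_mbr(candidates, utility):
    n = len(candidates)
    avg_utility = [
        np.mean([utility(candidates[i], candidates[j]) for j in range(n) if j != i])
        for i in range(n)
    ]
    return candidates[int(np.argmax(avg_utility))]

# toy utility: negative absolute difference between sequence lengths
utility = lambda a, b: -abs(len(a) - len(b))
print(medoid_mbr(["short", "medium length", "a much longer candidate"], utility))
```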
@@ -13126,7 +13126,7 @@
Koichi Takeda, Nagoya University
8567-8577
In recent years, neural machine translation (NMT) has become widely used in everyday life. However, current NMT systems lack a mechanism to adjust the difficulty level of translations to match the user’s language level. Additionally, due to the bias in the training data for NMT, translations of simple source sentences are often produced with complex words. In particular, this could pose a problem for children, who may not be able to understand the meaning of the translations correctly. In this study, we propose a method that replaces words with a high Age of Acquisition (AoA) in translations with simpler words to match the translations to the user’s level. We achieve this by using large language models (LLMs), providing a triple of a source sentence, a translation, and a target word to be replaced. We create a benchmark dataset using back-translation on Simple English Wikipedia. The experimental results obtained from the dataset show that our method effectively replaces high-AoA words with lower-AoA words and, moreover, can iteratively replace most of the high-AoA words while still maintaining high BLEU and COMET scores.
- 2024.findings-acl.506
+ 2024.findings-acl.506
oshika-etal-2024-simplifying
10.18653/v1/2024.findings-acl.506
@@ -13138,7 +13138,7 @@
Linqi Song, City University of Hong Kong
8578-8598
Large Language Models (LLMs) have shown human-like reasoning abilities but still face challenges in solving complex logical problems. Existing unidirectional chaining methods, such as forward chaining and backward chaining, suffer from issues like low prediction accuracy and efficiency. To address these, we propose a bidirectional chaining method, Bi-Chainer, which dynamically switches to depth-first reasoning in the opposite reasoning direction when it encounters multiple branching options within the current direction. Thus, the intermediate reasoning results can be utilized as guidance to facilitate the reasoning process. We show that Bi-Chainer achieves sizable accuracy boosts over unidirectional chaining frameworks on four challenging logical reasoning datasets. Moreover, Bi-Chainer enhances the accuracy of intermediate proof steps and reduces the average number of inference calls, resulting in more efficient and accurate reasoning.
- 2024.findings-acl.507
+ 2024.findings-acl.507
liu-etal-2024-bi
10.18653/v1/2024.findings-acl.507
@@ -13148,7 +13148,7 @@
Shay Cohen, University of Edinburgh
8599-8618
In this work, we investigate the controllability of large language models (LLMs) on scientific summarization tasks. We identify key stylistic and content coverage factors that characterize different types of summaries such as paper reviews, abstracts, and lay summaries. By controlling stylistic features, we find that non-fine-tuned LLMs outperform humans in the MuP review generation task, both in terms of similarity to reference summaries and human preferences. Also, we show that we can improve the controllability of LLMs with keyword-based classifier-free guidance (CFG) while achieving lexical overlap comparable to strong fine-tuned baselines on arXiv and PubMed. However, our results also indicate that LLMs cannot consistently generate long summaries with more than 8 sentences. Furthermore, these models exhibit limited capacity to produce highly abstractive lay summaries. Although LLMs demonstrate strong generic summarization competency, sophisticated content control without costly fine-tuning remains an open problem for domain-specific applications.
- 2024.findings-acl.508
+ 2024.findings-acl.508
fonseca-cohen-2024-large-language
10.18653/v1/2024.findings-acl.508
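For readers unfamiliar with the classifier-free guidance (CFG) mentioned above, the core operation on language model logits is a one-liner; the keyword-conditioning setup and the numbers below are assumptions for illustration, not taken from the paper.

```python
# Classifier-free guidance on logits: interpolate between conditioned
# and unconditioned distributions; gamma > 1 strengthens the condition.
import numpy as np

def cfg_logits(cond_logits: np.ndarray, uncond_logits: np.ndarray, gamma: float) -> np.ndarray:
    return uncond_logits + gamma * (cond_logits - uncond_logits)

uncond = np.array([2.0, 1.0, 0.5])  # logits without the keyword prompt
cond = np.array([1.0, 2.5, 0.5])    # logits with the keyword prompt
print(cfg_logits(cond, uncond, gamma=1.5))  # keyword-favoured token now wins
```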
@@ -13162,7 +13162,7 @@
Zhendong Mao, University of Science and Technology of China
8619-8630
Text-based knowledge graph completion (KGC) methods utilize pre-trained language models for triple encoding and further fine-tune the model to achieve completion. Despite their excellent performance, they neglect the knowledge context in the inference process. Intuitively, knowledge contexts, which refer to the neighboring triples around the target triples, are important information for triple inference, since they provide additional detailed information about the entities. To this end, we propose a novel framework named KnowC, which models the knowledge context as additional prompts with pre-trained language models for knowledge graph completion. Given the substantial number of neighbors typically associated with entities, along with the constrained input token capacity of language models, we further devise several strategies to sample the neighbors. We conduct extensive experiments on the common datasets FB15k-237, WN18RR and Wikidata5M; the experiments show that KnowC achieves state-of-the-art performance.
- 2024.findings-acl.509
+ 2024.findings-acl.509
yang-etal-2024-knowledge
10.18653/v1/2024.findings-acl.509
@@ -13176,7 +13176,7 @@
Philip Yu, University of Illinois, Chicago
8631-8643
Attribute value extraction involves identifying the value spans of predetermined attributes in product texts. This area of research has traditionally operated under a closed-world assumption, focusing on products from a static set of categories and their associated attributes. However, products in e-commerce stores are ever-increasing and evolving, calling for life-long learning. If continuously trained on the fast-increasing products and attributes, most existing solutions not only struggle with parameter efficiency but also endure foreseeable defects due to data contamination, catastrophic forgetting, etc. As a remedy, we propose and study a new task, which aims to effectively maintain a strong single model for many domains in a life-long learning fashion, without jeopardizing model performance and parameter efficiency. We introduce factorization into the model and make it domain-aware by decoupling the modeling of product type and attribute, as a way to promote de-contamination and parameter efficiency while scaling up. Tuning the model with distillation prevents forgetting historical knowledge and enables continuous learning from emerging domains. Experiments on hundreds of domains show that our model attains near state-of-the-art performance with an affordable parameter size, the least historical knowledge forgetting, and the greatest robustness against noise, whilst adding only a few parameters per domain when compared with competitive baselines.
- 2024.findings-acl.510
+ 2024.findings-acl.510
zhang-etal-2024-stronger
10.18653/v1/2024.findings-acl.510
@@ -13189,7 +13189,7 @@
Yoon Hyung Roh, Electronics and Telecommunications Research Institute
8644-8652
Recent advancements in large language models have relied heavily on large reward models from reinforcement learning from human feedback for fine-tuning. However, using a single reward model across various domains may not always be optimal, often requiring retraining from scratch when new domain data is introduced. To address these challenges, we explore the utilization of small language models operating in a domain-specific manner based on router mechanisms. Our three approaches are: 1) utilizing a mixture of experts to form a single reward model by modularizing an internal router and experts, 2) employing an external router to select the appropriate reward model from multiple domain-specific models, and 3) loading reward models and router adapters onto a single small language model using adapters to reduce the parameter size. Experimental validation underscores the effectiveness of our approach, demonstrating performance comparable to baseline methods while also reducing the total parameter size.
- 2024.findings-acl.511
+ 2024.findings-acl.511
namgoong-etal-2024-exploring
10.18653/v1/2024.findings-acl.511
@@ -13209,7 +13209,7 @@
Ping Chen, University of Massachusetts, Boston
8653-8665
Generalized Category Discovery (GCD) is a crucial task that aims to recognize both known and novel categories from a set of unlabeled data by utilizing a few labeled data with only known categories. Due to the lack of supervision and category information, current methods usually perform poorly on novel categories and struggle to reveal the semantic meanings of the discovered clusters, which limits their applications in the real world. To mitigate the above issues, we propose Loop, an end-to-end active-learning framework that introduces Large Language Models (LLMs) into the training loop, which can boost model performance and generate category names without relying on any human effort. Specifically, we first propose Local Inconsistent Sampling (LIS) to select samples that have a higher probability of falling into wrong clusters, based on neighborhood prediction consistency and the entropy of cluster assignment probabilities. Then we propose a Scalable Query strategy to allow LLMs to choose true neighbors of the selected samples from multiple candidate samples. Based on the feedback from LLMs, we perform Refined Neighborhood Contrastive Learning (RNCL) to pull samples and their neighbors closer to learn clustering-friendly representations. Finally, we select representative samples from clusters corresponding to novel categories to allow LLMs to generate category names for them. Extensive experiments on three benchmark datasets show that Loop outperforms SOTA models by a large margin and generates accurate category names for the discovered clusters. Code and data are available at https://github.com/Lackel/LOOP.
- 2024.findings-acl.512
+ 2024.findings-acl.512
an-etal-2024-generalized
10.18653/v1/2024.findings-acl.512
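One plausible reading of the Local Inconsistent Sampling criterion, combining neighborhood prediction inconsistency with the entropy of cluster-assignment probabilities, is sketched below; the exact scoring rule is an assumption for illustration.

```python
# LIS-style scoring: prefer samples whose cluster assignment is both
# high-entropy and inconsistent with their neighbours' predictions.
import numpy as np

def entropy(p: np.ndarray) -> float:
    p = np.clip(p, 1e-12, 1.0)
    return float(-(p * np.log(p)).sum())

def lis_scores(assign_probs: np.ndarray, neighbor_preds: list[list[int]]) -> np.ndarray:
    preds = assign_probs.argmax(axis=1)
    scores = []
    for i, neigh in enumerate(neighbor_preds):
        inconsistency = np.mean([preds[i] != preds[j] for j in neigh])
        scores.append(inconsistency * entropy(assign_probs[i]))
    return np.array(scores)

probs = np.array([[0.9, 0.1], [0.5, 0.5], [0.4, 0.6]])
neighbors = [[1, 2], [0, 2], [0, 1]]
print(lis_scores(probs, neighbors).round(3))  # the sample all neighbours disagree with scores highest
```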
@@ -13221,7 +13221,7 @@
Dan Wang, Alibaba Group
8666-8681
Text embedding requires a highly efficient method for training domain-specific models on limited data, as general models trained on large corpora lack universal applicability in highly specific fields. Therefore, we introduce VAEGPT-Sim, an innovative model for generating synonyms that combines a denoising variational autoencoder with a target-specific discriminator to generate synonymous sentences that closely resemble human language. Even when trained in completely unsupervised settings, it maintains a harmonious balance between semantic similarity and lexical diversity, as shown by a comprehensive evaluation metric system on which it obtains the highest average scores compared to other generative models. When VAEGPT-Sim is utilized as a module for contrastive learning in text representation, it delivers state-of-the-art results in small-dataset training on STS benchmarks, surpassing ConSERT by 2.8 points. This approach optimizes the effectiveness of text representation despite a limited corpus, signifying an advancement in domain-specific embedding technology.
- 2024.findings-acl.513
+ 2024.findings-acl.513
wang-etal-2024-vaegpt
10.18653/v1/2024.findings-acl.513
@@ -13235,7 +13235,7 @@
Nan Duan, Microsoft Research Asia
8682-8701
Recent evaluations of Large Language Models (LLMs) have centered around testing their zero-shot/few-shot capabilities for basic natural language tasks and their ability to translate instructions into tool APIs. However, the evaluation of LLMs utilizing complex tools to finish multi-turn, multi-modal instructions in a complex multi-modal environment has not been investigated. To address this gap, we introduce the PowerPoint Task Completion (PPTC) benchmark to assess LLMs’ ability to create and edit PPT files based on user instructions. It contains 279 multi-turn sessions covering diverse topics and hundreds of instructions involving multi-modal operations. We also propose the PPTX-Match Evaluation System, which evaluates whether LLMs have completed the instruction based on the prediction file rather than the label API sequence, so it supports various LLM-generated API sequences. We measure 3 closed LLMs and 6 open-source LLMs. The results show that GPT-4 outperforms other LLMs with 75.1% accuracy in single-turn dialogue testing but faces challenges in completing entire sessions, achieving just 6% session accuracy. We find three main error causes in our benchmark: error accumulation in the multi-turn session, long PPT template processing, and multi-modality perception. These pose great challenges for future LLM and agent systems.
- 2024.findings-acl.514
+ 2024.findings-acl.514
guo-etal-2024-pptc
10.18653/v1/2024.findings-acl.514
@@ -13250,7 +13250,7 @@
Jianshu Chen, Amazon
8702-8718
For an LLM to be trustworthy, its confidence level should be well-calibrated with its actual performance. While it is now common sense that LLM performance is greatly impacted by prompts, the confidence calibration in prompting LLMs has yet to be thoroughly explored. In this paper, we explore how different prompting strategies influence LLM confidence calibration and how it could be improved. We conduct extensive experiments on six prompting methods in the question-answering context, and we observe that, while these methods help improve the expected LLM calibration, they also trigger LLMs to be over-confident when responding to some instances. Inspired by human cognition, we propose Fact-and-Reflection (FaR) prompting, which improves the LLM calibration in two steps. First, FaR elicits the known “facts” that are relevant to the input prompt from the LLM. Then it asks the model to “reflect” over them to generate the final answer. Experiments show that FaR prompting achieves significantly better calibration; it lowers the Expected Calibration Error by 23.5% on our multi-purpose QA tasks. Notably, FaR prompting even elicits the capability of verbally expressing concerns in less confident scenarios, which helps trigger retrieval augmentation for solving these harder instances.
- 2024.findings-acl.515
+ 2024.findings-acl.515
zhao-etal-2024-fact
10.18653/v1/2024.findings-acl.515
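The Expected Calibration Error that the 23.5% improvement above refers to is a standard metric: bin predictions by confidence and average the gap between confidence and accuracy, weighted by bin size. A minimal implementation with equal-width bins, on invented toy data:

```python
# Expected Calibration Error (ECE) with equal-width confidence bins.
import numpy as np

def ece(confidences: np.ndarray, correct: np.ndarray, n_bins: int = 10) -> float:
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    total = len(confidences)
    err = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            gap = abs(correct[mask].mean() - confidences[mask].mean())
            err += mask.sum() / total * gap  # bin-size-weighted gap
    return err

conf = np.array([0.9, 0.8, 0.95, 0.6, 0.55])
hit = np.array([1, 0, 1, 1, 0])
print(round(ece(conf, hit), 3))
```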
@@ -13270,7 +13270,7 @@
Dacheng Tao, University of Sydney
8719-8730
Large language models (LLMs) have significantly advanced the field of natural language processing, while their expensive memory and computation consumption impede practical deployment. Quantization emerges as one of the most effective methods for improving the computational efficiency of LLMs. However, existing ultra-low-bit quantization always causes severe accuracy drops. In this paper, we empirically investigate the micro and macro characteristics of ultra-low-bit quantization and present a novel Dual-Binarization method for LLMs, namely DB-LLM. At the micro level, we take both the accuracy advantage of 2-bit width and the efficiency advantage of binarization into account, introducing Flexible Dual Binarization (FDB). By splitting 2-bit quantized weights into two independent sets of binaries, FDB ensures the accuracy of representations and introduces flexibility, utilizing the efficient bitwise operations of binarization while retaining the inherent high sparsity of ultra-low-bit quantization. At the macro level, we find that distortion exists in the predictions of the LLM after quantization, which we characterize as deviations related to the ambiguity of samples. We propose the Deviation-Aware Distillation (DAD) method, enabling the model to focus differently on various samples. Comprehensive experiments show that our DB-LLM not only significantly surpasses the current state-of-the-art (SoTA) in ultra-low-bit quantization (e.g., perplexity decreased from 9.64 to 7.23), but also achieves an additional 20% reduction in computational consumption compared to the SoTA method under the same bit-width. Our code is available at https://github.com/Hon-Chen/DB-LLM.
- 2024.findings-acl.516
+ 2024.findings-acl.516
chen-etal-2024-db
10.18653/v1/2024.findings-acl.516
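A naive numeric illustration of the dual-binarization idea: approximate a weight matrix as the sum of two scaled {-1, +1} matrices via greedy residual fitting, so multiplications reduce to cheap sign operations. FDB's actual splitting of 2-bit weights is learned and more sophisticated; this only conveys the representation.

```python
# Greedy two-term binarization: W ~ a1*B1 + a2*B2 with B in {-1, +1}.
import numpy as np

def dual_binarize(w: np.ndarray):
    approx = np.zeros_like(w)
    parts = []
    for _ in range(2):
        residual = w - approx
        b = np.where(residual >= 0, 1.0, -1.0)  # binary matrix
        a = np.abs(residual).mean()             # scalar scale
        parts.append((a, b))
        approx = approx + a * b
    return parts, approx

rng = np.random.default_rng(0)
w = rng.normal(scale=0.1, size=(4, 4))
parts, approx = dual_binarize(w)
print("reconstruction error:", np.abs(w - approx).mean().round(4))
```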
@@ -13287,7 +13287,7 @@
Lu Hou, Huawei Technologies Ltd.
8731-8772
Recently, there has been a surge of interest in video large language models (Video LLMs). However, existing benchmarks fail to provide comprehensive feedback on the temporal perception ability of Video LLMs. On the one hand, most of them are unable to distinguish between different temporal aspects (e.g., speed, direction) and thus cannot reflect the nuanced performance on these specific aspects. On the other hand, they are limited in the diversity of task formats (e.g., only multi-choice QA), which hinders the understanding of how temporal perception performance may vary across different types of tasks. Motivated by these two problems, we propose the TempCompass benchmark, which introduces a diversity of temporal aspects and task formats. To collect high-quality test data, we devise two novel strategies: (1) in video collection, we construct conflicting videos that share the same static content but differ in a specific temporal aspect, which prevents Video LLMs from leveraging single-frame bias or language priors; (2) to collect the task instructions, we propose a paradigm where humans first annotate meta-information for a video and then an LLM generates the instruction. We also design an LLM-based approach to automatically and accurately evaluate the responses from Video LLMs. Based on TempCompass, we comprehensively evaluate 9 state-of-the-art (SOTA) Video LLMs and 3 Image LLMs, and reveal the concerning fact that these models exhibit notably poor temporal perception ability.
- 2024.findings-acl.517
+ 2024.findings-acl.517
liu-etal-2024-tempcompass
10.18653/v1/2024.findings-acl.517
@@ -13298,7 +13298,7 @@
Jeffrey Rzeszotarski, Cornell University
8773-8782
Annotation quality is often framed as post-hoc cleanup of annotator-caused issues. This position paper discusses whether, how, and why this narrative limits the scope of improving annotation. We call for considering annotation as a procedural collaboration, outlining three points in this direction: (1) an issue can be either annotator- or researcher-oriented, where one party is accountable and the other party may lack the ability to fix it; (2) yet, the two can co-occur or have similar consequences, so any specific problem we encounter may be a combination; (3) therefore, we need a new language to capture the nuance and holistically describe the full procedure for resolving these issues. To that end, we propose to study how agency is manifested in annotation and picture how this perspective benefits the community more broadly.
- 2024.findings-acl.518
+ 2024.findings-acl.518
zhu-rzeszotarski-2024-get
10.18653/v1/2024.findings-acl.518
@@ -13310,7 +13310,7 @@
Yansong Feng, Peking University
8783-8800
Existing large language models struggle to support numerous low-resource languages, particularly the extremely low-resource ones, for which there is minimal training data available for effective parameter updating. We thus investigate whether LLMs can learn a new language on the fly solely through prompting. To study this question, we collect a research suite for Zhuang, a language currently supported by no LLM. We introduce DiPMT++, a framework for adapting LLMs to unseen languages by in-context learning. Using only a dictionary and 5K parallel sentences, DiPMT++ significantly enhances the performance of GPT-4 from 0 to 16 BLEU for Chinese-to-Zhuang translation and achieves 32 BLEU for Zhuang-to-Chinese translation. We also validate the effectiveness of our framework on Kalamang, another unseen language. Furthermore, we demonstrate the practical utility of DiPMT++ in aiding humans in translating completely unseen languages, which could contribute to the preservation of linguistic diversity.
- 2024.findings-acl.519
+ 2024.findings-acl.519
zhang-etal-2024-teaching
10.18653/v1/2024.findings-acl.519
@@ -13325,7 +13325,7 @@
Dacheng Tao, University of Sydney
8801-8816
Generative large language models (LLMs), e.g., ChatGPT, have demonstrated remarkable proficiency across several NLP tasks, such as machine translation and text summarization. Recent research (Kocmi and Federmann, 2023) has shown that utilizing LLMs for assessing the quality of machine translation (MT) achieves state-of-the-art performance at the system level but performs poorly at the segment level. To further improve the performance of LLMs on MT quality assessment, we conduct an investigation into several prompting designs and propose a new prompting method called Error Analysis Prompting (EAPrompt) by combining Chain-of-Thoughts (Wei et al., 2022) and Error Analysis (Lu et al., 2023). This technique emulates the commonly accepted human evaluation framework, Multidimensional Quality Metrics (MQM; Freitag et al., 2021), and produces explainable and reliable MT evaluations at both the system and segment level. Experimental results from the WMT22 metrics shared task validate the effectiveness of EAPrompt on various LLMs with different structures. Further analysis confirms that EAPrompt effectively distinguishes major errors from minor ones, while also sharing a similar distribution of the number of errors with MQM. These findings highlight the potential of EAPrompt as a human-like evaluator prompting technique for MT evaluation. We will release our code and scripts to facilitate the community.
- 2024.findings-acl.520
+ 2024.findings-acl.520
lu-etal-2024-error
10.18653/v1/2024.findings-acl.520
@@ -13335,7 +13335,7 @@
Xipeng Qiu, Fudan University
8817-8825
Large Vision-Language Models (LVLMs) have demonstrated great abilities in image perception and language understanding. However, existing datasets either focus solely on primary perception abilities and commonsense knowledge, or have a low level of text comprehension difficulty, which is insufficient to reflect the comprehensive capabilities of LVLMs, particularly in terms of Chinese language proficiency. We propose GAOKAO-MM, a multimodal benchmark based on the Chinese College Entrance Examination (GAOKAO), comprising 8 subjects and 12 types of images, such as diagrams, function graphs, maps and photos. GAOKAO-MM derives from native Chinese context and sets human-level requirements for the model’s abilities, including perception, understanding, knowledge and reasoning. We evaluate 10 LVLMs and find that the accuracies of all of them are lower than 50%, with GPT-4-Vision (48.1%), Qwen-VL-Plus (41.2%) and Gemini-Pro-Vision (35.1%) ranking in the top three positions. The results of our multi-dimension analysis indicate that LVLMs remain a moderate distance from Artificial General Intelligence (AGI) and provide insights facilitating the development of multilingual LVLMs. The dataset and evaluation code are available through: https://github.com/OpenMOSS/GAOKAO-MM
- 2024.findings-acl.521
+ 2024.findings-acl.521
zong-qiu-2024-gaokao
10.18653/v1/2024.findings-acl.521
@@ -13348,7 +13348,7 @@
Lianwen Jin, South China University of Technology
8826-8840
We present DiffChat, a novel method to align Large Language Models (LLMs) to “chat” with prompt-as-input Text-to-Image Synthesis (TIS) models (e.g., Stable Diffusion) for interactive image creation. Given a raw prompt/image and a user-specified instruction, DiffChat can effectively make appropriate modifications and generate the target prompt, which can be leveraged to create the target image of high quality. To achieve this, we first collect an instruction-following prompt engineering dataset named InstructPE for the supervised training of DiffChat. Next, we propose a reinforcement learning framework with feedback on three core criteria for image creation, i.e., aesthetics, user preference and content integrity. It involves an action-space dynamic modification technique to obtain more relevant positive samples and harder negative samples during off-policy sampling. Content integrity is also introduced into the value estimation function for further improvement of produced images. Our method exhibits superior performance to baseline models and strong competitors based on both automatic and human evaluations, which fully demonstrates its effectiveness.
- 2024.findings-acl.522
+ 2024.findings-acl.522
wang-etal-2024-diffchat
10.18653/v1/2024.findings-acl.522
@@ -13363,7 +13363,7 @@
Jie Tang, Tsinghua University
8841-8852
We identify two crucial limitations in the evaluation of the recent parallel-integrated method Parallel Context Windows (PCW), which extends the maximum context lengths of language models, e.g., 2048 for LLaMA, by harnessing window-wise attention and positional embedding techniques. We first show that a simple yet strong baseline, weighted sum ensemble, is missing for the in-context few-shot classification. Moreover, on more challenging Chain-of-Thought (CoT) reasoning (e.g., HotpotQA), PCW presents unexpected deterioration in the form of question miscomprehension and false inference. Based on our findings, we suggest that the existing PCW design may not guarantee sufficient improvement and practicality in handling lengthy documents in real-world applications. More community effort should be devoted to enabling language models’ long-context understanding ability.
- 2024.findings-acl.523
+ 2024.findings-acl.523
yang-etal-2024-revisiting
10.18653/v1/2024.findings-acl.523
@@ -13375,7 +13375,7 @@
Zhenan He, Sichuan University
8853-8869
Recently, large language models (LLMs) have demonstrated breakthrough mathematical problem-solving capabilities on grade school math word problems (MWP). For example, on the MWP benchmark GSM8K, the accuracy of GPT-3.5-Turbo and MetaMath-70B reaches 80.80% and 82.30%, respectively. One question arises: does this mean that LLMs have truly mastered the related mathematical problem-solving abilities? In this paper, by presenting two types of benchmarks, where MCGSM8K aims at selecting one correct solution from four solutions, while GSM8K-Judgement judges whether a solution to a given question is true or false, we demonstrate that the ability of most LLMs to evaluate the mathematical reasoning process of MWP is far from sufficient. To compensate for this issue, we propose hybrid supervised fine-tuning data from the training data of GSM8K, MCGSM8K, and GSM8K-Judgement, which significantly improves performance on the proposed reasoning process evaluation benchmarks. For example, fine-tuning improves the performance of LLaMA-2-13B from 33.51% to 70.89% on MCGSM8K. In conclusion, we experimentally demonstrate that most LLMs have limited ability to evaluate the mathematical reasoning process of MWP, which can be enhanced through fine-tuning.
- 2024.findings-acl.524
+ 2024.findings-acl.524
zhang-etal-2024-rationales
10.18653/v1/2024.findings-acl.524
@@ -13394,7 +13394,7 @@
Qi Zhang
8870-8884
As one of the most popular parameter-efficient fine-tuning (PEFT) methods, low-rank adaptation (LoRA) is commonly applied to fine-tune large language models (LLMs). However, updating the weights of LoRA blocks effectively and expeditiously is challenging due to the long calculation path in the original model. To address this, we propose ResLoRA, an improved framework of LoRA. By adding residual paths during training and using merging approaches to eliminate these extra paths during inference, our method can achieve better results in fewer training steps without any extra trainable parameters or inference cost compared to LoRA. The experiments on NLG, NLU, and text-to-image tasks demonstrate the effectiveness of our method. To the best of our knowledge, ResLoRA is the first work that combines the residual path with LoRA. The code of our method is available at [this url](https://github.com/microsoft/LMOps/tree/main/reslora).
- 2024.findings-acl.525
+ 2024.findings-acl.525
shi-etal-2024-reslora
10.18653/v1/2024.findings-acl.525
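A simplified sketch of what a residual path attached to a LoRA block could look like: the low-rank branch at one layer also sees the previous layer's input, shortening the gradient path. The paper explores several residual structures and merging rules, so treat this as one plausible variant rather than the method itself.

```python
# LoRA forward pass with an assumed residual path into the LoRA branch.
import numpy as np

d, r = 8, 2
rng = np.random.default_rng(0)
W = rng.normal(size=(d, d))         # frozen pretrained weight
A = rng.normal(size=(r, d)) * 0.01  # LoRA down-projection
B = np.zeros((d, r))                # LoRA up-projection (zero init)

def lora_forward(x):
    return W @ x + B @ (A @ x)

def reslora_forward(x, prev_x):
    # residual path: the LoRA branch also consumes the previous block's input
    return W @ x + B @ (A @ (x + prev_x))

x = rng.normal(size=d)
print(np.allclose(lora_forward(x), reslora_forward(x, np.zeros(d))))  # True: no residual, same output
```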
@@ -13406,7 +13406,7 @@
Baoyuan Wang, Xiaobing.ai
8885-8897
Prominent large language models have exhibited human-level performance in many domains, even enabling the derived agents to simulate human and social interactions. While practical work has substantiated the practicability of grounding language agents in sandbox simulations or embodied simulators, current social intelligence benchmarks either stay at the language level or use subjective metrics. In pursuit of a more realistic and objective evaluation, we introduce the Social Tasks in Sandbox Simulation (STSS) benchmark, which assesses language agents objectively at the action level by scrutinizing goal achievements within a multi-agent simulation. Additionally, we sample conversation scenarios to build a language-level benchmark to provide an economically prudent preliminary evaluation and align with prevailing benchmarks. To gauge the significance of agent architecture, we implement a target-driven planning (TDP) module as an adjunct to the existing agent. Our evaluative findings highlight that the STSS benchmark is challenging for state-of-the-art language agents. Furthermore, it effectively discriminates between distinct language agents, suggesting its usefulness as a benchmark for evaluating both language models and agent architectures. Our code is available at https://github.com/wcx21/Social-Tasks-in-Sandbox-Simulation.
- 2024.findings-acl.526
+ 2024.findings-acl.526
wang-etal-2024-towards-objectively
10.18653/v1/2024.findings-acl.526
@@ -13419,7 +13419,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
8898-8911
Semantic Role Labeling (SRL), crucial for understanding semantic relationships in sentences, has traditionally focused on text-based input. However, the increasing use of voice assistants and the need for hands-free interaction have highlighted the importance of SRL from speech. SRL from speech can be accomplished via a two-step pipeline: transcribing speech to text via Automatic Speech Recognition (ASR) and then applying text-based SRL, which could lead to error propagation and loss of useful acoustic features. Addressing these challenges, we present the first end-to-end approach for SRL from speech, integrating ASR and SRL in a joint-learning framework, focusing on the Chinese language. By employing a Straight-Through Gumbel-Softmax module to connect the ASR and SRL models, it enables gradient back-propagation and joint optimization, enhancing robustness and effectiveness. Experiments on the Chinese Proposition Bank 1.0 (CPB1.0) and a newly annotated dataset AS-SRL based on AISHELL-1 demonstrate the superiority of the end-to-end model over traditional pipelines, with significantly improved performance.
- 2024.findings-acl.527
+ 2024.findings-acl.527
chen-etal-2024-semantic
10.18653/v1/2024.findings-acl.527
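The Straight-Through Gumbel-Softmax module named above is the standard trick for letting gradients flow through a discrete choice, here at the interface between the ASR and SRL models. PyTorch ships it directly; with hard=True the forward pass emits a one-hot sample while gradients follow the soft relaxation:

```python
# Straight-Through Gumbel-Softmax via torch.nn.functional.gumbel_softmax.
import torch
import torch.nn.functional as F

logits = torch.randn(1, 5, requires_grad=True)
sample = F.gumbel_softmax(logits, tau=1.0, hard=True)  # one-hot in the forward pass
loss = (sample * torch.arange(5.0)).sum()
loss.backward()
print(sample, logits.grad)  # discrete sample, yet logits receive gradients
```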
@@ -13434,7 +13434,7 @@
Chongyang Tao, Beihang University
8912-8925
Multi-modal Event Reasoning (MMER) endeavors to endow machines with the ability to comprehend intricate event relations across diverse data modalities. MMER is fundamental and underlies a broad range of applications. Despite extensive instruction fine-tuning, current multi-modal large language models still fall short in this ability. The disparity stems from the fact that existing models are insufficient to capture the underlying principles governing event evolution in various scenarios. In this paper, we introduce Multi-Modal Event Evolution Learning (MEEL) to enable the model to grasp the event evolution mechanism, yielding advanced MMER ability. Specifically, we commence with the design of event diversification to gather seed events from a rich spectrum of scenarios. Subsequently, we employ ChatGPT to generate evolving graphs for these seed events. We propose an instruction encapsulation process that formulates the evolving graphs into instruction-tuning data, aligning the comprehension of event reasoning to humans. Finally, we observe that models trained in this way are still struggling to fully comprehend event evolution. In such a case, we propose the guiding discrimination strategy, in which models are trained to discriminate the improper evolution direction. We collect and curate a benchmark M-EV2 for MMER. Extensive experiments on M-EV2 validate the effectiveness of our approach, showcasing competitive performance in open-source multi-modal LLMs.
- 2024.findings-acl.528
+ 2024.findings-acl.528
tao-etal-2024-meel
10.18653/v1/2024.findings-acl.528
@@ -13450,7 +13450,7 @@
Yuyu Yin
8926-8939
A large-scale conversational recommendation dataset is pivotal for the development of conversational recommender systems (CRS). Most existing CRS datasets suffer from the problems of data inextensibility and semantic inconsistency. To tackle these limitations and establish a benchmark in the conversational recommendation scenario, in this paper we introduce the LLM-REDIAL dataset to facilitate research in CRS. LLM-REDIAL is constructed by leveraging large language models (LLMs) to generate high-quality dialogues. To provide the LLMs with detailed guidance, we integrate historical user behavior data with dialogue templates that are carefully designed through the combination of multiple pre-defined goals. LLM-REDIAL has two main advantages. First, it is the largest multi-domain CRS dataset, consisting of 47.6k multi-turn dialogues with 482.6k utterances across 4 domains. Second, dialogue semantics and the users’ historical interaction information are highly consistent. Human evaluations are conducted to verify the quality of LLM-REDIAL. In addition, we evaluate the usability of advanced LLM-based models on LLM-REDIAL.
- 2024.findings-acl.529
+ 2024.findings-acl.529
liang-etal-2024-llm
10.18653/v1/2024.findings-acl.529
@@ -13462,7 +13462,7 @@
Gene Kim, University of South Florida
8940-8965
LLMs are increasingly powerful and widely used to assist users in a variety of tasks. This use risks introducing LLM biases into consequential decisions such as job hiring, human performance evaluation, and criminal sentencing. Bias in NLP systems along the lines of gender and ethnicity has been widely studied, especially for specific stereotypes (e.g., Asians are good at math). In this paper, we investigate bias along less-studied but still consequential dimensions, such as age and beauty, measuring subtler correlated decisions that LLMs make between social groups and unrelated positive and negative attributes. Although these subtler biases are understudied, they follow people as much as gender and ethnicity do, so we investigate whether they also surface in LLMs. We introduce a template-generated dataset of sentence completion tasks that asks the model to select the most appropriate attribute to complete an evaluative statement about a person described as a member of a specific social group. We also reverse the completion task to select the social group based on an attribute. We report the correlations that we find for 4 cutting-edge LLMs. This dataset can be used as a benchmark to evaluate progress in more generalized biases, and the templating technique can be used to expand the benchmark with minimal additional human annotation.
- 2024.findings-acl.530
+ 2024.findings-acl.530
kamruzzaman-etal-2024-investigating
10.18653/v1/2024.findings-acl.530
@@ -13477,7 +13477,7 @@
Yiwei Lou
8966-8979
Events refer to specific occurrences, incidents, or happenings that take place under a particular background. Event reasoning aims to infer events according to certain relations and predict future events. Cutting-edge techniques for event reasoning play a crucial role in various natural language processing applications. Large language models (LLMs) have made significant advancements in event reasoning owing to their wealth of knowledge and reasoning capabilities. However, smaller instruction-tuned models currently in use do not consistently demonstrate exceptional proficiency in managing these tasks. This discrepancy arises from the absence of explicit modeling of events and their interconnections within the instruction data. Consequently, these models face challenges in comprehending event structures and semantics while struggling to bridge the gap between their interpretations and human understanding of events. Additionally, their limitations in grasping event relations lead to constrained event reasoning abilities to effectively deduce and incorporate pertinent event knowledge. In this paper, we propose Event-Oriented Instruction Tuning to train our large language model named EvIT specializing in event reasoning tasks. Specifically, we first propose a novel structure named event quadruple, which contains the structure and semantics of events and is complete in the event representation. We then design event-relation learning based on these structures. We encapsulate the learning into the instruction-tuning formulation to better stimulate the event reasoning capacity of our model. To implement our training, we design a heuristic unsupervised method to mine event quadruples from a large-scale corpus. Finally, we fine-tune a Llama model with Event-Oriented Instruction Tuning. We conduct extensive experiments on event reasoning tasks on several datasets. Automatic and human evaluations demonstrate that EvIT achieves competitive performance on event reasoning.
- 2024.findings-acl.531
+ 2024.findings-acl.531
tao-etal-2024-evit
10.18653/v1/2024.findings-acl.531
@@ -13485,12 +13485,12 @@
InstructCMP: Length Control in Sentence Compression through Instruction-based Large Language Models
Juseon-DoJuseon-Do
- Jingun Kwon, Chungnam National University
Hidetaka Kamigaito, Division of Information Science, Nara Institute of Science and Technology
Manabu Okumura, Tokyo Institute of Technology
+ Jingun Kwon, Chungnam National University
8980-8996
Extractive summarization can produce faithful summaries but often requires additional constraints such as a desired summary length. Traditional sentence compression models do not typically consider such constraints because of their restricted model abilities, which would require model modifications to cope with them. To bridge this gap, we propose Instruction-based Compression (InstructCMP), an approach to the sentence compression task that can consider the length constraint through instructions by leveraging the zero-shot task-solving abilities of Large Language Models (LLMs). For this purpose, we created new evaluation datasets by transforming traditional sentence compression datasets into an instruction format. Using these datasets, we first reveal that current LLMs still face challenges in accurately controlling the length of a compressed text. To address this issue, we propose an approach named length priming, which incorporates additional length information into the instructions without external resources. While length priming works effectively in a zero-shot setting, a training dataset with such instructions would further improve the ability of length control. Thus, we additionally created a training dataset in an instruction format to fine-tune the model on it. Experimental results and analysis show that applying length priming significantly improves the performance of InstructCMP in both zero-shot and fine-tuning settings without the need for any model modifications.
- 2024.findings-acl.532
+ 2024.findings-acl.532
juseon-do-etal-2024-instructcmp
10.18653/v1/2024.findings-acl.532
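A hypothetical illustration of "length priming": the instruction spells out both the source length and the exact target length rather than merely asking for compression. The wording below is invented for illustration; the paper defines the actual templates.

```python
# Invented instruction-format compression prompt with length priming.
def build_prompt(sentence: str, keep_words: int) -> str:
    n = len(sentence.split())
    return (
        f"Compress the following {n}-word sentence into exactly "
        f"{keep_words} words by deleting words only, keeping it faithful:\n"
        f"{sentence}"
    )

print(build_prompt("The quick brown fox jumps over the lazy sleeping dog", 6))
```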
@@ -13503,7 +13503,7 @@
Mukesh Mohania, Indraprastha Institute of Information Technology
8997-9008
Citing pertinent literature is pivotal to writing and reviewing a scientific document. Existing techniques mainly focus on the local context or the global context for recommending citations but fail to consider actual human citation behaviour. We propose SymTax, a three-stage recommendation architecture that considers both the local and the global context, and additionally the taxonomical representations of query-candidate tuples and the Symbiosis prevailing amongst them. SymTax learns to embed the infused taxonomies in hyperbolic space and uses hyperbolic separation as a latent feature to compute query-candidate similarity. We build a novel and large dataset, ArSyTa, containing 8.27 million citation contexts and describe the creation process in detail. We conduct extensive experiments and ablation studies to demonstrate the effectiveness and design choices of each module in our framework. Combinatorial analysis from our experiments also sheds light on the choice of language models (LMs) and fusion embeddings, and on the inclusion of section headings as a signal. Our proposed module that captures the symbiotic relationship alone leads to performance gains of 26.66% and 39.25% in Recall@5 w.r.t. SOTA on the ACL-200 and RefSeer datasets, respectively. The complete framework yields a gain of 22.56% in Recall@5 w.r.t. SOTA on our proposed dataset. The code and dataset are available at https://github.com/goyalkaraniit/SymTax.
- 2024.findings-acl.533
+ 2024.findings-acl.533
goyal-etal-2024-symtax
10.18653/v1/2024.findings-acl.533
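The hyperbolic separation feature above builds on geodesic distance in the Poincaré ball, for which the standard closed form is easy to state; how SymTax turns this distance into a latent feature is not reproduced here.

```python
# Geodesic distance in the Poincaré ball (points must have norm < 1):
# d(u, v) = arcosh(1 + 2*||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2)))
import numpy as np

def poincare_distance(u: np.ndarray, v: np.ndarray) -> float:
    num = 2 * np.sum((u - v) ** 2)
    den = (1 - np.sum(u**2)) * (1 - np.sum(v**2))
    return float(np.arccosh(1 + num / den))

u = np.array([0.1, 0.2])
v = np.array([0.4, -0.3])
print(round(poincare_distance(u, v), 4))
```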
@@ -13515,7 +13515,7 @@
Kunwoo Park, Soongsil University
9009-9024
This paper addresses the critical challenge of assessing the representativeness of news thumbnail images, which often serve as the first visual engagement for readers when an article is disseminated on social media. We focus on whether a news image represents the actors discussed in the news text. To address this challenge, we introduce NewsTT, a manually annotated dataset of 1,000 news thumbnail image and text pairs. We found that pretrained vision and language models, such as BLIP-2, struggle with this task. Since news subjects frequently involve named entities or proper nouns, the pretrained models could have a limited capability to match news actors’ visual and textual appearances. We hypothesize that learning to contrast news text with its counterfactual, in which named entities are replaced, can enhance the cross-modal matching ability of vision and language models. We propose CFT-CLIP, a contrastive learning framework that updates vision and language bi-encoders according to this hypothesis. We found that our simple method can boost the performance of assessing news thumbnail representativeness, supporting our hypothesis. Code and data can be accessed at https://github.com/ssu-humane/news-images-acl24.
- 2024.findings-acl.534
+ 2024.findings-acl.534
yoon-etal-2024-assessing
10.18653/v1/2024.findings-acl.534
@@ -13526,7 +13526,7 @@
Jian Liu, Beijing Jiaotong University
9025-9038
Event Extraction (EE) is an essential information extraction task that aims to extract event-related information from unstructured texts. The paradigm of this task has shifted from conventional classification-based methods to more contemporary question-answering-based (QA-based) approaches. However, in QA-based EE, the quality of the questions dramatically affects the extraction accuracy, and how to generate high-quality questions for QA-based EE remains a challenge. In this work, to tackle this challenge, we suggest four criteria to evaluate the quality of a question and propose a reinforcement learning method, RLQG, for QA-based EE that can generate generalizable, high-quality, and context-dependent questions and provide clear guidance to QA models. The extensive experiments conducted on the ACE and RAMS datasets have strongly validated our approach’s effectiveness, which also demonstrates its robustness in scenarios with limited training data. The corresponding code of RLQG is released for further research.
- 2024.findings-acl.535
+ 2024.findings-acl.535
hong-liu-2024-towards
10.18653/v1/2024.findings-acl.535
@@ -13541,7 +13541,7 @@
Yang Liu
9039-9052
Despite intensive efforts devoted to tool learning, the problem of budget-constrained tool learning, which focuses on resolving user queries within a specific budget constraint, has been widely overlooked. This paper proposes a novel method for budget-constrained tool learning. Our approach involves creating a preferable plan under the budget constraint before utilizing the tools. This plan outlines the feasible tools and the maximum number of times they can be employed, offering a comprehensive overview of the tool learning process for large language models. This allows them to allocate the budget from a broader perspective. To devise the plan without incurring significant extra costs, we suggest initially estimating the usefulness of the candidate tools based on past experience. Subsequently, we employ dynamic programming to formulate the plan. Experimental results demonstrate that our method can be integrated with various tool learning methods, significantly enhancing their effectiveness under strict budget constraints.
- 2024.findings-acl.536
+ 2024.findings-acl.536
zheng-etal-2024-budget
10.18653/v1/2024.findings-acl.536
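The planning step described above can be pictured as a small bounded-knapsack dynamic program over tool calls; the usefulness estimates, costs, and plan structure below are invented stand-ins for the paper's richer formulation.

```python
# Bounded-knapsack DP: choose how many times each tool may be called so
# that total expected usefulness is maximized under the budget.
def plan_tool_calls(tools, budget):
    # tools: list of (name, cost_per_call, usefulness_per_call, max_calls)
    best = {0: (0.0, {})}  # spent budget -> (value, plan)
    for name, cost, gain, max_calls in tools:
        new_best = dict(best)
        for spent, (value, plan) in best.items():
            for k in range(1, max_calls + 1):
                s = spent + k * cost
                if s > budget:
                    break
                v = value + k * gain
                if v > new_best.get(s, (-1.0, None))[0]:
                    new_best[s] = (v, {**plan, name: k})
        best = new_best
    return max(best.values(), key=lambda t: t[0])

tools = [("search", 2, 3.0, 3), ("calculator", 1, 1.2, 5), ("code_exec", 4, 5.0, 2)]
print(plan_tool_calls(tools, budget=10))  # (14.0, {'search': 3, 'code_exec': 1})
```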
@@ -13557,7 +13557,7 @@
Shuming Shi, Tencent AI Lab
9053-9076
Large language models with instruction-following abilities have revolutionized the field of artificial intelligence. These models show exceptional generalizability to tackle various real-world tasks through their natural language interfaces. However, their performance heavily relies on high-quality exemplar data, which is often difficult to obtain. This challenge is further exacerbated when it comes to multimodal instruction following. We introduce TextBind, an almost annotation-free framework for empowering LLMs with multi-turn interleaved multimodal instruction-following capabilities. Our approach requires only image-caption pairs and generates multi-turn multimodal instruction-response conversations from a language model. To accommodate interleaved image-text inputs and outputs, we devise MIM, a language model-centric architecture that seamlessly integrates image encoder and decoder models. Extensive quantitative and qualitative experiments demonstrate that MIM trained on TextBind achieves remarkable generation capability in multimodal conversations compared to recent baselines.
- 2024.findings-acl.537
+ 2024.findings-acl.537
li-etal-2024-textbind
10.18653/v1/2024.findings-acl.537
@@ -13572,7 +13572,7 @@
Pengfei Liu
9077-9096
Critique, as a natural language description for assessing the quality of model-generated content, has played a vital role in the training, evaluation, and refinement of LLMs. However, a systematic method to evaluate the quality of critique is lacking. In this paper, we pioneer the critique of critique, termed MetaCritique, which builds specific quantification criteria. To achieve a reliable evaluation outcome, we propose Atomic Information Units (AIUs), which describe the critique in a more fine-grained manner. MetaCritique aggregates each AIU’s judgment for the overall score. Moreover, MetaCritique delivers a natural language rationale for the intricate reasoning within each judgment. Lastly, we construct a meta-evaluation dataset covering 4 tasks across 16 public datasets involving human-written and LLM-generated critiques. Experiments demonstrate that MetaCritique can achieve near-human performance. Our study can facilitate future research in LLM critiques based on our following observations and released resources: (1) superior critiques judged by MetaCritique can lead to better refinements, indicating that it can potentially enhance the alignment of existing LLMs; (2) the leaderboard of critique models reveals that open-source critique models commonly suffer from factuality issues; (3) relevant code and data are publicly available at https://anonymous.4open.science/r/MetaCritique-ARR/ to support deeper exploration; (4) an API at PyPI with the usage documentation in Appendix C allows users to assess the critique conveniently.
- 2024.findings-acl.538
+ 2024.findings-acl.538
sun-etal-2024-critique
10.18653/v1/2024.findings-acl.538
@@ -13584,7 +13584,7 @@
Hai Zhao, Shanghai Jiao Tong University
9097-9110
Multimodal large language models (MLLMs) have shown remarkable potential as human-like autonomous language agents to interact with real-world environments, especially for graphical user interface (GUI) automation. However, those GUI agents require comprehensive cognition including exhaustive perception and reliable action response. We propose a Comprehensive Cognitive LLM Agent, CoCo-Agent, with two novel approaches, comprehensive environment perception (CEP) and conditional action prediction (CAP), to systematically improve the GUI automation performance. First, CEP facilitates the GUI perception through different aspects and granularity, including screenshots and complementary detailed layouts for the visual channel and historical actions for the textual channel. Second, CAP decomposes the action prediction into sub-problems: determining the action type and then identifying the action target conditioned on the action type. With our technical design, our agent achieves state-of-the-art performance on AITW and META-GUI benchmarks, showing promising abilities in realistic scenarios. Code is available at https://github.com/xbmxb/CoCo-Agent.
- 2024.findings-acl.539
+ 2024.findings-acl.539
ma-etal-2024-coco
10.18653/v1/2024.findings-acl.539
@@ -13599,7 +13599,7 @@
Jiye Liang, Shanxi University
9111-9128
Structured entailment tree can exhibit the reasoning chains from knowledge facts to predicted answers, which is important for constructing an explainable question answering system. Existing works mainly include directly generating the entire tree and stepwise generating the proof steps. The stepwise methods can exploit combinatoriality and generalize to longer steps, but they have large fact search spaces and error accumulation problems resulting in the generation of invalid steps. In this paper, inspired by the Dual Process Theory in cognitive science, we propose FRVA, a Fact-Retrieval and Verification Augmented bidirectional entailment tree generation method that contains two systems. Specifically, System 1 makes intuitive judgments through the fact retrieval module and filters irrelevant facts to reduce the search space. System 2 designs a deductive-abductive bidirectional reasoning module, and we construct cross-verification and multi-view contrastive learning to make the generated proof steps closer to the target hypothesis. We enhance the reliability of the stepwise proofs to mitigate error propagation. Experiment results on EntailmentBank show that FRVA outperforms previous models and achieves state-of-the-art performance in fact selection and structural correctness.
- 2024.findings-acl.540
+ 2024.findings-acl.540
fan-etal-2024-frva
10.18653/v1/2024.findings-acl.540
@@ -13614,7 +13614,7 @@
QiZhangFudan University
9129-9144
Empowering Large Language Models (LLMs) with distinct human-like personality traits has become an innovative task for developing advanced dialog systems. Although LLMs demonstrate impressive capabilities in following instructions, directly prompting them to exhibit certain personalities through manually crafted instructions may result in sub-optimal performance. In this paper, we propose a plug-and-play prompting method to manipulate the LLMs’ personality traits. Specifically, we append discrete personalized suffixes, automatically generated through an aggregated gradient-based search method, to the user query or dialog histories and induce LLMs to respond with target personalities. In addition, due to the high redundancy of the search space, we adopt a reward-based strategy to prune the vocabulary and focus exclusively on influential tokens. Experiment results on four models ranging from 1.1B to 13B show that our method achieves 79.9% accuracy in customizing LLMs’ personalities, significantly outperforming other prompting methods (65.5%) and model editing methods. Our method also excels in generation fluency and quality with the lowest generation perplexity and the highest GPT-4 evaluation scores.
- 2024.findings-acl.541
+ 2024.findings-acl.541
zhang-etal-2024-p4
10.18653/v1/2024.findings-acl.541
@@ -13627,7 +13627,7 @@
NanDuanMicrosoft Research Asia
9145-9154
One major challenge for Large Language Models (LLMs) is completing complex tasks involving multiple entities, such as tool APIs. To tackle this, one approach is to retrieve relevant entities to enhance LLMs in task completion. A crucial issue here is obtaining accurate natural language representations for each entity to aid in retriever precision. In this paper, we propose the Natural Language Representation Optimization Problem, which aims to refine entity descriptions for improved retrieval and LLM utilization. We introduce the Learning to Represent with Natural Language method, which utilizes LLMs to optimize entity representations consisting of text patterns based on environmental feedback. We iteratively prompt LLMs to enhance or adjust patterns based on entity samples and evaluate their effectiveness through environmental feedback. Our method successfully learns human-understandable representations for classification tasks (e.g., instructions and documents) and API call tasks (e.g., APIbench and Virtual Home), significantly improving GPT-4’s task performance.
- 2024.findings-acl.542
+ 2024.findings-acl.542
guo-etal-2024-large
10.18653/v1/2024.findings-acl.542
@@ -13640,7 +13640,7 @@
YangFengInstitute of Computing Technology, Chinese Academy of Sciences
9155-9161
Direct speech-to-speech translation (S2ST) has achieved impressive translation quality, but it often faces the challenge of slow decoding due to the considerable length of speech sequences. Recently, some research has turned to non-autoregressive (NAR) models to expedite decoding, yet the translation quality typically lags behind autoregressive (AR) models significantly. In this paper, we investigate the performance of CTC-based NAR models in S2ST, as these models have shown impressive results in machine translation. Experimental results demonstrate that by combining pretraining, knowledge distillation, and advanced NAR training techniques such as glancing training and non-monotonic latent alignments, CTC-based NAR models achieve translation quality comparable to the AR model, while preserving up to 26.81× decoding speedup.
- 2024.findings-acl.543
+ 2024.findings-acl.543
fang-etal-2024-ctc
10.18653/v1/2024.findings-acl.543
@@ -13653,7 +13653,7 @@
TongRuan
9162-9175
Clinical terminology normalization aims at finding standard terms from a given termbase for mentions extracted from clinical texts. However, we found that extracted mentions suffer from the multi-implication problem, especially disease diagnoses. The reason for this is that physicians often use abbreviations, conjunctions, and juxtapositions when writing diagnoses, which are difficult to decompose manually. To address this problem, we propose a Terminology Component Recognition and Reconstruction strategy that leverages the reasoning capability of large language models (LLMs) to recognize the components of terms, enabling automated decomposition and transforming original mentions into multiple atomic mentions. Furthermore, we adopt the mainstream “Recall and Rank” framework to apply the benefits of the above strategy to the task flow. Leveraging the LLM and advanced sampling strategies, we design a sampling algorithm for atomic mentions and train the recall model using contrastive learning. Besides, the information about the components is also used as knowledge to guide the final term ranking and selection. The experimental results show that our proposed strategy effectively improves the performance of the terminology normalization task and our proposed approach achieves state-of-the-art performance on the experimental dataset. We release our code and data on the repository https://github.com/yuugaochyan/RRNorm.
- 2024.findings-acl.544
+ 2024.findings-acl.544
fan-etal-2024-rrnorm
10.18653/v1/2024.findings-acl.544
@@ -13670,7 +13670,7 @@
TongRuan
9176-9190
Information extraction plays a critical role in natural language processing. When applying large language models (LLMs) to this domain, we discover an unexpected phenomenon: LLMs’ spurious associations. In tasks such as relation extraction, LLMs can accurately identify entity pairs, even if the given relation (label) is semantically unrelated to the pre-defined original one. To find these labels, we design two strategies in this study, including forward label extension and backward label validation. We also leverage the extended labels to improve model performance. Our comprehensive experiments show that spurious associations occur consistently in both Chinese and English datasets across various LLM sizes. Moreover, the use of extended labels significantly enhances LLM performance in information extraction tasks. Remarkably, there is a performance increase of 9.55%, 11.42%, and 21.27% in F1 scores on the SciERC, ACE05, and DuEE datasets, respectively.
- 2024.findings-acl.545
+ 2024.findings-acl.545
zhang-etal-2024-unexpected
10.18653/v1/2024.findings-acl.545
@@ -13684,7 +13684,7 @@
LiboQinHarbin Institute of Technology
9191-9200
Cross-lingual chain-of-thought can effectively complete reasoning tasks across languages, which has gained increasing attention. Recently, dominant approaches in the literature improve cross-lingual alignment capabilities by integrating reasoning knowledge from different languages. Despite achieving excellent performance, current methods still have two main challenges: (1) Manual language specification: They still highly rely on manually selecting the languages to integrate, severely affecting their generalizability; (2) Static weight allocation: Current methods simply integrate all languages equally. In fact, different language reasoning paths should have different weights to achieve better complementation and integration. Motivated by this, we introduce Automatic Cross-lingual Alignment Planning (AutoCAP) for zero-shot chain-of-thought to address the above challenges. The core of AutoCAP consists of two components: (1) Automatic Language Selection Prompting to guide LLMs to select appropriate languages and (2) Automatic Weight Allocation Prompting to automatically allocate alignment weight scores to each reasoning path. Extensive experiments on several benchmarks reveal that AutoCAP achieves state-of-the-art performance, surpassing previous methods that required manual effort.
- 2024.findings-acl.546
+ 2024.findings-acl.546
zhang-etal-2024-autocap
10.18653/v1/2024.findings-acl.546
@@ -13698,7 +13698,7 @@
JieZhou
9201-9214
Multilingual neural machine translation models generally distinguish translation directions by the language tag (LT) in front of the source or target sentences. However, current LT strategies cannot indicate the desired target language as expected on zero-shot translation, i.e., the off-target issue. Our analysis reveals that the indication of the target language is sensitive to the placement of the target LT. For example, when placing the target LT on the decoder side, the indication would rapidly degrade along with decoding steps, while placing the target LT on the encoder side would lead to copying or paraphrasing the source input. To address the above issues, we propose a simple yet effective strategy named Language Converter Strategy (LCS). By introducing the target language embedding into the top encoder layers, LCS mitigates confusion in the encoder and ensures stable language indication for the decoder. Experimental results on MultiUN, TED, and OPUS-100 datasets demonstrate that LCS could significantly mitigate the off-target issue, with language accuracy of up to 95.28%, 96.21%, and 85.35%, while outperforming the vanilla LT strategy by 3.07, 3.3, and 7.93 BLEU scores on zero-shot translation, respectively.
- 2024.findings-acl.547
+ 2024.findings-acl.547
sun-etal-2024-lcs
10.18653/v1/2024.findings-acl.547
@@ -13713,7 +13713,7 @@
YansongFengPeking University
9215-9235
Quantitative reasoning is a critical skill to analyze data, yet the assessment of such ability remains limited. To address this gap, we introduce the Quantitative Reasoning with Data (QRData) benchmark, aiming to evaluate Large Language Models’ capability in statistical and causal reasoning with real-world data. The benchmark comprises a carefully constructed dataset of 411 questions accompanied by data sheets from textbooks, online learning materials, and academic papers. To compare models’ quantitative reasoning abilities on data and text, we enrich the benchmark with an auxiliary set of 290 text-only questions, namely QRText. We evaluate natural language reasoning, program-based reasoning, and agent reasoning methods including Chain-of-Thought, Program-of-Thoughts, ReAct, and code interpreter assistants on diverse models. The strongest model, GPT-4, achieves an accuracy of 58%, leaving much room for improvement. Among open-source models, Deepseek-coder-instruct, a code LLM pretrained on 2T tokens, gets the highest accuracy of 37%. Analysis reveals that models encounter difficulties in data analysis and causal reasoning, and struggle to use causal knowledge and provided data simultaneously. Code and data are available at https://github.com/xxxiaol/QRData.
- 2024.findings-acl.548
+ 2024.findings-acl.548
liu-etal-2024-llms
10.18653/v1/2024.findings-acl.548
@@ -13731,7 +13731,7 @@
FangzhaoWuMicrosoft
9236-9260
Large language models (LLMs) possess immense capabilities but are susceptible to malicious exploitation. To mitigate the risk, safety alignment is employed to align LLMs with ethical standards. However, safety-aligned LLMs may remain vulnerable to carefully crafted jailbreak attacks, although these attacks often face high rejection rates and limited harmfulness. In this paper, we expose the vulnerabilities of safety alignment in open-access LLMs, which can significantly enhance the success rate and harmfulness of jailbreak attacks. Through reverse alignment, achieved by accessing model parameters, we show the feasibility of efficiently fine-tuning LLMs to undermine their inherent safeguards. We investigate two types of reverse alignment techniques: reverse supervised fine-tuning (RSFT) and reverse preference optimization (RPO). RSFT operates by supervising the fine-tuning of LLMs to reverse their inherent values. We also explore how to prepare data needed for RSFT. RPO optimizes LLMs to enhance their preference for harmful content, reversing the models’ safety alignment. Our extensive experiments reveal that open-access high-performance LLMs can be adeptly reverse-aligned to output harmful content, even in the absence of manually curated malicious datasets. Our research acts as a whistleblower for the community, emphasizing the need to pay more attention to the safety of open-access LLMs. It also underscores the limitations of current safety alignment approaches and calls for research on robust safety alignment methods to counteract malicious fine-tuning attacks.
- 2024.findings-acl.549
+ 2024.findings-acl.549
yi-etal-2024-vulnerability
10.18653/v1/2024.findings-acl.549
@@ -13744,7 +13744,7 @@
YanruZhou
9261-9273
Pre-trained language models (PLMs) have shown great dialogue generation capability in different scenarios. However, the huge VRAM consumption when fine-tuning them is one of their drawbacks. Parameter-efficient fine-tuning (PEFT) approaches can significantly reduce the number of trainable parameters, which enables us to fine-tune larger dialogue generation models. However, the reduction in parameter quantity can diminish a PLM’s expressive capacity and affect the PLM’s learning from certain specific examples like knowledge-related conversations. Previous works have demonstrated that injecting external knowledge into dialogue generation models can improve the model’s performance in knowledge-related conversations. Nonetheless, these methods are designed for the scenario where most parameters of the entire framework are trainable. In this paper, we propose PEK, a parameter-efficient framework for knowledge-enhanced dialogue generation. It enables PLMs to leverage external knowledge documents and knowledge graphs to enhance their generation capabilities with an acceptable number of trainable parameters. Evaluation results on the Wizard of Wikipedia and CMU_DoG datasets show that our approach outperforms baseline methods on multiple evaluation metrics, which validates the effectiveness of our approach.
- 2024.findings-acl.550
+ 2024.findings-acl.550
yang-etal-2024-pek
10.18653/v1/2024.findings-acl.550
@@ -13758,7 +13758,7 @@
HaoranJiaBeijing University of Posts and Telecommunications
9274-9281
Current fact verification methods generally follow the two-stage training paradigm: evidence retrieval and claim verification. While existing works focus on developing sophisticated claim verification modules, the fundamental importance of evidence retrieval is largely ignored. Existing approaches usually adopt a heuristic semantic similarity-based retrieval strategy, resulting in task-irrelevant evidence and undesirable performance. In this paper, we concentrate on evidence retrieval and propose a Retrieval-Augmented Verification framework RAV, consisting of two major modules: the hybrid evidence retrieval and the joint fact verification. The hybrid evidence retrieval module incorporates an efficient retriever for preliminary pruning of candidate evidence, followed by a ranker that generates more precise sorting results. Under this end-to-end training paradigm, gradients from the claim verification can be back-propagated to enhance evidence selection. Experimental results on the FEVER dataset demonstrate the superiority of RAV.
- 2024.findings-acl.551
+ 2024.findings-acl.551
zheng-etal-2024-evidence
10.18653/v1/2024.findings-acl.551
@@ -13773,7 +13773,7 @@
JieZhou
9282-9293
Recently, Knowledge Editing has received increasing attention, since it can update specific knowledge in pretrained models without re-training. However, as pointed out by recent studies, existing related methods tend to merely memorize the superficial word composition of the edited knowledge, rather than truly learning and absorbing it. Consequently, on reasoning questions, we discover that existing methods struggle to utilize the edited knowledge to reason out the new answer, and tend to retain outdated responses, which are generated by the original models utilizing original knowledge. Nevertheless, these outdated responses conflict with the correct answers to reasoning questions, a problem we name the outdated issue. To alleviate this issue, in this paper, we propose a simple yet effective decoding strategy, i.e., outDated ISsue aware deCOding (DISCO), to enhance the performance of edited models on reasoning questions. Specifically, we capture the difference in the probability distribution between the original and edited models. Further, we amplify the difference of the token prediction in the edited model to alleviate the outdated issue, and thus enhance the model performance w.r.t. the edited knowledge. Experimental results suggest that applying DISCO could enhance edited models to reason, e.g., on reasoning questions, DISCO outperforms the prior SOTA method by 12.99 F1 points, and reduces the ratio of the outdated issue to 5.78% on the zsRE dataset.
- 2024.findings-acl.552
+ 2024.findings-acl.552
sun-etal-2024-outdated
10.18653/v1/2024.findings-acl.552
@@ -13784,7 +13784,7 @@
HenningWachsmuthLeibniz Universität Hannover
9294-9313
Dialects introduce syntactic and lexical variations in language that occur in regional or social groups. Most NLP methods are not sensitive to such variations. This may lead to unfair behavior of the methods, conveying negative bias towards dialect speakers. While previous work has studied dialect-related fairness for aspects like hate speech, other aspects of biased language, such as lewdness, remain fully unexplored. To fill this gap, we investigate performance disparities between dialects in the detection of five aspects of biased language and how to mitigate them. To alleviate bias, we present a multitask learning approach that models dialect language as an auxiliary task to incorporate syntactic and lexical variations. In our experiments with African-American English dialect, we provide empirical evidence that complementing common learning approaches with dialect modeling improves their fairness. Furthermore, the results suggest that multitask learning achieves state-of-the-art performance and helps to detect properties of biased language more reliably.
- 2024.findings-acl.553
+ 2024.findings-acl.553
spliethover-etal-2024-disentangling
10.18653/v1/2024.findings-acl.553
@@ -13797,7 +13797,7 @@
FlorianMatthesTechnische Universität München
9314-9328
The task of text privatization using Differential Privacy has recently taken the form of text rewriting, in which an input text is obfuscated via the use of generative (large) language models. While these methods have shown promising results in the ability to preserve privacy, they rely on autoregressive models which lack a mechanism to contextualize the private rewriting process. In response to this, we propose DP-MLM, a new method for differentially private text rewriting based on leveraging masked language models (MLMs) to rewrite text in a semantically similar and obfuscated manner. We accomplish this with a simple contextualization technique, whereby we rewrite a text one token at a time. We find that utilizing encoder-only MLMs provides better utility preservation at lower ε levels, as compared to previous methods relying on larger models with a decoder. In addition, MLMs allow for greater customization of the rewriting mechanism, as opposed to generative approaches. We make the code for DP-MLM public and reusable, found at https://github.com/sjmeis/DPMLM.
- 2024.findings-acl.554
+ 2024.findings-acl.554
meisenbacher-etal-2024-dp
10.18653/v1/2024.findings-acl.554
@@ -13808,7 +13808,7 @@
ThamarSolorioMohamed bin Zayed University of Artificial Intelligence and University of Houston
9329-9339
We present Q-ViD, a simple approach for video question answering (video QA). Unlike prior methods, which are based on complex architectures, computationally expensive pipelines, or closed models like GPTs, Q-ViD relies on a single instruction-aware open vision-language model (InstructBLIP) to tackle video QA using frame descriptions. Specifically, we create captioning instruction prompts that rely on the target questions about the videos and leverage InstructBLIP to obtain video frame captions that are useful to the task at hand. Subsequently, we form descriptions of the whole video using the question-dependent frame captions, and feed that information, along with a question-answering prompt, to a large language model (LLM). The LLM is our reasoning module, and performs the final step of multiple-choice QA. Our simple Q-ViD framework achieves competitive or even higher performance than current state-of-the-art models on a diverse range of video QA benchmarks, including NExT-QA, STAR, How2QA, TVQA and IntentQA.
- 2024.findings-acl.555
+ 2024.findings-acl.555
mogrovejo-solorio-2024-question
10.18653/v1/2024.findings-acl.555
@@ -13825,7 +13825,7 @@
LiangWangCASIA
9340-9353
Fact verification aims to automatically probe the veracity of a claim based on several pieces of evidence. Existing works mainly pursue accuracy improvement and pay little attention to explainability, a critical capability of fact verification systems. Constructing an explainable fact verification system in a complex multi-hop scenario is consistently impeded by the absence of a relevant, high-quality dataset. Previous datasets either suffer from excessive simplification or fail to incorporate essential considerations for explainability. To address this, we present EX-FEVER, a pioneering dataset for multi-hop explainable fact verification. It contains over 60,000 claims involving 2-hop and 3-hop reasoning, each created by summarizing and modifying information from hyperlinked Wikipedia documents. Each instance is accompanied by a veracity label and an explanation that outlines the reasoning path supporting the veracity classification. Additionally, we demonstrate a novel baseline system on our EX-FEVER dataset, showcasing document retrieval, explanation generation, and claim verification, and validate the significance of our dataset. Furthermore, we highlight the potential of utilizing Large Language Models in the fact verification task. We hope our dataset could make a significant contribution by providing ample opportunities to explore the integration of natural language explanations in the domain of fact verification.
- 2024.findings-acl.556
+ 2024.findings-acl.556
ma-etal-2024-ex
10.18653/v1/2024.findings-acl.556
@@ -13842,7 +13842,7 @@
FengZhaoUniversity of Science and Technology of China
9354-9366
Open-sourced Large Language Models (LLMs) have achieved great success in various NLP tasks; however, they are still far inferior to API-based models when acting as agents. How to integrate agent ability into general LLMs becomes a crucial and urgent problem. This paper first delivers three key observations: (1) the current agent training corpus is entangled with both format following and agent reasoning, which significantly shifts from the distribution of its pre-training data; (2) LLMs exhibit different learning speeds on the capabilities required by agent tasks; and (3) current approaches have side-effects when improving agent abilities by introducing hallucinations. Based on the above findings, we propose Agent-FLAN to effectively Fine-tune LANguage models for Agents. Through careful decomposition and redesign of the training corpus, Agent-FLAN enables Llama2-7B to outperform prior best works by 3.5% across various agent evaluation datasets. With comprehensively constructed negative samples, Agent-FLAN greatly alleviates the hallucination issues based on our established evaluation benchmark. Besides, it consistently improves the agent capability of LLMs when scaling model sizes while slightly enhancing the general capability of LLMs. The code and models are available at https://github.com/InternLM/Agent-FLAN.
- 2024.findings-acl.557
+ 2024.findings-acl.557
chen-etal-2024-agent
10.18653/v1/2024.findings-acl.557
@@ -13862,7 +13862,7 @@
MaximPanovMohamed bin Zayed University of Artificial Intelligence
9367-9385
Large language models (LLMs) are notorious for hallucinating, i.e., producing erroneous claims in their output. Such hallucinations can be dangerous, as occasional factual inaccuracies in the generated text might be obscured by the rest of the output being generally factually correct, making it extremely hard for the users to spot them. Current services that leverage LLMs usually do not provide any means for detecting unreliable generations. Here, we aim to bridge this gap. In particular, we propose a novel fact-checking and hallucination detection pipeline based on token-level uncertainty quantification. Uncertainty scores leverage information encapsulated in the output of a neural network or its layers to detect unreliable predictions, and we show that they can be used to fact-check the atomic claims in the LLM output. Moreover, we present a novel token-level uncertainty quantification method that removes the impact of uncertainty about what claim to generate on the current step and what surface form to use. Our method Claim Conditioned Probability (CCP) measures only the uncertainty of a particular claim value expressed by the model. Experiments on the task of biography generation demonstrate strong improvements for CCP compared to the baselines for seven different LLMs and four languages. Human evaluation reveals that the fact-checking pipeline based on uncertainty quantification is competitive with a fact-checking tool that leverages external knowledge.
- 2024.findings-acl.558
+ 2024.findings-acl.558
fadeeva-etal-2024-fact
10.18653/v1/2024.findings-acl.558
@@ -13879,7 +13879,7 @@
BingQinHarbin Institute of Technology
9386-9406
Through pretraining on a corpus with various sources, Large Language Models (LLMs) have gained impressive performance. However, the impact of each component of the pretraining corpus remains opaque. As a result, the organization of the pretraining corpus is still empirical and may deviate from the optimal. To address this issue, we systematically analyze the impact of 48 datasets from 5 major categories of pretraining data of LLMs and measure their impacts on LLMs using benchmarks about nine major categories of model capabilities. Our analyses provide empirical results about the contribution of multiple corpora on the performances of LLMs, along with their joint impact patterns, including complementary, orthogonal, and correlational relationships. We also identify a set of “high-impact data” such as Books that is significantly related to a set of model capabilities. These findings provide insights into the organization of data to support more efficient pretraining of LLMs.
- 2024.findings-acl.559
+ 2024.findings-acl.559
zhao-etal-2024-deciphering
10.18653/v1/2024.findings-acl.559
@@ -13894,7 +13894,7 @@
SaraHookerCohere For AI
9407-9426
Neural Machine Translation models are extremely data and compute-hungry. However, not all datapoints contribute equally to model training and generalization. Data pruning to remove the low-value data points has the benefit of drastically reducing the compute budget without significant drop in model performance. In this paper, we propose a new data pruning technique: Checkpoints Across Time (CAT), that leverages early model training dynamics to identify the most relevant data points for model performance. We benchmark CAT against several data pruning techniques including COMET-QE, LASER and LaBSE. We find that CAT outperforms the benchmarks on Indo-European languages on multiple test sets. When applied to English-German, English-French and English-Swahili translation tasks, CAT achieves comparable performance to using the full dataset, while pruning up to 50% of training data. We inspect the data points that CAT selects and find that it tends to favour longer sentences and sentences with unique or rare words.
- 2024.findings-acl.560
+ 2024.findings-acl.560
chimoto-etal-2024-critical
10.18653/v1/2024.findings-acl.560
@@ -13907,7 +13907,7 @@
MattiaPanniUniversità degli Studi di Modena e Reggio Emilia
9427-9440
Scientific document summarization aims to condense complex and long articles in both technical and plain-language terms to facilitate the accessibility and dissemination of scientific findings. Existing datasets suffer from a deficiency in source heterogeneity, as their data predominantly stem from a single common resource, hindering effective model training and generalizability. First, we introduce SciLay, a novel dataset that includes documents from multiple natural science journals with expert-authored technical and lay summaries. Second, we propose PrunePert, a new transformer-based model that incorporates a differentiable perturbed top-k encoder layer to prune irrelevant tokens in end-to-end learning. Experimental results show that our model achieves a nearly 2x speed-up compared to a state-of-the-art linear transformer, remaining comparable in effectiveness. Additional examinations underscore the importance of employing a training dataset that includes different sources to enhance the generalizability of the models. Code is available at https://github.com/disi-unibo-nlp/sci-lay.
- 2024.findings-acl.561
+ 2024.findings-acl.561
ragazzi-etal-2024-token
10.18653/v1/2024.findings-acl.561
@@ -13922,7 +13922,7 @@
HoangThanh LamInternational Business Machines
9441-9457
Zero-shot entity and relation classification models leverage available external information of unseen classes – e.g., textual descriptions – to annotate input text data. Thanks to the minimum data requirement, Zero-Shot Learning (ZSL) methods have high value in practice, especially in applications where labeled data is scarce. Even though recent research in ZSL has demonstrated significant results, our analysis reveals that those methods are sensitive to provided textual descriptions of entities (or relations). Even a minor modification of descriptions can lead to a change in the decision boundary between entity (or relation) classes. In this paper, we formally define the problem of identifying effective descriptions for zero-shot inference. We propose a strategy for generating variations of an initial description, a heuristic for ranking them, and an ensemble method capable of boosting the predictions of zero-shot models through description enhancement. Empirical results on four different entity and relation classification datasets show that our proposed method outperforms existing approaches and achieves new SOTA results on these datasets under the ZSL settings. The source code of the proposed solutions and the evaluation framework are open-sourced.
- 2024.findings-acl.562
+ 2024.findings-acl.562
picco-etal-2024-description
10.18653/v1/2024.findings-acl.562
@@ -13937,7 +13937,7 @@
MinZhangHarbin Institute of Technology, Shenzhen
9458-9469
kNN-MT has utilized neighborhood knowledge for auxiliary decoding, significantly improving translation performance. Subsequently, kNN-KD transitions the use of neighborhood knowledge from the decoding phase to the training phase, to address the temporal and spatial inefficiencies inherent in kNN-MT. However, kNN-KD transfers all the kNN knowledge arbitrarily, which has the potential to restrict the learning of student models. In this paper, we propose a novel domain-aware kNN-KD method, which filters out domain-relevant neighborhood knowledge for learning in the distillation process. Notably, this entire process exclusively utilizes the neighborhood knowledge of the original model, eliminating the need for establishing any additional datastores. Experiments on four domain translation tasks demonstrate that our method achieves state-of-the-art performance, realizing an average gain of 1.55 COMET and 1.42 BLEU scores, by further enhancing the translation of rare words. Source code can be accessed at https://github.com/wangzx1219/Dk-KD.
- 2024.findings-acl.563
+ 2024.findings-acl.563
wang-etal-2024-domain-aware
10.18653/v1/2024.findings-acl.563
@@ -13954,7 +13954,7 @@
WenyuChen
9470-9487
Recent mainstream event argument extraction methods process each event in isolation, resulting in inefficient inference and ignoring the correlations among multiple events. To address these limitations, here we propose a multiple-event argument extraction model DEEIA (Dependency-guided Encoding and Event-specific Information Aggregation), capable of extracting arguments from all events within a document simultaneously. The proposed DEEIA model employs a multi-event prompt mechanism, comprising DE and EIA modules. The DE module is designed to improve the correlation between prompts and their corresponding event contexts, whereas the EIA module provides event-specific information to improve contextual understanding. Extensive experiments show that our method achieves new state-of-the-art performance on four public datasets (RAMS, WikiEvents, MLEE, and ACE05), while significantly saving the inference time compared to the baselines. Further analyses demonstrate the effectiveness of the proposed modules.
- 2024.findings-acl.564
+ 2024.findings-acl.564
liu-etal-2024-beyond-single
10.18653/v1/2024.findings-acl.564
@@ -13972,7 +13972,7 @@
WuYang
9488-9499
Speech-to-text (S2T) generation systems frequently face challenges in low-resource scenarios, primarily due to the lack of extensive labeled datasets. One emerging solution is constructing virtual training samples by interpolating inputs and labels, which has notably enhanced system generalization in other domains. Despite its potential, this technique’s application in S2T tasks has remained under-explored. In this paper, we delve into the utility of interpolation augmentation, guided by several pivotal questions. Our findings reveal that employing an appropriate strategy in interpolation augmentation significantly enhances performance across diverse tasks, architectures, and data scales, offering a promising avenue for more robust S2T systems in resource-constrained settings.
- 2024.findings-acl.565
+ 2024.findings-acl.565
xu-etal-2024-revisiting
10.18653/v1/2024.findings-acl.565
@@ -13986,7 +13986,7 @@
YiZhangAmazon
9500-9522
Large language models (LLMs) are powerful dialogue agents, but specializing them towards fulfilling a specific function can be challenging. Instruction tuning, i.e. tuning models on instructions and sample responses generated by humans (Ouyang et al., 2022), has proven to be an effective method to do so, yet requires a number of data samples that a) might not be available or b) are costly to generate. Furthermore, this cost increases when the goal is to make the LLM follow a specific workflow within a dialogue instead of single instructions. Inspired by the self-play technique in reinforcement learning and the use of LLMs to simulate human agents, we propose a more effective method for data collection through LLMs engaging in a conversation in various roles. This approach generates training data via “self-talk” of LLMs that can be refined and utilized for supervised fine-tuning. We introduce an automated way to measure the (partial) success of a dialogue. This metric is used to filter the generated conversational data that is fed back into the LLM for training. Based on our automated and human evaluations of conversation quality, we demonstrate that such self-talk data improves results. In addition, we examine the various characteristics that showcase the quality of generated dialogues and how they can be connected to their potential utility as training data.
- 2024.findings-acl.566
+ 2024.findings-acl.566
ulmer-etal-2024-bootstrapping
10.18653/v1/2024.findings-acl.566
@@ -13997,7 +13997,7 @@
PijiLiNanjing University of Aeronautics and Astronautics
9523-9537
Parameter-Efficient Fine-Tuning (PEFT) methods enable efficient adaptation of Large Language Models (LLMs) to various downstream applications. However, the effectiveness of PEFT diminishes notably when downstream tasks require accurate learning of specific knowledge. In this paper, we adopt a semantic perspective to investigate this phenomenon, uncovering the reasons behind PEFT’s limitations in knowledge learning tasks. Our findings reveal that: (1) PEFT presents a notable risk of pushing the model away from the intended knowledge target; (2) multiple pieces of knowledge interfere with each other, and such interference suppresses the learning and expression of knowledge features. Based on these insights, we introduce a data filtering strategy to exclude data that is detrimental to knowledge learning and a re-weighted learning strategy to make the model attentive to semantic distance during knowledge learning. Experimental results demonstrate the effectiveness of the proposed method on open-source large language models, further validating the semantic challenge in PEFT and thus paving the way for future research.
- 2024.findings-acl.567
+ 2024.findings-acl.567
wang-li-2024-semantic
10.18653/v1/2024.findings-acl.567
@@ -14009,7 +14009,7 @@
GabrielStanovskyHebrew University of Jerusalem
9538-9550
Document collections of various domains, e.g., legal, medical, or financial, often share some underlying collection-wide structure, which captures information that can aid both human users and structure-aware models. We propose to identify the typical structure of documents within a collection, which requires capturing recurring topics across the collection, while abstracting over arbitrary header paraphrases, and grounding each topic to respective document locations. These requirements pose several challenges: headers that mark recurring topics frequently differ in phrasing, certain section headers are unique to individual documents and do not reflect the typical structure, and the order of topics can vary between documents. Subsequently, we develop an unsupervised graph-based method which leverages both inter- and intra-document similarities, to extract the underlying collection-wide structure. Our evaluations on three diverse domains in both English and Hebrew indicate that our method extracts meaningful collection-wide structure, and we hope that future work will leverage our method for multi-document applications and structure-aware models.
- 2024.findings-acl.568
+ 2024.findings-acl.568
lior-etal-2024-leveraging
10.18653/v1/2024.findings-acl.568
@@ -14023,7 +14023,7 @@
HuajunChenZhejiang University
9551-9565
The development of Large Language Models (LLMs) has greatly advanced the field of drug discovery, with the belief that natural language can enhance human control over molecule design. However, the scarcity of high-quality labeled data remains a challenge for cross text-molecule learning. Existing datasets are limited due to the difficulty of collecting precise molecule-description pairs. Although recent efforts have utilized pseudo data generated by LLMs for augmentation, the lack of specialized chemistry knowledge of LLMs and the absence of an effective high-quality data selector may introduce noise into the annotations, compromising the models’ robustness. To address these challenges, this paper introduces a novel framework that interweaves model fine-tuning and data augmentation to overcome the scarcity of high-quality data. The proposed approach involves an iterative procedure where the model plays dual roles in annotating unlabeled data and sampling a subset of high-quality data until convergence is achieved, enhancing the model’s understanding and adaptability. Additionally, a new dataset called SAPubChem-41 is presented, which comprises meticulously curated high-quality parallel molecule-description pairs designed specifically for fine-tuning purposes. This research provides an important contribution to the field by addressing the need for high-quality datasets and presenting an effective framework for cross text-molecule learning.
- 2024.findings-acl.569
+ 2024.findings-acl.569
jiang-etal-2024-enhancing
10.18653/v1/2024.findings-acl.569
@@ -14034,7 +14034,7 @@
ChunpuXu
9566-9579
A quote tweet enables users to share others’ content while adding their own commentary. In order to enhance public engagement through quote tweets, we investigate the task of generating popular quote tweets. This task aims to produce quote tweets that garner higher popularity, as indicated by increased likes, replies, and retweets. Despite the impressive language generation capabilities of large language models (LLMs), there has been limited research on how LLMs can effectively learn the popularity of text to better engage the public. Therefore, we introduce a novel approach called Response-augmented Popularity-Aligned Language Model (RePALM), which aligns language generation with popularity by leveraging insights from augmented auto-responses provided by readers. We utilize the Proximal Policy Optimization framework with a dual-reward mechanism to jointly optimize for the popularity of the quote tweet and its consistency with the auto-responses. In our experiments, we collected two datasets consisting of quote tweets containing external links and those referencing others’ tweets. Extensive results demonstrate the superiority of RePALM over advanced language models that do not incorporate response augmentation.
- 2024.findings-acl.570
+ 2024.findings-acl.570
yu-etal-2024-repalm
10.18653/v1/2024.findings-acl.570
@@ -14046,7 +14046,7 @@
TiagoPimentelDepartment of Computer Science, ETHZ - ETH Zurich
9580-9597
Tokenisation is a core part of language models (LMs). It involves splitting a character sequence into subwords which are assigned random indices before being served to the LM. However, this process—while typically lossless—may lead to less efficient LM training, because it removes character-level information, thereby making it more difficult to generalise across similar subwords, such as now and Now. We refer to such subwords as near duplicates. In this paper, we study the impact of near duplicate subwords on LM training efficiency. First, we design an experiment that gives us an upper bound to how much we should expect a model to improve if we could perfectly generalise across near duplicates. We do this by duplicating each token in our LM’s vocabulary, creating perfectly equivalent classes of subwords. Experimentally, we find that LMs need roughly 17% more data when trained in a fully duplicated setting. Second, we investigate the impact of naturally occurring near duplicates on LMs. Here, we see that deduplicating them considerably hurts LM performance; but that this loss in performance can be easily mitigated.
- 2024.findings-acl.571
+ 2024.findings-acl.571
schafer-etal-2024-effect
10.18653/v1/2024.findings-acl.571
@@ -14058,7 +14058,7 @@
SandroPezzelleUniversity of Amsterdam
9598-9613
In everyday language use, speakers frequently utter and interpret sentences that are semantically underspecified, namely, whose content is insufficient to fully convey their message or interpret them univocally. For example, to interpret the underspecified sentence “Don’t spend too much”, which leaves implicit what (not) to spend, additional linguistic context or outside knowledge is needed. In this work, we propose a novel Dataset of semantically Underspecified Sentences grouped by Type (DUST) and use it to study whether pre-trained language models (LMs) correctly identify and interpret underspecified sentences. We find that newer LMs are reasonably able to identify underspecified sentences when explicitly prompted. However, interpreting them correctly is much harder for any LMs. Our experiments show that when interpreting underspecified sentences, LMs exhibit little uncertainty, contrary to what theoretical accounts of underspecification would predict. Overall, our study reveals limitations in current models’ processing of sentence semantics and highlights the importance of using naturalistic data and communicative scenarios when evaluating LMs’ language capabilities.
- 2024.findings-acl.572
+ 2024.findings-acl.572
wildenburg-etal-2024-pre
10.18653/v1/2024.findings-acl.572
@@ -14071,7 +14071,7 @@
NeilGongDuke University
9614-9631
Visual hallucination (VH) means that a multi-modal LLM (MLLM) imagines incorrect details about an image in visual question answering. Existing studies find VH instances only in existing image datasets, which results in biased understanding of MLLMs’ performance under VH due to limited diversity of such VH instances. In this work, we propose a tool called VHTest to generate a diverse set of VH instances. Specifically, VHTest finds some initial VH instances in existing image datasets (e.g., COCO), generates a text description for each VH mode, and uses a text-to-image generative model (e.g., DALL-E-3) to generate VH images based on the text descriptions. We collect a benchmark dataset with 1,200 VH instances in 8 VH modes using VHTest. We find that existing MLLMs such as GPT-4, LLaVA-1.5, and MiniGPT-v2 hallucinate for a large fraction of the instances in our benchmark. Moreover, we find that fine-tuning an MLLM using our benchmark dataset reduces its likelihood to hallucinate without sacrificing its performance on other benchmarks. Our benchmarks are publicly available: https://github.com/wenhuang2000/VHTest.
- 2024.findings-acl.573
+ 2024.findings-acl.573
huang-etal-2024-visual
10.18653/v1/2024.findings-acl.573
@@ -14087,7 +14087,7 @@
WeiqingHuangInstitute of Information Engineering, Chinese Academy of Sciences
9632-9651
With the popularity of large language models (LLMs) and their ability to handle longer input documents, there is a growing need for high-quality long document summarization datasets. Although many models already support 16k input, current lengths of summarization datasets are inadequate, and salient information is not evenly distributed. To bridge these gaps, we collect a new summarization dataset called SumSurvey, consisting of more than 18k scientific survey papers. With an average document length exceeding 12k and a quarter exceeding 16k, as well as the uniformity metric outperforming current mainstream long document summarization datasets, SumSurvey brings new challenges and expectations to both fine-tuned models and LLMs. The informativeness of summaries and the models supporting the evaluation of long document summarization warrant further attention. Automatic and human evaluation results on this abstractive dataset confirm this view. Our dataset and code are available at https://github.com/Oswald1997/SumSurvey.
- 2024.findings-acl.574
+ 2024.findings-acl.574
liu-etal-2024-sumsurvey
10.18653/v1/2024.findings-acl.574
@@ -14100,7 +14100,7 @@
EstherSetiawanInstitut Sains dan Teknologi Terpadu Surabaya
9652-9667
Named Entity Recognition (NER) is an important task, but to achieve great performance, it is usually necessary to collect a large amount of labeled data, incurring high costs. In this paper, we propose using open-source Large Language Models (LLM) to generate NER data with only a few labeled examples, reducing the cost of human annotations. Our proposed method is very simple and can perform well using only a few labeled data points. Experimental results on diverse low-resource NER datasets show that our proposed data generation method can significantly improve the baseline. Additionally, our method can be used to augment datasets with class-imbalance problems and consistently improves model performance on macro-F1 metrics.
- 2024.findings-acl.575
+ 2024.findings-acl.575
santoso-etal-2024-pushing
10.18653/v1/2024.findings-acl.575
@@ -14115,7 +14115,7 @@
YingWeiNanyang Technological University
9668-9688
LLMs have marked a revolutionary shift, yet they falter when faced with compositional reasoning tasks. Our research embarks on a quest to uncover the root causes of compositional reasoning failures of LLMs, revealing that most of them stem from improperly generated or leveraged implicit reasoning results. Inspired by our empirical findings, we resort to Logit Lens and an intervention experiment to dissect the inner hidden states of LLMs. This deep dive reveals that implicit reasoning results indeed surface within middle layers and play a causative role in shaping the final explicit reasoning results. Our exploration further locates multi-head self-attention (MHSA) modules within these layers, which emerge as the linchpins in accurate generation and leveraging of implicit reasoning results. Grounded on the above findings, we develop CREME, a lightweight method to patch errors in compositional reasoning via editing the located MHSA modules. Our empirical evidence stands testament to CREME’s effectiveness, paving the way for autonomously and continuously enhancing compositional reasoning capabilities in language models.
- 2024.findings-acl.576
+ 2024.findings-acl.576
li-etal-2024-understanding
10.18653/v1/2024.findings-acl.576
@@ -14125,7 +14125,7 @@
ElenaChistovaFRC CSC RAS
9689-9706
Discourse parsing is a crucial task in natural language processing that aims to reveal the higher-level relations in a text. Despite growing interest in cross-lingual discourse parsing, challenges persist due to limited parallel data and inconsistencies in the Rhetorical Structure Theory (RST) application across languages and corpora. To address this, we introduce a parallel Russian annotation for the large and diverse English GUM RST corpus. Leveraging recent advances, our end-to-end RST parser achieves state-of-the-art results on both English and Russian corpora. It demonstrates effectiveness in both monolingual and bilingual settings, successfully transferring even with limited second-language annotation. To the best of our knowledge, this work is the first to evaluate the potential of cross-lingual end-to-end RST parsing on a manually annotated parallel corpus.
- 2024.findings-acl.577
+ 2024.findings-acl.577
chistova-2024-bilingual
10.18653/v1/2024.findings-acl.577
@@ -14139,7 +14139,7 @@
MrinmayaSachanSwiss Federal Institute of Technology
9707-9731
Educational chatbots are a promising tool for assisting student learning. However, the development of effective chatbots in education has been challenging, as high-quality data is seldom available in this domain. In this paper, we propose a framework for generating synthetic teacher-student interactions grounded in a set of textbooks. Our approaches capture a key aspect of learning interactions where curious students with partial knowledge interactively ask teachers questions about the material in the textbook. We highlight various quality criteria that such dialogues must fulfill and compare several approaches relying on either prompting or finetuning large language models according to these criteria. We use the synthetic dialogues to train educational chatbots and show the benefits of further fine-tuning in educational domains. However, careful human evaluation shows that our best data synthesis method still suffers from hallucinations and tends to reiterate information from previous conversations. Our findings offer insights for future efforts in synthesizing conversational data that strikes a balance between size and quality. We will open-source our data and code.
- 2024.findings-acl.578
+ 2024.findings-acl.578
wang-etal-2024-book2dial
10.18653/v1/2024.findings-acl.578
@@ -14152,7 +14152,7 @@
FengZhangPeking University
9732-9741
The meta-learning paradigm has demonstrated significant effectiveness in few-shot text classification. Currently, numerous efforts are grounded in metric-based learning, utilizing textual feature vectors for classification, with a common emphasis on enlarging inter-class distances to achieve improved classification effectiveness. However, many methods predominantly focus on enhancing the separation of prototypes without taking the semantic relationships between prototypes and class clusters into consideration. This oversight results in incomplete and inaccurate encoding of prototypes within the semantic space, affecting the generality of the learned metric space. In this paper, we propose the utilization of Semantically Enhanced Labels for calibrating class Prototypes (SELP), thereby obtaining prototypes that are more separated and semantically accurate. Additionally, we have devised a center loss to enhance intra-class compactness, coupled with the introduction of a simulated label distribution method to address the overfitting problem. Extensive experiments on eight few-shot text classification datasets show that the proposed method outperforms baselines significantly. Our code is available at https://github.com/tttyyyzzz-zty/SELP.git.
- 2024.findings-acl.579
+ 2024.findings-acl.579
liang-etal-2024-selp
10.18653/v1/2024.findings-acl.579
@@ -14163,7 +14163,7 @@
AndreasVlachosUniversity of Cambridge
9742-9763
Scientific writing is a challenging task, particularly for novice researchers who often rely on feedback from experienced peers. Recent work has primarily focused on improving surface form and style rather than manuscript content. In this paper, we propose a novel task: automated focused feedback generation for scientific writing assistance. We present SWIF²T: a Scientific WrIting Focused Feedback Tool. It is designed to generate specific, actionable and coherent comments, which identify weaknesses in a scientific paper and/or propose revisions to it. Our approach consists of four components - planner, investigator, reviewer and controller - leveraging multiple Large Language Models (LLMs) to implement them. We compile a dataset of 300 peer reviews citing weaknesses in scientific papers and conduct human evaluation. The results demonstrate the superiority in specificity, reading comprehension, and overall helpfulness of SWIF²T’s feedback compared to other approaches. In our analysis, we also identified cases where automatically generated reviews were judged better than human ones, suggesting opportunities for integration of AI-generated feedback in scientific writing.
- 2024.findings-acl.580
+ 2024.findings-acl.580
chamoun-etal-2024-automated
10.18653/v1/2024.findings-acl.580
@@ -14176,7 +14176,7 @@
JundongLiUniversity of Virginia
9764-9780
In-context learning (ICL) empowers large language models (LLMs) to tackle new tasks by using a series of training instances as prompts. Since generating the prompts needs to sample from a vast pool of instances and annotate them (e.g., add labels in classification task), existing methods have proposed to select a subset of unlabeled examples for annotation, thus enhancing the quality of prompts and concurrently mitigating annotation costs. However, these methods often require a long time to select instances due to their complexity, hindering their practical viability. To address this limitation, we propose a graph-based selection method, FastGAS, designed to efficiently identify high-quality instances while minimizing computational overhead. Initially, we construct a data similarity graph based on instance similarities. Subsequently, employing a graph partitioning algorithm, we partition the graph into pieces. Within each piece (i.e., subgraph), we adopt a greedy approach to pick the most representative nodes. By aggregating nodes from diverse pieces and annotating the corresponding instances, we identify a set of diverse and representative instances for ICL. Compared to prior approaches, our method not only exhibits superior performance on different tasks but also significantly reduces selection time. In addition, we demonstrate the efficacy of our approach in LLMs of larger sizes.
- 2024.findings-acl.581
+ 2024.findings-acl.581
chen-etal-2024-fastgas
10.18653/v1/2024.findings-acl.581
@@ -14191,7 +14191,7 @@
WeipingWang
9781-9793
Structured pruning fundamentally reduces computational and memory overheads of large language models (LLMs) and offers a feasible solution for end-side LLM deployment. Structurally pruned models remain dense and high-precision, highly compatible with further tuning and compression. However, as the coarse-grained structured pruning poses large damage to the highly interconnected model, achieving a high compression ratio for scaled-up LLMs remains a challenge. In this paper, we introduce a task-agnostic structured pruning approach coupled with a compact Transformer architecture design. The proposed approach, named TransAct, reduces transitional activations inside multi-head attention (MHA) and multi-layer perceptron (MLP) modules, while preserving the inter-module activations that are sensitive to perturbations. Hence, the LLM is pruned into an intra-module low-rank architecture, significantly reducing weights, KV Cache and attention computation. TransAct is implemented on the LLaMA model and evaluated on downstream benchmarks. Results verify the optimality of our approach at high compression with respect to both efficiency and performance. Further, ablation studies reveal the strength of activation-guided iterative pruning and provide experimental analysis on the redundancy of MHA and MLP modules.
- 2024.findings-acl.582
+ 2024.findings-acl.582
shen-etal-2024-pruning
10.18653/v1/2024.findings-acl.582
@@ -14201,7 +14201,7 @@
Yang Feng, Institute of Computing Technology, Chinese Academy of Sciences
9794-9801
Subword tokenization is a common method for vocabulary building in Neural Machine Translation (NMT) models. However, increasingly complex tasks have revealed its disadvantages. First, a vocabulary cannot be modified once it is learned, making it hard to adapt to new words. Second, in multilingual translation, the imbalance in data volumes across different languages spreads to the vocabulary, exacerbating translations involving low-resource languages. While byte-based tokenization addresses these issues, byte-based models struggle with the low information density inherent in UTF-8 byte sequences. Previous works enhance token semantics through local contextualization but fail to select an appropriate contextualizing scope based on the input. Consequently, we propose the Multi-Scale Contextualization (MSC) method, which learns contextualized information of varying scales across different hidden state dimensions. It then leverages the attention module to dynamically integrate the multi-scale contextualized information. Experiments show that MSC significantly outperforms subword-based and other byte-based methods in both multilingual and out-of-domain scenarios. Code can be found at https://github.com/ictnlp/Multiscale-Contextualization.
- 2024.findings-acl.583
+ 2024.findings-acl.583
huang-feng-2024-integrating
10.18653/v1/2024.findings-acl.583
@@ -14214,7 +14214,7 @@
Jacob Andreas, Massachusetts Institute of Technology and Microsoft
9802-9818
While language models (LMs) can sometimes generate factually correct text and estimate truth values of individual claims, these generally do not reflect a globally coherent, manipulable model of the world. As a consequence, current LMs also generate incorrect or nonsensical content, and are difficult to edit and bring up to date. We present a method called Deductive Closure Training (DCT) that uses LMs themselves to identify implications of (and contradictions within) the text that they generate, yielding an efficient self-supervised procedure for improving LM factuality. Given a collection of seed documents, DCT prompts LMs to generate additional text implied by these documents, reason globally about the correctness of this generated text, and finally fine-tune on text inferred to be correct. Given seed documents from a trusted source, DCT provides a tool for supervised model updating; if seed documents are sampled from the LM itself, DCT enables fully unsupervised fine-tuning for improved coherence and accuracy. Across the CREAK, MQuAKE, and Reversal Curse datasets, supervised DCT improves LM fact verification and text generation accuracy by 3-26%; on CREAK, fully unsupervised DCT improves verification accuracy by 12%. These results show that LMs’ reasoning capabilities during inference can be leveraged during training to improve their reliability.
- 2024.findings-acl.584
+ 2024.findings-acl.584
akyurek-etal-2024-deductive
10.18653/v1/2024.findings-acl.584
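Stripped to its loop, the Deductive Closure Training procedure above is: expand seed documents with LM-generated implications, score the expanded set for global correctness, and fine-tune on what survives. A minimal sketch of one round follows, with the three model operations passed in as callables since the abstract does not fix their implementation; all three names are hypothetical stand-ins.

```python
from typing import Callable, Iterable

def dct_round(
    seed_docs: Iterable[str],
    generate: Callable[[str], list[str]],             # hypothetical: LM proposes implied statements
    score_truth: Callable[[list[str]], list[float]],  # hypothetical: global correctness scores
    finetune: Callable[[list[str]], None],            # hypothetical: one supervised training pass
    threshold: float = 0.5,
) -> list[str]:
    """One round of the DCT loop: expand seeds with implied text, keep what
    scores as globally correct, fine-tune on the survivors."""
    candidates: list[str] = []
    for doc in seed_docs:
        candidates.append(doc)
        candidates.extend(generate(doc))  # implications of (and contradictions within) doc
    scores = score_truth(candidates)      # reason globally about correctness
    accepted = [c for c, s in zip(candidates, scores) if s >= threshold]
    finetune(accepted)                    # train only on text inferred to be correct
    return accepted
```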
@@ -14227,7 +14227,7 @@
Zhou Zhao, Zhejiang University
9819-9831
Speech-to-singing voice conversion (STS) task always suffers from data scarcity, because it requires paired speech and singing data. Compounding this issue are the challenges of content-pitch alignment and the suboptimal quality of generated outputs, presenting significant hurdles in STS research. This paper presents SVPT, an STS approach boosted by a self-supervised singing voice pre-training model. We leverage spoken language model techniques to tackle the rhythm alignment problem and the in-context learning capability to achieve zero-shot conversion. We adopt discrete-unit random resampling and pitch corruption strategies, enabling training with unpaired singing data and thus mitigating the issue of data scarcity. SVPT also serves as an effective backbone for singing voice synthesis (SVS), offering insights into scaling up SVS models. Experimental results indicate that SVPT delivers notable improvements in both STS and SVS endeavors. Audio samples are available at https://speech2sing.github.io.
- 2024.findings-acl.585
+ 2024.findings-acl.585
li-etal-2024-self-supervised
10.18653/v1/2024.findings-acl.585
@@ -14238,7 +14238,7 @@
Daniel Fried, Carnegie Mellon University
9832-9850
The task of persona-steered text generation requires large language models (LLMs) to generate text that reflects the distribution of views that an individual fitting a persona could have. People have multifaceted personas, but prior work on bias in LLM-generated opinions has only explored multiple-choice settings or one-dimensional personas. We define an incongruous persona as a persona with multiple traits where one trait makes its other traits less likely in human survey data, e.g. political liberals who support increased military spending. We find that LLMs are 9.7% less steerable towards incongruous personas than congruous ones, sometimes generating the stereotypical stance associated with its demographic rather than the target stance. Models that we evaluate that are fine-tuned with Reinforcement Learning from Human Feedback (RLHF) are more steerable, especially towards stances associated with political liberals and women, but present significantly less diverse views of personas. We also find variance in LLM steerability that cannot be predicted from multiple-choice opinion evaluation. Our results show the importance of evaluating models in open-ended text generation, as it can surface new LLM opinion biases. Moreover, such a setup can shed light on our ability to steer models toward a richer and more diverse range of viewpoints.
- 2024.findings-acl.586
+ 2024.findings-acl.586
liu-etal-2024-evaluating-large
10.18653/v1/2024.findings-acl.586
@@ -14254,7 +14254,7 @@
Enhong Chen, University of Science and Technology of China
9851-9862
The rapid increase in multimedia data has spurred advancements in Multimodal Summarization with Multimodal Output (MSMO), which aims to produce a multimodal summary that integrates both text and relevant images. The inherent heterogeneity of content within multimodal inputs and outputs presents a significant challenge to the execution of MSMO. Traditional approaches typically adopt a holistic perspective on coarse image-text data or individual visual objects, overlooking the essential connections between objects and the entities they represent. To integrate the fine-grained entity knowledge, we propose an Entity-Guided Multimodal Summarization model (EGMS). Our model, building on BART, utilizes dual multimodal encoders with shared weights to process text-image and entity-image information concurrently. A gating mechanism then combines visual data for enhanced textual summary generation, while image selection is refined through knowledge distillation from a pre-trained vision-language model. Extensive experiments on the public MSMO dataset validate the superiority of the EGMS method and prove the necessity of incorporating entity information into the MSMO problem.
- 2024.findings-acl.587
+ 2024.findings-acl.587
zhang-etal-2024-leveraging
10.18653/v1/2024.findings-acl.587
@@ -14269,7 +14269,7 @@
Mengxin Zheng, University of Central Florida
9863-9875
It is imperative to ensure the stability of every prediction made by a language model; that is, a language model’s prediction should remain consistent despite minor input variations, like word substitutions. In this paper, we investigate the problem of certifying a language model’s robustness against Universal Text Perturbations (UTPs), which have been widely used in universal adversarial attacks and backdoor attacks. Existing certified robustness based on random smoothing has shown considerable promise in certifying the input-specific text perturbations (ISTPs), operating under the assumption that any random alteration of a sample’s clean or adversarial words would negate the impact of sample-wise perturbations. However, with UTPs, masking only the adversarial words can eliminate the attack. A naive method is to simply increase the masking ratio and the likelihood of masking attack tokens, but it leads to a significant reduction in both certified accuracy and the certified radius due to input corruption by extensive masking. To solve this challenge, we introduce a novel approach, the superior prompt search method, designed to identify a superior prompt that maintains higher certified accuracy under extensive masking. Additionally, we theoretically motivate why ensembles are a particularly suitable choice as base prompts for random smoothing. We denote this method the superior prompt ensembling technique. We also empirically confirm this technique, obtaining state-of-the-art results in multiple settings. These methodologies, for the first time, enable high certified accuracy against both UTPs and ISTPs. The source code of CR-UTP is available at https://github.com/UCF-ML-Research/CR-UTP.
- 2024.findings-acl.588
+ 2024.findings-acl.588
lou-etal-2024-cr
10.18653/v1/2024.findings-acl.588
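The certification setup above builds on randomized smoothing by word masking: classify many randomly masked copies of the input and take a majority vote, whose margin is what a certified radius is computed from. A minimal sketch of that base mechanism, assuming a hypothetical `classify` callable; the paper's superior prompt search and ensembling sit on top of this and are omitted.

```python
import random
from collections import Counter
from typing import Callable

def smoothed_predict(
    tokens: list[str],
    classify: Callable[[list[str]], str],  # hypothetical base classifier
    mask_ratio: float = 0.3,
    num_samples: int = 100,
    mask_token: str = "[MASK]",
    seed: int = 0,
) -> tuple[str, float]:
    """Majority-vote prediction over randomly masked copies; the vote share
    is what a certification bound would be derived from."""
    rng = random.Random(seed)
    votes: Counter = Counter()
    k = max(1, int(len(tokens) * mask_ratio))  # how many positions to mask
    for _ in range(num_samples):
        masked_idx = set(rng.sample(range(len(tokens)), k))
        masked = [mask_token if i in masked_idx else t for i, t in enumerate(tokens)]
        votes[classify(masked)] += 1
    label, count = votes.most_common(1)[0]
    return label, count / num_samples
```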
@@ -14281,7 +14281,7 @@
Philipp Koehn, Johns Hopkins University
9876-9890
In machine translation, historical models were incapable of handling longer contexts, so the lack of document-level datasets was less noticeable. Now, despite the emergence of long-sequence methods, we remain within a sentence-level paradigm and without data to adequately approach context-aware machine translation. Most large-scale datasets have been processed through a pipeline that discards document-level metadata. In this work, we reconstruct document-level information for three large datasets (ParaCrawl, News Commentary, and Europarl) in German, French, Spanish, Italian, Polish, and Portuguese (paired with English). We then introduce a document-level filtering technique as an alternative to traditional bitext filtering. We present this filtering with analysis to show that this method prefers context-consistent translations rather than those that may have been sentence-level machine translated. Lastly, we train models on these longer contexts and demonstrate improvement in document-level translation without degradation of sentence-level translation. We release our dataset, ParaDocs, and resulting models as a resource to the community.
- 2024.findings-acl.589
+ 2024.findings-acl.589
wicks-etal-2024-recovering
10.18653/v1/2024.findings-acl.589
@@ -14294,7 +14294,7 @@
Erik Cambria, Nanyang Technological University
9891-9908
Metaphor interpretation is a difficult task in natural language understanding. The development of relevant techniques in this domain is slow, mostly because of the lack of large annotated datasets and effective pre-trained language models (PLMs) for metaphor learning. Thus, we propose a large annotated dataset and a PLM for the metaphor interpretation task. Our foundation model is based on a novel anomalous language modeling (ALM) method, which we benchmark with comparable PLM baselines on the new dataset, finding that it largely improves model performance on metaphor identification and interpretation.
- 2024.findings-acl.590
+ 2024.findings-acl.590
mao-etal-2024-metapro
10.18653/v1/2024.findings-acl.590
@@ -14313,7 +14313,7 @@
Gao Huang, Tsinghua University
9909-9953
Recent advances in large language models (LLMs) have led to significant success in using LLMs as agents. Nevertheless, a common assumption that LLMs always process honest information neglects the widespread deceptive or misleading content in human and AI-generated material. This oversight might expose LLMs to malicious manipulations. To enhance LLMs’ ability to identify and counteract deceptive information, in this paper, inspired by humans’ recursive thinking and perspective-taking, we introduce a novel cognitive framework, Recursive Contemplation (ReCon). ReCon combines formulation and refinement contemplation processes; formulation contemplation produces initial thoughts and speech, while refinement contemplation further polishes them. Additionally, we incorporate first-order and second-order perspective transitions into these processes respectively. Specifically, the first-order allows an LLM agent to infer others’ mental states, and the second-order involves understanding how others perceive the agent’s mental state. After integrating ReCon with various LLMs, extensive experiment results from the Avalon game and BigTom benchmark indicate ReCon’s efficacy in aiding LLMs to discern and maneuver around deceptive information without extra fine-tuning and data. Finally, we demonstrate ReCon’s scaling trend with model parameters, and explore the current limitations of LLMs in terms of safety and reasoning, potentially furnishing insights for subsequent research. Our project page can be found at https://shenzhi-wang.github.io/avalon_recon.
- 2024.findings-acl.591
+ 2024.findings-acl.591
wang-etal-2024-boosting-llm
10.18653/v1/2024.findings-acl.591
@@ -14325,7 +14325,7 @@
Ryan Cotterell, Swiss Federal Institute of Technology
9954-9972
Direct preference optimization (DPO) is a successful fine-tuning strategy for aligning large language models with human preferences without the need to train a reward model or employ reinforcement learning. DPO, as originally formulated, relies on binary preference data and fine-tunes a language model to increase the likelihood of a preferred response over a dispreferred response. However, not all preference pairs are equal. Sometimes, the preferred response is only slightly better than the dispreferred one. In other cases, the preference is much stronger. For instance, if a response contains harmful or toxic content, the annotator will have a strong preference against that response. In this paper, we propose a generalization of DPO, termed DPO with an offset (ODPO), that does not treat every preference pair equally during fine-tuning. Intuitively, ODPO requires the difference between the likelihood of the preferred and dispreferred response to be greater than an offset value. The offset is determined based on the extent to which one response is preferred over another. Our experiments on various tasks suggest that ODPO significantly outperforms DPO in aligning language models, especially when the number of preference pairs is limited.
- 2024.findings-acl.592
+ 2024.findings-acl.592
amini-etal-2024-direct
10.18653/v1/2024.findings-acl.592
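The ODPO objective described above has a compact form: the usual DPO implicit-reward margin between preferred and dispreferred responses must exceed a per-pair offset that grows with preference strength. A sketch in PyTorch, assuming the caller supplies the offsets (the abstract does not fix how they are computed from annotation scores):

```python
import torch
import torch.nn.functional as F

def odpo_loss(
    logp_chosen: torch.Tensor,       # policy log-prob of preferred responses
    logp_rejected: torch.Tensor,     # policy log-prob of dispreferred responses
    ref_logp_chosen: torch.Tensor,   # same quantities under the frozen reference model
    ref_logp_rejected: torch.Tensor,
    offset: torch.Tensor,            # per-pair offset reflecting preference strength
    beta: float = 0.1,
) -> torch.Tensor:
    """DPO-with-offset: the implicit reward margin between preferred and
    dispreferred responses must exceed the offset, not just zero."""
    margin = beta * (
        (logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected)
    )
    return -F.logsigmoid(margin - offset).mean()  # offset = 0 recovers plain DPO
```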
@@ -14344,7 +14344,7 @@
Zhou Zhao, Zhejiang University
9973-9986
Direct speech-to-speech translation achieves high-quality results through the introduction of discrete units obtained from self-supervised learning. However, talking head translation, converting audio-visual speech (i.e., talking head video) from one language into another, still confronts several challenges compared to audio speech: (1) Existing methods invariably rely on cascading, synthesizing via both audio and text, resulting in delays and cascading errors. (2) Talking head translation has a limited set of reference frames. If the generated translation exceeds the length of the original speech, the video sequence needs to be supplemented by repeating frames, leading to jarring video transitions. In this work, we propose a model for talking head translation, TransFace, which can directly translate audio-visual speech into audio-visual speech in other languages. It consists of a speech-to-unit translation model to convert audio speech into discrete units and a unit-based audio-visual speech synthesizer, Unit2Lip, to re-synthesize synchronized audio-visual speech from discrete units in parallel. Furthermore, we introduce a Bounded Duration Predictor, ensuring isometric talking head translation and preventing duplicate reference frames. Experiments demonstrate that Unit2Lip significantly improves synchronization and boosts inference speed by a factor of 4.35 on LRS2. Additionally, TransFace achieves impressive BLEU scores of 61.93 and 47.55 for Es-En and Fr-En on LRS3-T and 100% isochronous translations. The samples are available at https://transface-demo.github.io.
- 2024.findings-acl.593
+ 2024.findings-acl.593
cheng-etal-2024-transface
10.18653/v1/2024.findings-acl.593
@@ -14359,7 +14359,7 @@
Mykola Pechenizkiy, Eindhoven University of Technology
9987-10001
Pretrained models learned from real corpora can often capture undesirable features, leading to bias issues against different demographic groups. Most existing studies on bias dataset construction or bias mitigation methods only focus on one demographic group pair to study a certain bias, e.g. black vs. white for racial bias. However, in real-world applications, there are more than two demographic groups that are at risk of the same bias. In this paper, we propose to analyze and reduce biases across multiple demographic groups. We collect and build a multi-demographic bias dataset including five commonly discussed bias dimensions. To mitigate multi-demographic bias, we adopt several novel debiasing methods, including regularisation-based and augmentation-based methods, as well as appropriate evaluation metrics for multi-demographic bias measurement. Experimental results on the proposed multi-demographic dataset show that a fairer model can be achieved using a multi-demographic debiasing approach. Also, the model debiased using the proposed multi-demographic debiasing methods can better transfer to unseen demographics without sacrificing the performance of the pretrained model.
- 2024.findings-acl.594
+ 2024.findings-acl.594
zhao-etal-2024-minorities
10.18653/v1/2024.findings-acl.594
@@ -14373,7 +14373,7 @@
Dong Wang, University of Illinois at Urbana-Champaign
10002-10017
Existing literature that integrates CLIP into federated learning (FL) largely ignores the inherent group unfairness within CLIP and its ethical implications on FL applications. Furthermore, such CLIP bias may be amplified in FL, due to the unique issue of data heterogeneity across clients. However, in identity-sensitive FL applications, model fairness (i.e., group fairness) is imperative for model development. Therefore, this work explores a critical question ignored by the existing literature: how can we build a fair FL framework using biased pre-trained VLMs (e.g., CLIP)? To address this problem, we propose a fairness-aware adaptation framework tailored for VLM (e.g., CLIP) in the context of FL, named Fair Federated Deep Visual Prompting (FF-DVP). As implied by its name, FF-DVP trains a fair FL model with fairness-aware deep visual prompting (DVP). Moreover, it incorporates modality-fused classification heads to learn client-specific knowledge and fairness constraints. These modules explicitly address a unique bias in FL, namely the bias triggered by data heterogeneity. We show that FF-DVP can be readily extended to prevailing parameter-efficient fine-tuning methods (e.g., adapter or LoRA) for debiasing. To the best of our knowledge, FF-DVP is the first to leverage biased VLMs for building fair FL frameworks. Extensive results on human face attribute recognition (FAR) applications suggest that FF-DVP effectively improves model fairness and training convergence, outperforming state-of-the-art baselines.
- 2024.findings-acl.595
+ 2024.findings-acl.595
zeng-etal-2024-fair
10.18653/v1/2024.findings-acl.595
@@ -14395,7 +14395,7 @@
Katrin Kirchhoff
10018-10035
Integrated Speech and Large Language Models (SLMs) that can follow speech instructions and generate relevant text responses have gained popularity lately. However, the safety and robustness of these models remains largely unclear. In this work, we investigate the potential vulnerabilities of such instruction-following speech-language models to adversarial attacks and jailbreaking. Specifically, we design algorithms that can generate adversarial examples to jailbreak SLMs in both white-box and black-box attack settings without human involvement. Additionally, we propose countermeasures to thwart such jailbreaking attacks. Our models, trained on dialog data with speech instructions, achieve state-of-the-art performance on the spoken question-answering task, scoring over 80% on both safety and helpfulness metrics. Despite safety guardrails, experiments on jailbreaking demonstrate the vulnerability of SLMs to adversarial perturbations and transfer attacks, with average attack success rates of 90% and 10% respectively when evaluated on a dataset of carefully designed harmful questions spanning 12 different toxic categories. However, we demonstrate that our proposed countermeasures reduce the attack success significantly.
- 2024.findings-acl.596
+ 2024.findings-acl.596
peri-etal-2024-speechguard
10.18653/v1/2024.findings-acl.596
@@ -14410,7 +14410,7 @@
Ramakanth Pasunuru
10036-10056
The impressive generation capabilities of large language models (LLMs) have made it harder to detect the subtle hallucinations they make in abstractive summarization, where generated summaries consist of a blend of correct and incorrect information w.r.t. a given document. Recently-proposed LLM-based evaluation metrics attempt to capture this, but still face challenges: (1) they are biased towards summaries generated from the same underlying LLM, and (2) they lack interpretability, offering only a single score. In this work, we present ACUEval, a metric that leverages the power of LLMs to perform two sub-tasks: decomposing summaries into atomic content units (ACUs), and validating them against the source document. Compared to current strong LLM-based metrics, our two-step evaluation strategy improves correlation with human judgments of faithfulness on three summarization evaluation benchmarks by 3% in balanced accuracy compared to the next-best metric, and also shows reduced preference bias towards LLM-generated summaries. Further, we show that errors detected by ACUEval can be used to generate actionable feedback for refining the summary, improving the faithfulness scores by more than 10%.
- 2024.findings-acl.597
+ 2024.findings-acl.597
wan-etal-2024-acueval
10.18653/v1/2024.findings-acl.597
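ACUEval's two sub-tasks map naturally onto two LLM calls: decompose the summary into atomic content units, then validate each unit against the source. A minimal sketch with a hypothetical text-in/text-out `llm` callable; the prompt wording and yes/no parsing are assumptions, not the paper's templates.

```python
from typing import Callable

def acu_style_score(
    summary: str,
    source_doc: str,
    llm: Callable[[str], str],  # hypothetical text-in/text-out LLM call
) -> tuple[float, list[str]]:
    """Step 1: decompose the summary into atomic content units (ACUs).
    Step 2: validate each ACU against the source document."""
    listing = llm(
        "List each atomic fact in the following summary, one per line, "
        "prefixed with '- ':\n" + summary
    )
    acus = [ln.strip("- ").strip() for ln in listing.splitlines() if ln.strip()]
    unsupported: list[str] = []
    for acu in acus:
        verdict = llm(
            f"Document:\n{source_doc}\n\nIs this claim supported by the "
            f"document? Answer yes or no.\nClaim: {acu}"
        )
        if not verdict.strip().lower().startswith("yes"):
            unsupported.append(acu)  # each failure is actionable feedback
    score = 1.0 - len(unsupported) / max(1, len(acus))
    return score, unsupported
```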
@@ -14424,18 +14424,18 @@
Jeff Pan, University of Edinburgh
10057-10084
Multimodal Large Language Models (MLLMs) fine-tuned with multimodal instruction-following data have demonstrated formidable capabilities in multimodal tasks. However, fine-tuning all parameters of MLLMs has become challenging due to the rapid growth of the overall model’s parameters. To address this issue, we study Parameter-Efficient Fine-Tuning (PEFT) methods for MLLMs. We aim to identify effective methods for enhancing performance in scenarios where only a limited number of parameters are trained. This paper conducts empirical studies that employ four widely used PEFT methods to fine-tune the LLM component of open-source MLLMs. We present a comprehensive analysis that encompasses various aspects, including the impact of PEFT methods on various models, the parameters and location of the PEFT module, fine-tuning data scale, model stability based on PEFT method, MLLM’s generalization, and hallucination. We evaluated four PEFT methods on seven datasets from two different categories, unseen and seen datasets. Across all experiments, we show that the adapter is the best-performing PEFT method in various aspects. At the same time, fine-tuning the connector layers leads to improved performance in most MLLMs.
- 2024.findings-acl.598
+ 2024.findings-acl.598
zhou-etal-2024-empirical
10.18653/v1/2024.findings-acl.598
PARADISE: Evaluating Implicit Planning Skills of Language Models with Procedural Warnings and Tips Dataset
- Arda Uzunoglu, KUIS AI Lab
- Gözde Şahin, Koç University
- Abdulfattah Safa
+ Arda Uzunoğlu, Johns Hopkins University
+ Abdulfattah Safa, Koç University
+ Gözde Gül Şahin, Koç University
10085-10102
Recently, there has been growing interest within the community regarding whether large language models are capable of planning or executing plans. However, most prior studies use LLMs to generate high-level plans for simplified scenarios lacking linguistic complexity and domain diversity, limiting analysis of their planning abilities. These setups constrain evaluation methods (e.g., predefined action space), architectural choices (e.g., only generative models), and overlook the linguistic nuances essential for realistic analysis. To tackle this, we present PARADISE, an abductive reasoning task using Q&A format on practical procedural text sourced from wikiHow. It involves tip and warning inference tasks directly associated with goals, excluding intermediary steps, with the aim of testing the ability of the models to infer implicit knowledge of the plan solely from the given goal. Our experiments, utilizing fine-tuned language models and zero-shot prompting, reveal the effectiveness of task-specific small models over large language models in most scenarios. Despite advancements, all models fall short of human performance. Notably, our analysis uncovers intriguing insights, such as variations in model behavior with dropped keywords, struggles of BERT-family and GPT-4 with physical and abstract goals, and the proposed tasks offering valuable prior knowledge for other unseen procedural tasks. The PARADISE dataset and associated resources are publicly available for further research exploration at https://anonymous.4open.science/r/paradise-53BD/README.md.
- 2024.findings-acl.599
+ 2024.findings-acl.599
uzunoglu-etal-2024-paradise
10.18653/v1/2024.findings-acl.599
@@ -14450,7 +14450,7 @@
Susan Üsküdarlı, Boğaziçi University
10103-10117
The recent advances in natural language processing have predominantly favored well-resourced English-centric models, resulting in a significant gap with low-resource languages. In this work, we introduce TURNA, a language model developed for the low-resource language Turkish that is capable of both natural language understanding and generation tasks. TURNA is pretrained with an encoder-decoder architecture based on the unified framework UL2 with a diverse corpus that we specifically curated for this purpose. We evaluated TURNA with three generation tasks and five understanding tasks for Turkish. The results show that TURNA outperforms several multilingual models in both understanding and generation tasks and competes with monolingual Turkish models in understanding tasks.
- 2024.findings-acl.600
+ 2024.findings-acl.600
uludogan-etal-2024-turna
10.18653/v1/2024.findings-acl.600
@@ -14466,7 +14466,7 @@
Sadao Kurohashi, Kyoto University
10118-10126
Emotion plays a crucial role in human conversation. This paper underscores the significance of considering emotion in speech translation. We present the MELD-ST dataset for the emotion-aware speech translation task, comprising English-to-Japanese and English-to-German language pairs. Each language pair includes about 10,000 utterances annotated with emotion labels from the MELD dataset. Baseline experiments using the SeamlessM4T model on the dataset indicate that fine-tuning with emotion labels can enhance translation performance in some settings, highlighting the need for further research in emotion-aware speech translation systems.
- 2024.findings-acl.601
+ 2024.findings-acl.601
chen-etal-2024-meld
10.18653/v1/2024.findings-acl.601
@@ -14478,7 +14478,7 @@
Varun Chandrasekaran, University of Illinois Urbana-Champaign
10127-10135
Pretrained language models (PLMs) have shown remarkable few-shot learning capabilities when provided with properly formatted examples. However, selecting the “best” examples remains an open challenge. We propose a complexity-based prompt selection approach for sequence tagging tasks. This approach avoids the training of a dedicated model for selection of examples, and instead uses certain metrics to align the syntactico-semantic complexity of test sentences and examples. We use both sentence- and word-level metrics to match the complexity of examples to the (test) sentence being considered. Our results demonstrate that our approach extracts greater performance from PLMs: it achieves state-of-the-art performance on few-shot NER, with a 5% absolute improvement in F1 score on the CoNLL2003 dataset for GPT-4. We also see large gains of up to 28.85 points (F1/Acc.) in smaller models like GPT-J-6B.
- 2024.findings-acl.602
+ 2024.findings-acl.602
adiga-etal-2024-designing
10.18653/v1/2024.findings-acl.602
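The selection approach above trains no selector model: it computes complexity metrics for the test sentence and retrieves the pool examples that match. A sketch with toy sentence-level proxies (token count, mean token length, type-token ratio) standing in for the paper's syntactico-semantic metrics:

```python
import numpy as np

def complexity_features(sentence: str) -> np.ndarray:
    """Cheap complexity proxies: token count, mean token length, type-token ratio."""
    toks = sentence.split()
    if not toks:
        return np.zeros(3)
    return np.array([
        float(len(toks)),
        float(np.mean([len(t) for t in toks])),
        len(set(toks)) / len(toks),
    ])

def select_examples(test_sentence: str, pool: list[str], n: int = 5) -> list[str]:
    """Pick the n pool examples whose complexity profile best matches the
    test sentence; no selector model is trained."""
    target = complexity_features(test_sentence)
    feats = np.stack([complexity_features(p) for p in pool])
    scale = feats.std(axis=0) + 1e-8          # put the metrics on one scale
    dists = np.linalg.norm((feats - target) / scale, axis=1)
    return [pool[i] for i in np.argsort(dists)[:n]]
```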
@@ -14495,7 +14495,7 @@
Kun Kuang, Zhejiang University
10136-10142
In-context learning (ICL) has emerged as a powerful tool for enhancing large language models (LLMs) in addressing downstream tasks. In this paper, we explore the vital task of example selection in ICL by mimicking the human learning process. We propose a Chain-of-Quizzes (CoQ) framework inspired by educational theories such as Bruner’s Spiral Learning and Mastery Learning theory. Specifically, our framework employs the LLMs to answer the quiz (question in the example) to sift ‘good’ examples, combines these examples iteratively with the increasing complexity, and utilizes a final exam to gauge the combined example chains. Our extensive experiments on diverse reasoning datasets show the proposed approach outperforms baseline models. These findings underscore the framework’s potential for future research.
- 2024.findings-acl.603
+ 2024.findings-acl.603
wu-etal-2024-chain
10.18653/v1/2024.findings-acl.603
@@ -14507,7 +14507,7 @@
Rachel Rudinger, University of Maryland, College Park
10143-10166
Chain-of-thought (COT) prompting can help large language models (LLMs) reason toward correct answers, but its efficacy in reasoning toward incorrect answers is unexplored. This process of elimination (PoE), when used with COT, can enhance self-consistency, interpretability, and tasks such as medical diagnoses of exclusion. Thus, we propose PoE with COT, where LLMs must reason toward incorrect options on multiple-choice questions. We evaluate the ability of GPT-3.5, LLaMA-2, and Falcon to perform PoE with COT on a total of four commonsense and scientific reasoning datasets. We find that the strategy of PoE always underperforms the strategy of choosing the correct answer. The agreement of these strategies is also lower than the self-consistency of each strategy. To study these issues further, we conduct error analyses and give suggestions for future work.
- 2024.findings-acl.604
+ 2024.findings-acl.604
balepur-etal-2024-easy
10.18653/v1/2024.findings-acl.604
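Concretely, PoE with COT changes only the prompt: the model is asked to argue each option wrong and keep the lone survivor. An illustrative template builder follows; the wording is an assumption, not the paper's exact prompt.

```python
def poe_cot_prompt(question: str, options: dict[str, str]) -> str:
    """Prompt that asks the model to reason toward *incorrect* options and
    eliminate them, rather than pick the correct answer directly."""
    lines = [question, ""]
    lines += [f"({label}) {text}" for label, text in sorted(options.items())]
    lines += [
        "",
        "Let's think step by step. For each option, explain why it is",
        "incorrect and eliminate it. The single option you cannot eliminate",
        "is the answer. End with: Answer: (<letter>)",
    ]
    return "\n".join(lines)

# Example:
# print(poe_cot_prompt("What do plants need for photosynthesis?",
#                      {"A": "Moonlight", "B": "Sunlight", "C": "Gravel"}))
```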
@@ -14523,7 +14523,7 @@
Jiabin Zheng
10167-10183
Intent detection aims to identify user goals from utterances, and is a ubiquitous step towards the satisfaction of user-desired needs in many interaction systems. As dynamic and varied intents arise, models that are capable of identifying new intents promptly are required. However, existing studies usually fine-tune discriminative models on the specific defined intent classes, precluding them from being directly adopted to new intent domains. In this paper, we introduce a generative pre-trained intent model that can recognize new intents from different domains in low-resource scenarios. We reformulate intent detection into a generation task and design descriptive and regularized instructions to guide the model effectively to detect new intents in open domains with no parameter updates. To validate the proposed method, we introduce a new intent detection benchmark, including the Meta-Intent Dataset and three types of representative evaluation settings. We conduct extensive experiments which demonstrate that our method outperforms a range of strong baselines that need further fine-tuning or domain-specific samples.
- 2024.findings-acl.605
+ 2024.findings-acl.605
zhang-etal-2024-discrimination
10.18653/v1/2024.findings-acl.605
@@ -14535,7 +14535,7 @@
Aitzaz Ahmad, Amazon
10184-10201
Large language models (LLMs) have demonstrated remarkable open-domain capabilities. LLMs tailored for a domain are typically trained entirely on domain corpus to excel at handling domain-specific tasks. In this work, we explore an alternative strategy of continual pre-training as a means to develop domain-specific LLMs over an existing open-domain LLM. We introduce FinPythia-6.9B, developed through domain-adaptive continual pre-training on the financial domain. Continually pre-trained FinPythia showcases consistent improvements on financial tasks over the original foundational model. We further explore simple but effective data selection strategies for continual pre-training. Our data selection strategies outperform vanilla continual pre-training’s performance with just 10% of corpus size and cost, without any degradation on open-domain standard tasks. Our work proposes an alternative solution to building domain-specific LLMs cost-effectively.
- 2024.findings-acl.606
+ 2024.findings-acl.606
xie-etal-2024-efficient
10.18653/v1/2024.findings-acl.606
@@ -14549,7 +14549,7 @@
Cong Liu, University of California, Riverside
10202-10217
Joint entity and relation extraction is a process that identifies entity pairs and their relations using a single model. We focus on the problem of joint extraction in distantly-labeled data, whose labels are generated by aligning entity mentions with the corresponding entity and relation tags using a knowledge base (KB). One key challenge is the presence of noisy labels arising from both incorrect entity and relation annotations, which significantly impairs the quality of supervised learning. Existing approaches, either considering only one source of noise or making decisions using external knowledge, cannot well-utilize significant information in the training data. We propose DENRL, a generalizable framework that 1) incorporates a lightweight transformer backbone into a sequence labeling scheme for joint tagging, and 2) employs a noise-robust framework that regularizes the tagging model with significant relation patterns and entity-relation dependencies, then iteratively self-adapts to instances with less noise from both sources. Surprisingly, experiments on two benchmark datasets show that DENRL, using merely its own parametric distribution and simple data-driven heuristics, outperforms strong baselines by a large margin with better interpretability.
- 2024.findings-acl.607
+ 2024.findings-acl.607
li-etal-2024-distantly
10.18653/v1/2024.findings-acl.607
@@ -14563,7 +14563,7 @@
Kai Chen
10218-10230
Large Language Models (LLMs) have revolutionized various domains with extensive knowledge and creative capabilities. However, a critical issue with LLMs is their tendency to produce outputs that diverge from factual reality. This phenomenon is particularly concerning in sensitive applications such as medical consultation and legal advice, where accuracy is paramount. Inspired by human lie detectors using physiological responses, we introduce the LLM Factoscope, a novel Siamese network-based model that leverages the inner states of LLMs for factual detection. Our investigation reveals distinguishable patterns in LLMs’ inner states when generating factual versus non-factual content. We demonstrate its effectiveness across various architectures, achieving over 96% accuracy on our custom-collected factual detection dataset. Our work opens a new avenue for utilizing LLMs’ inner states for factual detection and encourages further exploration into LLMs’ inner workings for enhanced reliability and transparency.
- 2024.findings-acl.608
+ 2024.findings-acl.608
he-etal-2024-llm
10.18653/v1/2024.findings-acl.608
@@ -14576,7 +14576,7 @@
Yanfeng Wang, Shanghai Jiao Tong University
10231-10241
Structured data offers an efficient means of organizing information. Existing text-serialization based methods for processing structured data using large language models (LLMs) are not designed to explicitly capture the heterogeneity of structured data. Such methods are suboptimal for LLMs to process structured data, and may lead to large input token sizes and poor robustness to input perturbation. In this paper, we propose a novel framework called DictLLM, which is an efficient and effective framework for the modeling of medical lab reports to deal with the report-assisted diagnosis generation task. DictLLM introduces 1) group positional encoding to maintain permutation invariance, 2) hierarchical attention bias to capture the inductive bias of structured data, and 3) an optimal transport alignment layer to align the embeddings generated by the dict encoder with the LLM, producing a list of fixed-length virtual tokens. We conduct experiments with multiple LLM models on a large-scale real-world medical lab report dataset for automatic diagnosis generation. The results show that our proposed framework outperforms the baseline methods and few-shot GPT-4 in terms of both Rouge-L and Knowledge F1 score. We also conduct multiple experiments and analyze the scalability and robustness of our proposed framework, demonstrating the superiority of our method in modeling the heterogeneous structure of medical dictionary data.
- 2024.findings-acl.609
+ 2024.findings-acl.609
guo-etal-2024-dictllm
10.18653/v1/2024.findings-acl.609
@@ -14588,7 +14588,7 @@
Yefeng Zheng
10242-10257
Automatic evaluation of natural language generation (NLG) tasks has gained extensive research interests, since it can rapidly assess the performance of large language models (LLMs). However, automatic NLG evaluation struggles with medical QA because it fails to focus on the crucial correctness of medical facts throughout the generated text. To address this, this paper introduces a new data structure, imap, designed to capture key information in questions and answers, enabling evaluators to focus on essential details. The imap comprises three components: Query, Constraint, and Inform, each of which is in the form of term-value pairs to represent medical facts in a structural manner. We then introduce imapScore, which compares the corresponding medical term-value pairs in the imap to score generated texts. We utilize GPT-4 to extract imap from questions, human-annotated answers, and generated responses. To mitigate the diversity in medical terminology for fair term-value pairs comparison, we use a medical knowledge graph to assist GPT-4 in determining matches. To compare imapScore with existing NLG metrics, we establish a new benchmark dataset. The experimental results show that imapScore consistently outperforms state-of-the-art metrics, demonstrating an average improvement of 79.8% in correlation with human scores. Furthermore, incorporating imap into n-gram, embedding, and LLM metrics boosts the base versions, increasing correlation with human scores by averages of 89.9%, 81.7%, and 32.6%, respectively.
- 2024.findings-acl.610
+ 2024.findings-acl.610
wang-etal-2024-imapscore
10.18653/v1/2024.findings-acl.610
@@ -14604,7 +14604,7 @@
Xuanjing Huang, Fudan University
10258-10273
Large language models (LLMs) have shown great potential to empower various domains and are often customized by fine-tuning for the requirements of different applications. However, the powerful learning ability of LLMs not only enables them to learn new tasks but also makes them vulnerable to learning undesired behaviors, such as harmfulness and hallucination, as the fine-tuning data often implicitly or explicitly contains such content. Can we fine-tune LLMs on harmful data without learning harmful behaviors? This paper proposes a controllable training framework to make undesired behaviors unlearnable during the fine-tuning process. Specifically, we introduce security vectors to control the model’s behavior and make it consistent with the undesired behavior. Security vectors are activated during fine-tuning, the consistent behavior makes the model believe that such behavior has already been learned and there is no need for further optimization, while inconsistent data can still be learned. After fine-tuning, security vectors are deactivated to restore the LLM’s normal behavior. Our experiments show that the security vectors can prevent LLM from learning harmful and hallucination behavior while preserving the ability to learn other information.
- 2024.findings-acl.611
+ 2024.findings-acl.611
zhou-etal-2024-making
10.18653/v1/2024.findings-acl.611
@@ -14615,7 +14615,7 @@
Manabu Okumura, Tokyo Institute of Technology
10274-10287
Due to biases inherently present in data for pre-training, current pre-trained Large Language Models (LLMs) also ubiquitously manifest the same phenomena. Since the bias influences the output from the LLMs across various tasks, the widespread deployment of the LLMs is hampered. We propose a simple method that utilizes structured knowledge to alleviate this issue, aiming to reduce the bias embedded within the LLMs and ensuring they have an encompassing perspective when used in applications. Experimental results indicated that our method has good debiasing ability when applied to both existing autoregressive and masked language models. Additionally, it could ensure that the performances of LLMs on downstream tasks remain uncompromised. Our method outperforms state-of-the-art (SOTA) baselines in debiasing ability. Importantly, our method obviates the need for training from scratch, thus offering enhanced scalability and cost-effectiveness.
- 2024.findings-acl.612
+ 2024.findings-acl.612
ma-etal-2024-debiasing
10.18653/v1/2024.findings-acl.612
@@ -14632,7 +14632,7 @@
Muhao Chen, University of California, Davis and University of Southern California
10288-10302
Instruction tuning has been used as a promising approach to improve the performance of large language models (LLMs) on unseen tasks. However, current LLMs exhibit limited robustness to unseen instructions, generating inconsistent outputs when the same instruction is phrased with slightly varied forms or language styles. This behavior indicates LLMs’ lack of robustness to textual variations and generalizability to unseen instructions, potentially leading to trustworthiness issues. Accordingly, we propose Contrastive Instruction Tuning, which maximizes the similarity between the hidden representations of semantically equivalent instruction-instance pairs while minimizing the similarity between semantically different ones. To facilitate this approach, we augment the existing FLAN collection by paraphrasing task instructions. Experiments on the PromptBench benchmark show that CoIN consistently improves LLMs’ robustness to unseen instructions with variations across character, word, sentence, and semantic levels by an average of +2.5% in accuracy.
- 2024.findings-acl.613
+ 2024.findings-acl.613
yan-etal-2024-contrastive
10.18653/v1/2024.findings-acl.613
@@ -14647,7 +14647,7 @@
Xueqi Cheng, Chinese Academy of Sciences
10303-10317
Generative retrieval uses differentiable search indexes to directly generate relevant document identifiers in response to a query. Recent studies have highlighted the potential of a strong generative retrieval model, trained with carefully crafted pre-training tasks, to enhance downstream retrieval tasks via fine-tuning. However, the full power of pre-training for generative retrieval remains underexploited due to its reliance on pre-defined static document identifiers, which may not align with evolving model parameters. In this work, we introduce BootRet, a bootstrapped pre-training method for generative retrieval that dynamically adjusts document identifiers during pre-training to accommodate the continuing memorization of the corpus. BootRet involves three key training phases: (i) initial identifier generation, (ii) pre-training via corpus indexing and relevance prediction tasks, and (iii) bootstrapping for identifier updates. To facilitate the pre-training phase, we further introduce noisy documents and pseudo-queries, generated by large language models, to resemble semantic connections in both indexing and retrieval tasks. Experimental results demonstrate that BootRet significantly outperforms existing pre-training generative retrieval baselines and performs well even in zero-shot settings.
- 2024.findings-acl.614
+ 2024.findings-acl.614
tang-etal-2024-bootstrapped
10.18653/v1/2024.findings-acl.614
@@ -14664,7 +14664,7 @@
Donghong Ji
10318-10329
Aspect-based Sentiment Analysis (ABSA) is extensively researched in the NLP community, yet related models face challenges due to data sparsity when shifting to a new domain. Hence, data augmentation for cross-domain ABSA has attracted increasing attention in recent years. However, two key points have been neglected in prior studies: First, target domain unlabeled data are labeled with pseudo labels by the model trained in the source domain with little quality control, leading to inaccuracy and error propagation. Second, the label and text patterns of generated labeled data are monotonous, thus limiting the robustness and generalization ability of trained ABSA models. In this paper, we aim to design a simple yet effective framework to address the above shortcomings in ABSA data augmentation, called Refining and Synthesis Data Augmentation (RSDA). Our framework roughly includes two steps: First, it refines generated labeled data using a natural language inference (NLI) filter to control data quality. Second, it synthesizes diverse labeled data via novel label composition and paraphrase approaches. We conduct experiments on 4 kinds of ABSA subtasks, and our framework outperforms 7 strong baselines, demonstrating its effectiveness.
- 2024.findings-acl.615
+ 2024.findings-acl.615
wang-etal-2024-refining
10.18653/v1/2024.findings-acl.615
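RSDA's refining step above is essentially an entailment gate over generated labeled data. A minimal sketch, assuming a hypothetical `nli_entail_prob(premise, hypothesis)` scorer and a tunable acceptance threshold:

```python
from typing import Callable

def nli_refine(
    pairs: list[tuple[str, str]],                  # (source_text, generated_labeled_text)
    nli_entail_prob: Callable[[str, str], float],  # hypothetical NLI entailment scorer
    threshold: float = 0.9,
) -> list[tuple[str, str]]:
    """Keep a generated example only if the source text entails it,
    discarding likely pseudo-label noise before the synthesis step."""
    return [(src, gen) for src, gen in pairs if nli_entail_prob(src, gen) >= threshold]
```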
@@ -14683,7 +14683,7 @@
Hung-yi Lee, National Taiwan University
10330-10348
The sound codec’s dual roles in minimizing data transmission latency and serving as tokenizers underscore its critical importance. Recent years have witnessed significant developments in codec models. The ideal sound codec should preserve content, paralinguistics, speakers, and audio information. However, the question of which codec achieves optimal sound information preservation remains unanswered, as in different papers, models are evaluated on their selected experimental settings. This study introduces Codec-SUPERB, an acronym for Codec sound processing Universal PERformance Benchmark. It is an ecosystem designed to assess codec models across representative sound applications and signal-level metrics rooted in sound domain knowledge. Codec-SUPERB simplifies result sharing through an online leaderboard, promoting collaboration within a community-driven benchmark database, thereby stimulating new development cycles for codecs. Furthermore, we undertake an in-depth analysis to offer insights into codec models from both application and signal perspectives, diverging from previous codec papers mainly concentrating on signal-level comparisons. Finally, we will release codes, the leaderboard, and data to accelerate progress within the community.
- 2024.findings-acl.616
+ 2024.findings-acl.616
wu-etal-2024-codec
10.18653/v1/2024.findings-acl.616
@@ -14698,7 +14698,7 @@
Piji Li, Nanjing University of Aeronautics and Astronautics
10349-10360
Social media bot detection is increasingly crucial with the rise of social media platforms. Existing methods predominantly construct social networks as graphs and utilize graph neural networks (GNNs) for bot detection. However, most of these methods focus on how to improve the performance of GNNs while neglecting the community structure within social networks. Moreover, GNN-based methods still face problems such as poor model generalization due to the relatively small scale of the dataset and over-smoothness caused by the information propagation mechanism. To address these problems, we propose the Community-Aware Heterogeneous Graph Contrastive Learning framework (i.e., CACL), which constructs the social network as a heterogeneous graph with multiple node types and edge types, and then utilizes a community-aware module to mine both hard positive samples and hard negative samples for supervised graph contrastive learning with adaptive graph enhancement algorithms. Extensive experiments demonstrate that our framework addresses the previously mentioned challenges and outperforms competitive baselines on three social media bot benchmarks.
- 2024.findings-acl.617
+ 2024.findings-acl.617
chen-etal-2024-cacl
10.18653/v1/2024.findings-acl.617
@@ -14711,7 +14711,7 @@
Xiang Ren
10361-10386
Making inferences in text comprehension to understand the meaning is essential in language processing. This work studies the entailment verification (EV) problem of complex, multi-sentence premises requiring a system to make multiple inferences implicitly. Modern applications of EV in detecting inconsistent model-generated rationales require complex multi-hop reasoning. However, current textual inference datasets mostly contain short-sentence premises that only partially address this. To address this, we compile an EV benchmark that includes datasets from three NLP domains (NLI, contextual QA, and rationales) containing multi-sentence premises. On benchmarking humans and LLMs, we find that LLMs are better than humans in multi-hop reasoning across extended contexts, while humans perform better in simple deductive reasoning tasks. We also finetune a Flan-T5 model for EV using two training objectives to obtain a strong open-source model that outperforms GPT-3.5 and rivals GPT-4. Finally, we use our finetuned model to filter out inconsistent model-generated rationales in self-consistency decoding, resulting in a 6% accuracy improvement on average across three MCQ datasets.
- 2024.findings-acl.618
+ 2024.findings-acl.618
sanyal-etal-2024-machines
10.18653/v1/2024.findings-acl.618
@@ -14725,7 +14725,7 @@
Shafiq Joty, SalesForce.com and Nanyang Technological University
10387-10409
Charts provide visual representations of data and are widely used for analyzing information, addressing queries, and conveying insights to others. Various chart-related downstream tasks have emerged recently, such as question-answering and summarization. A common strategy to solve these tasks is to fine-tune various models originally trained on vision-language tasks. However, such task-specific models are not capable of solving a wide range of chart-related tasks, constraining their real-world applicability. To overcome these challenges, we introduce ChartInstruct: a novel chart-specific vision-language instruction-following dataset comprising 191K instructions generated with 71K charts. We then present two distinct systems for instruction tuning on such datasets: (1) an end-to-end model that connects a vision encoder for chart understanding with an LLM; and (2) a pipeline model that employs a two-step approach to extract chart data tables and input them into the LLM. In experiments on four downstream tasks, we first show the effectiveness of our model, achieving a new set of state-of-the-art results. Further evaluation shows that our instruction-tuning approach supports a wide array of real-world chart comprehension and reasoning scenarios, thereby expanding the scope and applicability of our models to new kinds of tasks.
- 2024.findings-acl.619
+ 2024.findings-acl.619
masry-etal-2024-chartinstruct
10.18653/v1/2024.findings-acl.619
@@ -14737,7 +14737,7 @@
Yang Feng, Institute of Computing Technology, Chinese Academy of Sciences
10410-10423
Many-to-many multilingual neural machine translation can be regarded as the process of integrating semantic features from the source sentences and linguistic features from the target sentences. To enhance zero-shot translation, models need to share knowledge across languages, which can be achieved through auxiliary tasks for learning a universal representation or cross-lingual mapping. To this end, we propose to exploit both semantic and linguistic features between multiple languages to enhance multilingual translation. On the encoder side, we introduce a disentangling learning task that aligns encoder representations by disentangling semantic and linguistic features, thus facilitating knowledge transfer while preserving complete information. On the decoder side, we leverage a linguistic encoder to integrate low-level linguistic features to assist in the target language generation. Experimental results on multilingual datasets demonstrate significant improvement in zero-shot translation compared to the baseline system, while maintaining performance in supervised translation. Further analysis validates the effectiveness of our method in leveraging both semantic and linguistic features.
- 2024.findings-acl.620
+ 2024.findings-acl.620
bu-etal-2024-improving
10.18653/v1/2024.findings-acl.620
@@ -14758,7 +14758,7 @@
Vikas Chandra, Meta
10424-10443
Weight-sharing supernets are crucial for performance estimation in cutting-edge neural architecture search (NAS) frameworks. Despite their ability to generate diverse subnetworks without retraining, the quality of these subnetworks is not guaranteed due to weight sharing. In NLP tasks like machine translation and pre-trained language modeling, there is a significant performance gap between supernet and training from scratch for the same model architecture, necessitating retraining after the optimal architecture is identified. This study introduces a solution called mixture-of-supernets, a generalized supernet formulation leveraging mixture-of-experts (MoE) to enhance supernet model expressiveness with minimal training overhead. Unlike conventional supernets, this method employs an architecture-based routing mechanism, enabling indirect sharing of model weights among subnetworks. This customization of weights for specific architectures, learned through gradient descent, minimizes retraining time, significantly enhancing training efficiency in NLP. The proposed method attains state-of-the-art (SoTA) performance in NAS for fast machine translation models, exhibiting a superior latency-BLEU tradeoff compared to HAT, the SoTA NAS framework for machine translation. Furthermore, it excels in NAS for building memory-efficient task-agnostic BERT models, surpassing NAS-BERT and AutoDistil across various model sizes. The code can be found at: https://github.com/UBC-NLP/MoS.
- 2024.findings-acl.621
+ 2024.findings-acl.621
jawahar-etal-2024-mixture
10.18653/v1/2024.findings-acl.621
@@ -14771,7 +14771,7 @@
Yo-Sub Han, Yonsei University
10444-10455
The ever-growing presence of hate speech on social network services and other online platforms not only fuels online harassment but also presents a growing challenge for hate speech detection. As this task is akin to binary classification, one of the promising approaches for hate speech detection is the utilization of contrastive learning. Recent studies suggest that classifying hateful posts in just a binary manner may not adequately address the nuanced task of detecting implicit hate speech. This challenge is largely due to the subtle nature and context dependency of such pejorative remarks. Previous studies proposed a modified contrastive learning approach equipped with additional aids such as human-written implications or machine-generated augmented data for better implicit hate speech detection. While this approach can potentially enhance the overall performance by its additional data in general, it runs the risk of overfitting, as well as the increased cost and time required to obtain such data. These drawbacks serve as motivation for us to design a methodology that is not dependent on human-written or machine-generated augmented data for training. We propose a straightforward, yet effective, clustering-based contrastive learning approach that leverages the shared semantics among the data.
- 2024.findings-acl.622
+ 2024.findings-acl.622
ahn-etal-2024-sharedcon
10.18653/v1/2024.findings-acl.622
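The clustering-based recipe above reduces to: cluster unlabeled post embeddings, then treat same-cluster items as contrastive positives, so training needs neither human-written implications nor augmented data. A sketch using scikit-learn's KMeans; chaining cluster members into consecutive pairs is an illustrative assumption:

```python
import numpy as np
from sklearn.cluster import KMeans

def cluster_positive_pairs(embeddings: np.ndarray, n_clusters: int = 2,
                           seed: int = 0) -> list[tuple[int, int]]:
    """Derive pseudo-positive pairs from shared cluster membership, so
    contrastive training needs no extra human- or machine-written data."""
    labels = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10).fit_predict(embeddings)
    pairs: list[tuple[int, int]] = []
    for c in set(labels):
        idx = np.flatnonzero(labels == c)
        # chain members of one cluster into consecutive positive pairs
        pairs += [(int(idx[i]), int(idx[i + 1])) for i in range(len(idx) - 1)]
    return pairs
```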
@@ -14782,7 +14782,7 @@
Jingbo Shang, University of California, San Diego
10456-10470
Instruction-tuning language models has become a crucial step in aligning them for general use. Typically, this process involves extensive training on large datasets, incurring high training costs. In this paper, we introduce a novel training data selection based on the learning percentage of the samples. We assert that current language models possess the capability to autonomously select high-quality training data, leading to comparable or improved performance compared to training on the entire dataset. Our experiments span different-sized models, revealing that this characteristic holds for models ranging from 1B (small) to 13B (large) in size. Moreover, we demonstrate an interesting finding that the data hardness transfers across model sizes, and a smaller 350M model can effectively curate high-quality training data with hard samples for a larger 13B model, resulting in an equally or superior instruction-tuned model compared to training on the complete dataset. Utilizing open-sourced OPT and Llama-2 models up to 13B in size and two publicly available instruction-tuning training datasets, and evaluating with both automatic metrics and humans, our paper introduces a novel approach to training data selection, showcasing a more efficient alternative.
- 2024.findings-acl.623
+ 2024.findings-acl.623
mekala-etal-2024-smaller
10.18653/v1/2024.findings-acl.623
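The paper selects data by each sample's "learning percentage", which this abstract does not define; a plausible reading is the share of a sample's total loss reduction that a small proxy model achieves early in training, with low-percentage (hard) samples kept for the larger model. A minimal sketch under that assumption, with made-up loss values:

```python
import numpy as np

# Hypothetical per-sample losses from a small proxy model (e.g., 350M),
# recorded at initialization, an early checkpoint, and the final checkpoint.
loss_init = np.array([2.9, 3.1, 2.5, 3.4])
loss_early = np.array([1.0, 2.9, 1.1, 3.2])
loss_final = np.array([0.4, 0.6, 0.5, 0.9])

# "Learning percentage": share of the total loss drop already achieved early.
learned_pct = (loss_init - loss_early) / (loss_init - loss_final)

# Keep the hardest samples (lowest learning percentage) for the larger model.
k = 2
selected = np.argsort(learned_pct)[:k]
print("selected sample indices:", selected)
```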
@@ -14795,7 +14795,7 @@
Daniel Kang, Department of Computer Science
10471-10506
Recent work has embodied LLMs as agents, allowing them to access tools, perform actions, and interact with external content (e.g., emails or websites). However, external content introduces the risk of indirect prompt injection (IPI) attacks, where malicious instructions are embedded within the content processed by LLMs, aiming to manipulate these agents into executing detrimental actions against users. Given the potentially severe consequences of such attacks, establishing benchmarks to assess and mitigate these risks is imperative. In this work, we introduce InjecAgent, a benchmark designed to assess the vulnerability of tool-integrated LLM agents to IPI attacks. InjecAgent comprises 1,054 test cases covering 17 different user tools and 62 attacker tools. We categorize attack intentions into two primary types: direct harm to users and exfiltration of private data. We conduct a comprehensive evaluation of 30 different LLM agents and show that agents are vulnerable to IPI attacks, with ReAct-prompted GPT-4 vulnerable to attacks 24% of the time. Further investigation into an enhanced setting, where the attacker instructions are reinforced with a hacking prompt, shows additional increases in success rates. Our findings raise questions about the widespread deployment of LLM Agents.
- 2024.findings-acl.624
+ 2024.findings-acl.624
zhan-etal-2024-injecagent
10.18653/v1/2024.findings-acl.624
@@ -14812,7 +14812,7 @@
Hai Jin, Huazhong University of Science and Technology
10507-10521
Vulnerability detection based on Code Pre-trained Models (CodePTMs) has achieved promising results in recent years. However, these models struggle to generalize, as they typically learn superficial mappings from source code to labels instead of understanding the root causes of code vulnerabilities, resulting in poor performance in real-world scenarios beyond the training instances. To tackle this challenge, we introduce VulLLM, a novel framework that integrates multi-task learning with Large Language Models (LLMs) to effectively mine deep-seated vulnerability features. Specifically, we construct two auxiliary tasks beyond the vulnerability detection task. First, we utilize the vulnerability patches to construct a vulnerability localization task. Second, based on the vulnerability features extracted from patches, we leverage GPT-4 to construct a vulnerability interpretation task. VulLLM innovatively augments vulnerability classification by leveraging generative LLMs to understand complex vulnerability patterns, thus compelling the model to capture the root causes of vulnerabilities rather than overfitting to spurious features of a single task. The experiments conducted on six large datasets demonstrate that VulLLM surpasses seven state-of-the-art models in terms of effectiveness, generalization, and robustness.
- 2024.findings-acl.625
+ 2024.findings-acl.625
du-etal-2024-generalization
10.18653/v1/2024.findings-acl.625
@@ -14825,7 +14825,7 @@
Lianwen Jin, South China University of Technology
10522-10539
Visually-rich document information extraction (VIE) is a vital aspect of document understanding, wherein Semantic Entity Recognition (SER) plays a significant role. However, few-shot SER on visually-rich documents remains relatively unexplored despite its considerable potential for practical applications. To address this issue, we propose a simple yet effective Plug-and-Play Tag-guided method for few-shot Semantic Entity Recognition (PPTSER) on visually-rich documents. PPTSER is built upon off-the-shelf multi-modal pre-trained models. It leverages the semantics of the tags to guide the SER task, reformulating SER into entity typing and span detection and handling both tasks simultaneously via cross-attention. Experimental results illustrate that PPTSER outperforms existing fine-tuning and few-shot methods, especially in low-data regimes. With full training data, PPTSER achieves comparable or superior performance to the fine-tuning baseline. For instance, on the FUNSD benchmark, our method improves the performance of LayoutLMv3-base in 1-shot, 3-shot and 5-shot scenarios by 15.61%, 2.13%, and 2.01%, respectively. Overall, PPTSER demonstrates promising generalizability, effectiveness, and a plug-and-play nature for few-shot SER on visually-rich documents. The code will be available at [https://github.com/whlscut/PPTSER](https://github.com/whlscut/PPTSER).
- 2024.findings-acl.626
+ 2024.findings-acl.626
liao-etal-2024-pptser
10.18653/v1/2024.findings-acl.626
@@ -14838,7 +14838,7 @@
Dujian Ding, Computing Science, University of British Columbia
10540-10560
In this work, we utilize Large Language Models (LLMs) for a novel use case: constructing Performance Predictors (PP) that estimate the performance of specific deep neural network architectures on downstream tasks. We create PP prompts for LLMs, comprising (i) role descriptions, (ii) instructions for the LLM, (iii) hyperparameter definitions, and (iv) demonstrations presenting sample architectures with efficiency metrics and ‘training from scratch’ performance. In machine translation (MT) tasks, GPT-4 with our PP prompts (LLM-PP) achieves a SoTA mean absolute error, with only a slight degradation in rank correlation coefficient compared to baseline predictors. Additionally, we demonstrate that predictions from LLM-PP can be distilled into a compact regression model (LLM-Distill-PP), which surprisingly retains much of the performance of LLM-PP. This presents a cost-effective alternative for resource-intensive performance estimation. Specifically, for Neural Architecture Search (NAS), we introduce a Hybrid-Search algorithm (HS-NAS) employing LLM-Distill-PP for the initial search stages and reverting to the baseline predictor later. HS-NAS performs similarly to SoTA NAS, reducing search hours by approximately 50%, and in some cases, improving latency, GFLOPs, and model size. The code can be found at: https://github.com/UBC-NLP/llmas.
- 2024.findings-acl.627
+ 2024.findings-acl.627
jawahar-etal-2024-llm
10.18653/v1/2024.findings-acl.627
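The abstract names the four parts of a PP prompt; their exact wording is not given here, so every literal string below is an illustrative stand-in rather than the paper's actual prompt:

```python
# Assemble a performance-predictor (PP) prompt from the four components the
# abstract lists; all concrete wording and values are hypothetical.
role = "You are a performance estimator for machine translation models."
instructions = "Given an architecture's hyperparameters, predict its BLEU score."
hyperparams = "encoder-layers: int; encoder-embed-dim: int; decoder-layers: int"

demonstrations = [  # sample architectures with efficiency metrics and performance
    {"arch": "encoder-layers=6, encoder-embed-dim=512, decoder-layers=6",
     "gflops": 2.1, "bleu": 27.4},
]
demo_text = "\n\n".join(
    f"Architecture: {d['arch']}\nGFLOPs: {d['gflops']}\nBLEU: {d['bleu']}"
    for d in demonstrations
)

test_arch = "encoder-layers=12, encoder-embed-dim=640, decoder-layers=2"
prompt = (f"{role}\n{instructions}\nHyperparameters: {hyperparams}\n\n"
          f"{demo_text}\n\nArchitecture: {test_arch}\nBLEU:")
print(prompt)
```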
@@ -14852,7 +14852,7 @@
Guohong Fu
10561-10573
Dialogue discourse parsing (DDP) aims to capture the relations between utterances in a dialogue. In everyday real-world scenarios, dialogues are typically multi-modal and cover open-domain topics. However, most existing widely used benchmark datasets for DDP contain only the textual modality and are domain-specific. This makes it challenging to accurately and comprehensively understand the dialogue without multi-modal clues, and prevents them from capturing the discourse structures of the more prevalent daily conversations. This paper proposes MODDP, the first multi-modal Chinese discourse parsing dataset derived from open-domain daily dialogues, consisting of 864 dialogues and 18,114 utterances, accompanied by 12.7 hours of video clips. We present a simple yet effective benchmark approach for multi-modal DDP. Through extensive experiments, we present several benchmark results based on MODDP. The significant performance improvement from introducing multi-modalities into the original textual unimodal DDP model demonstrates the necessity of integrating multi-modalities into DDP.
- 2024.findings-acl.628
+ 2024.findings-acl.628
gong-etal-2024-moddp
10.18653/v1/2024.findings-acl.628
@@ -14869,7 +14869,7 @@
Guanghui Fu
10574-10585
In the current environment, psychological issues are prevalent and widespread, with social media serving as a key outlet for individuals to share their feelings. This results in the generation of vast quantities of data daily, where negative emotions have the potential to precipitate crisis situations. There is a recognized need for models capable of efficient analysis. While pre-trained language models have demonstrated their effectiveness broadly, there’s a noticeable gap in pre-trained models tailored for specialized domains like psychology. To address this, we have collected a huge dataset from Chinese social media platforms and enriched it with publicly available datasets to create a comprehensive database encompassing 3.36 million text entries. To enhance the model’s applicability to psychological text analysis, we integrated psychological lexicons into the pre-training masking mechanism. Building on an existing Chinese language model, we performed adaptive training to develop a model specialized for the psychological domain. We evaluated our model’s performance across six public datasets, where it demonstrated improvements compared to eight other models. Additionally, in the qualitative comparison experiment, our model provided psychologically relevant predictions given the masked sentences. Due to concerns regarding data privacy, the dataset will not be made publicly available. However, we have made the pre-trained models and codes publicly accessible to the community via: https://github.com/zwzzzQAQ/Chinese-MentalBERT.
- 2024.findings-acl.629
+ 2024.findings-acl.629
zhai-etal-2024-chinese
10.18653/v1/2024.findings-acl.629
@@ -14884,7 +14884,7 @@
Yu Qiao
10586-10613
A single language model, even when aligned with labelers through reinforcement learning from human feedback (RLHF), may not suit all human preferences. Recent approaches therefore prefer customization, gathering multi-dimensional feedback, and creating distinct reward models for each dimension. Different language models are then optimized for various preferences using multi-objective RLHF (MORLHF) with varying reward weights. However, RL fine-tuning is unstable and resource-heavy, especially with diverse and usually conflicting objectives. In this paper, we present Multi-Objective Direct Preference Optimization (MODPO), an RL-free extension of Direct Preference Optimization (DPO) for multiple alignment objectives. Essentially, MODPO folds language modeling directly into reward modeling, training language models as implicit collective reward models that combine all objectives with specific weights. MODPO theoretically yields the same optimal solutions as MORLHF but is practically more stable and efficient. Empirical results in safety alignment and long-form question answering show that MODPO matches or outperforms existing methods, producing a Pareto front of language models catering to diverse preferences with three times less computational resources compared to MORLHF. Code is available at https://github.com/ZHZisZZ/modpo.
- 2024.findings-acl.630
+ 2024.findings-acl.630
zhou-etal-2024-beyond
10.18653/v1/2024.findings-acl.630
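The core move, folding auxiliary reward margins into a DPO-style objective with per-objective weights, can be sketched in a few lines of PyTorch; this is a toy simplification of the idea, not MODPO's exact formulation:

```python
import torch
import torch.nn.functional as F

def modpo_style_loss(logp_w, logp_l, ref_logp_w, ref_logp_l,
                     aux_r_w, aux_r_l, w_main=0.5, beta=0.1):
    """DPO-style loss with an auxiliary-reward margin folded in.

    logp_* / ref_logp_*: policy and reference log-probs of the chosen (w)
    and rejected (l) responses; aux_r_*: scores from an auxiliary reward
    model for another alignment objective. A sketch, not the paper's form.
    """
    dpo_margin = beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l))
    aux_margin = (1.0 - w_main) * (aux_r_w - aux_r_l)
    return -F.logsigmoid((dpo_margin - aux_margin) / w_main).mean()

# Toy usage with random tensors standing in for model outputs.
r = lambda: torch.randn(4)
print(modpo_style_loss(r(), r(), r(), r(), r(), r()).item())
```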
@@ -14897,7 +14897,7 @@
Junbo Zhao, Zhejiang University
10614-10632
Prompt recovery in large language models (LLMs) is crucial for understanding how LLMs work and addressing concerns regarding privacy, copyright, etc. The trend towards inference-only APIs complicates this task by restricting access to essential outputs for recovery. To tackle this challenge, we extract prompt-related information from limited outputs and identify a strong (negative) correlation between output probability-based uncertainty and the success of prompt recovery. This finding led to the development of Deliberative PrOmpt RecoverY (DORY), our novel approach that leverages uncertainty to recover prompts accurately. DORY involves reconstructing drafts from outputs, refining these with hints, and filtering out noise based on uncertainty. Our evaluation shows that DORY outperforms existing baselines across diverse LLMs and prompt benchmarks, improving performance by approximately 10.82% and establishing a new state-of-the-art record in prompt recovery tasks. Significantly, DORY operates using a single LLM without any external resources or model, offering a cost-effective, user-friendly prompt recovery solution.
- 2024.findings-acl.631
+ 2024.findings-acl.631
gao-etal-2024-dory
10.18653/v1/2024.findings-acl.631
@@ -14913,7 +14913,7 @@
Tat-Seng Chua, National University of Singapore
10633-10649
Equipping a conversational search engine with strategies regarding when to ask clarification questions is becoming increasingly important across various domains. Owing to the context-understanding capability of LLMs and their access to domain-specific sources of knowledge, LLM-based clarification strategies can be transferred rapidly to various domains in a post-hoc manner. However, they still struggle to deliver promising performance on unseen domains, failing to achieve effective domain transferability. We take the first step to investigate this issue and find that existing methods tend to produce one-size-fits-all strategies across diverse domains, limiting their search effectiveness. In response, we introduce a novel method, called STYLE, to achieve effective domain transferability. Our experimental results indicate that STYLE bears strong domain transferability, resulting in an average search performance improvement of 10% on four unseen domains.
- 2024.findings-acl.632
+ 2024.findings-acl.632
chen-etal-2024-style
10.18653/v1/2024.findings-acl.632
@@ -14933,7 +14933,7 @@
Zhijiang Guo, University of Cambridge
10650-10671
Generative search engines have the potential to transform how people seek information online, but generated responses from existing large language models (LLMs)-backed generative search engines may not always be accurate. Worse, retrieval-augmented generation exacerbates safety concerns, since adversaries may successfully evade the entire system by subtly manipulating the most vulnerable part of a claim. To this end, we propose evaluating the robustness of generative search engines in the realistic and high-risk setting, where adversaries have only black-box system access and seek to deceive the model into returning incorrect responses. Through a comprehensive human evaluation of various generative search engines, such as Bing Chat, PerplexityAI, and YouChat across diverse queries, we demonstrate the effectiveness of adversarial factual questions in inducing incorrect responses. Moreover, retrieval-augmented generation exhibits a higher susceptibility to factual errors compared to LLMs without retrieval. These findings highlight the potential security risks of these systems and emphasize the need for rigorous evaluation before deployment. The dataset and code will be publicly available.
- 2024.findings-acl.633
+ 2024.findings-acl.633
hu-etal-2024-evaluating
10.18653/v1/2024.findings-acl.633
@@ -14945,7 +14945,7 @@
Inderjit Dhillon, University of Texas, Austin and Google
10672-10685
Large language models (LLMs) have demonstrated remarkable capabilities in solving complex open-domain tasks, guided by comprehensive instructions and demonstrations provided in the form of prompts. However, these prompts can be lengthy, often comprising hundreds of lines and thousands of tokens, and their design often requires considerable human effort. Recent research has explored automatic prompt engineering for short prompts, typically consisting of one or a few sentences. However, the automatic design of long prompts remains a challenging problem due to its immense search space. In this paper, we propose Automated Prompt Engineering Xpert (APEX), a novel algorithm that automatically improves long prompts. Leveraging a greedy algorithm with beam search for efficiency, APEX utilizes search history to significantly enhance the effectiveness of LLM-based mutation in its search process. Our results show that APEX achieves an average accuracy gain of 9.2% on eight tasks in Big Bench Hard and consistent improvements on GSM8K with various models, highlighting the significance of automating prompt designs to fully harness the capabilities of LLMs.
- 2024.findings-acl.634
+ 2024.findings-acl.634
hsieh-etal-2024-automatic
10.18653/v1/2024.findings-acl.634
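The search skeleton the abstract describes (greedy sentence-level mutation plus beam search, guided by history) looks roughly like the sketch below; `llm_mutate` and `score` are placeholders for the LLM call and the dev-set evaluation, which the abstract does not specify:

```python
import random

def llm_mutate(sentence: str, history: list) -> str:
    """Stand-in for an LLM-based mutation conditioned on search history."""
    return sentence + " (rephrased)"  # placeholder rewrite

def score(prompt_sentences: list) -> float:
    """Stand-in for task accuracy of the full prompt on a dev set."""
    return random.random()

def search_long_prompt(sentences, beam_width=4, steps=10):
    history = []  # (mutated sentence, score delta) pairs guiding the mutator
    beam = [(score(sentences), sentences)]
    for _ in range(steps):
        candidates = list(beam)
        for base, cand in beam:
            i = random.randrange(len(cand))  # greedily mutate one sentence
            mutated = cand[:i] + [llm_mutate(cand[i], history)] + cand[i + 1:]
            s = score(mutated)
            history.append((mutated[i], s - base))
            candidates.append((s, mutated))
        beam = sorted(candidates, key=lambda c: c[0], reverse=True)[:beam_width]
    return beam[0]

best_score, best_prompt = search_long_prompt(
    ["Think step by step.", "Answer with a single number."])
print(best_score, best_prompt)
```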
@@ -14960,7 +14960,7 @@
Ting Liu, Harbin Institute of Technology
10686-10697
Chain-of-Thought (CoT) serves as a critical emerging ability in LLMs, especially when it comes to logical reasoning. Attempts have been made to induce such ability in small models as well by distilling from data with CoT generated by Large Language Models (LLMs). However, existing methods often simply generate and incorporate more data from LLMs, failing to note the importance of efficiently utilizing existing CoT data. Here we propose AS-ES (Abstractive Segments - Extractive Segments) learning, a new training paradigm that exploits the inherent information in CoT for iterative generation. Experiments show that our method surpasses direct seq2seq training on CoT-extensive tasks like MWP and PET summarization, without data augmentation or altering the model itself. Furthermore, we explore the reason behind the inefficiency of small models in learning CoT and provide an explanation of why AS-ES learning works, giving insights into the underlying mechanism of CoT.
- 2024.findings-acl.635
+ 2024.findings-acl.635
xi-etal-2024-es
10.18653/v1/2024.findings-acl.635
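The abstract does not spell out how CoT segments are labeled abstractive (AS) versus extractive (ES); one plausible heuristic, used purely for illustration, marks a step as extractive when most of its tokens already appear in the problem statement:

```python
def split_as_es(problem: str, cot_steps: list, threshold: float = 0.6):
    """Label each CoT step "ES" (extractive) or "AS" (abstractive) by token
    overlap with the problem text; a heuristic sketch, not the paper's rule."""
    problem_tokens = set(problem.lower().split())
    labeled = []
    for step in cot_steps:
        tokens = step.lower().split()
        overlap = sum(t in problem_tokens for t in tokens) / max(len(tokens), 1)
        labeled.append(("ES" if overlap >= threshold else "AS", step))
    return labeled

steps = ["Tom has 3 apples and buys 2 more.", "So the total is 3 + 2 = 5."]
print(split_as_es("Tom has 3 apples and buys 2 more. How many now?", steps))
```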
@@ -14972,7 +14972,7 @@
Joo-Kyung Kim, Amazon AGI
10698-10709
Visual Question Answering (VQA) often involves diverse reasoning scenarios across Vision and Language (V&L). Most prior VQA studies, however, have merely focused on assessing the model’s overall accuracy without evaluating it on different reasoning cases. Furthermore, some recent works observe that conventional Chain-of-Thought (CoT) prompting fails to generate effective reasoning for VQA, especially for complex scenarios requiring multi-hop reasoning. In this paper, we propose II-MMR, a novel idea to identify and improve multi-modal multi-hop reasoning in VQA. Specifically, II-MMR takes a VQA question with an image and finds a reasoning path to reach its answer using two novel language promptings: (i) an answer prediction-guided CoT prompt, or (ii) a knowledge triplet-guided prompt. II-MMR then analyzes this path to identify different reasoning cases in current VQA benchmarks by estimating how many hops and what types (i.e., visual or beyond-visual) of reasoning are required to answer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR observes that most of their VQA questions are easy to answer, simply demanding “single-hop” reasoning, whereas only a few questions require “multi-hop” reasoning. Moreover, while recent V&L models struggle with such complex multi-hop reasoning questions even using the traditional CoT method, II-MMR shows its effectiveness across all reasoning cases in both zero-shot and fine-tuning settings.
- 2024.findings-acl.636
+ 2024.findings-acl.636
kil-etal-2024-ii
10.18653/v1/2024.findings-acl.636
@@ -14988,7 +14988,7 @@
Dinesh Manocha, University of Maryland, College Park
10710-10727
Given a source and its edited version performed based on human instructions in natural language, how do we extract the underlying edit operations, to automatically replicate similar edits on other images? This is the problem of reverse designing, and we present TAME-RD, a model to solve this problem. TAME-RD automatically learns from the complex interplay of image editing operations and the natural language instructions to learn fully specified edit operations. It predicts both the underlying image edit operations as discrete categories and their corresponding parameter values in the continuous space. We accomplish this by mapping together the contextual information from the natural language text and the structural differences between the corresponding source and edited images using the concept of pre-post effect. We demonstrate the efficiency of our network through quantitative evaluations on multiple datasets. We observe improvements of 6-10% on various accuracy metrics and 1.01X-4X on the RMSE score and the concordance correlation coefficient for the corresponding parameter values on the benchmark GIER dataset. We also introduce I-MAD, a new two-part dataset: I-MAD-Dense, a collection of approximately 100K source and edited images, together with automatically generated text instructions and annotated edit operations, and I-MAD-Pro, consisting of about 1.6K source and edited images, together with text instructions and annotated edit operations provided by professional editors. On our dataset, we observe absolute improvements of 1-10% on the accuracy metrics and 1.14X–5X on the RMSE score.
- 2024.findings-acl.637
+ 2024.findings-acl.637
guhan-etal-2024-tame
10.18653/v1/2024.findings-acl.637
@@ -15003,7 +15003,7 @@
Rui Yan, Renmin University of China
10728-10739
In this paper, by treating in-context learning (ICL) as a meta-optimization process, we explain why LLMs are sensitive to the order of ICL examples. This understanding leads us to the development of Batch-ICL, an effective, efficient, and order-agnostic inference algorithm for ICL. Differing from the standard N-shot learning approach, Batch-ICL employs N separate 1-shot forward computations and aggregates the resulting meta-gradients. These aggregated meta-gradients are then applied to the forward computation of a zero-shot query to generate the final prediction. This batch processing approach renders the LLM agnostic to the order of ICL examples. Through extensive experiments and analysis, we demonstrate that Batch-ICL consistently outperforms most permutations of ICL examples. In some cases, it even exceeds the performance of the best order for standard ICL, all while reducing the computational resources required. Furthermore, we develop a novel variant of Batch-ICL featuring multiple “epochs” of meta-optimization. This variant implicitly explores permutations of ICL examples, further enhancing ICL performance.
- 2024.findings-acl.638
+ 2024.findings-acl.638
zhang-etal-2024-batch
10.18653/v1/2024.findings-acl.638
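A toy numpy illustration of the aggregation step: treat each 1-shot forward pass as inducing an activation offset (a "meta-gradient") relative to the zero-shot pass, then average the offsets, which makes the result invariant to example order. A real implementation works on attention activations inside the LLM; the arrays here are stand-ins:

```python
import numpy as np

rng = np.random.default_rng(0)
d = 8                                  # toy hidden size
h_zero_shot = rng.normal(size=d)       # zero-shot hidden state of the query

# Hypothetical hidden states from N independent 1-shot forward passes,
# one per ICL example, taken at the same layer and position.
h_one_shot = rng.normal(size=(4, d))

# "Meta-gradients": per-example offsets relative to the zero-shot state.
meta_grads = h_one_shot - h_zero_shot

# Order-agnostic aggregation: a mean does not depend on example order.
h_aggregated = h_zero_shot + meta_grads.mean(axis=0)
print(h_aggregated)
```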
@@ -15032,7 +15032,7 @@
Mitesh Khapra, Indian Institute of Technology, Madras, Dhirubhai Ambani Institute Of Information and Communication Technology
10740-10782
We present INDICVOICES, a dataset of natural and spontaneous speech containing a total of 7348 hours of read (9%), extempore (74%) and conversational (17%) audio from 16237 speakers covering 145 Indian districts and 22 languages. Of these 7348 hours, 1639 hours have already been transcribed, with a median of 73 hours per language. Through this paper, we share our journey of capturing the cultural, linguistic and demographic diversity of India to create a one-of-its-kind inclusive and representative dataset. More specifically, we share an open-source blueprint for data collection at scale comprising standardised protocols, centralised tools, a repository of engaging questions, prompts and conversation scenarios spanning multiple domains and topics of interest, quality control mechanisms, comprehensive transcription guidelines and transcription tools. We hope that this open-source blueprint will serve as a comprehensive starter kit for data collection efforts in other multilingual regions of the world. Using INDICVOICES, we build IndicASR, the first ASR model to support all 22 languages listed in the 8th schedule of the Constitution of India.
- 2024.findings-acl.639
+ 2024.findings-acl.639
javed-etal-2024-indicvoices
10.18653/v1/2024.findings-acl.639
@@ -15045,7 +15045,7 @@
Xin Wang, University of California, Santa Cruz
10783-10795
In our work, we explore the synergistic capabilities of pre-trained vision-and-language models (VLMs) and large language models (LLMs) on visual commonsense reasoning (VCR) problems. We find that VLMs and LLM-based decision pipelines are good at different kinds of VCR problems. Pre-trained VLMs exhibit strong performance on problems involving understanding the literal visual content, which we denote visual commonsense understanding (VCU). For problems where the goal is to infer conclusions beyond the image content, which we denote visual commonsense inference (VCI), VLMs face difficulties, while LLMs, given sufficient visual evidence, can use commonsense to infer the answer well. We empirically validate this by letting LLMs classify VCR problems into these two categories and show the significant difference between VLM and LLM-with-image-caption decision pipelines on the two subproblems. Moreover, we identify a challenge with VLMs’ passive perception, which may miss crucial context information, leading to incorrect reasoning by LLMs. Based on these findings, we suggest a collaborative approach, named ViCor, where pre-trained LLMs serve as problem classifiers to analyze the problem category, and then either use VLMs to answer the question directly or actively instruct VLMs to concentrate on and gather relevant visual elements to support potential commonsense inferences. We evaluate our framework on two VCR benchmark datasets and outperform all other methods without in-domain fine-tuning.
- 2024.findings-acl.640
+ 2024.findings-acl.640
zhou-etal-2024-vicor
10.18653/v1/2024.findings-acl.640
@@ -15064,7 +15064,7 @@
Zang Li, Tencent
10796-10816
In-context learning of large language models (LLMs) has achieved remarkable success in the field of natural language processing, while extensive case studies reveal that the single-step chain-of-thought prompting approach faces challenges such as attention diffusion and inadequate performance in complex tasks like text-to-SQL. To improve the contextual learning capabilities of LLMs in text-to-SQL, a workflow paradigm method is proposed, aiming to enhance the attention and problem-solving scope of LLMs through decomposition. Specifically, the information determination module for eliminating redundant information and the brand-new prompt structure based on problem classification greatly enhance the model’s attention. Additionally, the inclusion of self-correction and active learning modules greatly expands the problem-solving scope of LLMs, hence improving the upper limit of LLM-based approaches. Extensive experiments conducted on three datasets demonstrate that our approach outperforms other methods by a significant margin. We achieve improvements of about 2-3 percentage points over the existing baseline on the Spider Dev, Spider-Realistic, and Bird Dev datasets, and new SOTA results on the Spider Test dataset. Our code is available on GitHub: https://github.com/FlyingFeather/DEA-SQL.
- 2024.findings-acl.641
+ 2024.findings-acl.641
xie-etal-2024-decomposition
10.18653/v1/2024.findings-acl.641
@@ -15079,7 +15079,7 @@
Bo Xu, Dalian University of Technology
10817-10826
Short video fake news detection is crucial for combating the spread of misinformation. Current detection methods tend to aggregate features from individual modalities into multimodal features, overlooking the implicit opinions and the evolving nature of opinions across modalities. In this paper, we mine implicit opinions within short video news and promote the evolution of both explicit and implicit opinions across all modalities. Specifically, we design a prompt template to mine implicit opinions regarding the credibility of news from the textual component of videos. Additionally, we employ a diffusion model that encourages the interplay among diverse modal opinions, including those extracted through our implicit opinion prompts. Experimental results on a publicly available dataset for short video fake news detection demonstrate the superiority of our model over state-of-the-art methods.
- 2024.findings-acl.642
+ 2024.findings-acl.642
zong-etal-2024-unveiling
10.18653/v1/2024.findings-acl.642
@@ -15094,7 +15094,7 @@
Ashutosh Modi, IIT Kanpur
10827-10844
Indian Sign Language has limited resources for developing machine learning and data-driven approaches for automated language processing. Though text/audio-based language processing techniques have attracted colossal research interest and seen tremendous improvements in the last few years, Sign Languages still need to catch up due to the scarcity of resources. To bridge this gap, in this work we propose iSign: a benchmark for Indian Sign Language (ISL) Processing. We make three primary contributions in this work. First, we release one of the largest ISL-English datasets with more than video-sentence/phrase pairs. To the best of our knowledge, it is the largest sign language dataset available for ISL. Second, we propose multiple NLP-specific tasks (including SignVideo2Text, SignPose2Text, Text2Pose, Word Prediction, and Sign Semantics) and benchmark them with baseline models for easier access by the research community. Third, we provide detailed insights into the proposed benchmarks along with a few linguistic insights into the workings of ISL. We streamline the evaluation of Sign Language processing, addressing the gaps in the NLP research community for Sign Languages. We release the dataset, tasks and models via the following website: https://exploration-lab.github.io/iSign/
- 2024.findings-acl.643
+ 2024.findings-acl.643
joshi-etal-2024-isign
10.18653/v1/2024.findings-acl.643
@@ -15109,7 +15109,7 @@
Junbo Zhao, Zhejiang University
10845-10861
The rapid advancements of Large Language Models (LLMs) are tightly associated with the expansion of training data size. However, unchecked ultra-large-scale training sets introduce a series of potential risks such as data contamination, i.e. the benchmark data is used for training. In this work, we propose a holistic method named Polarized Augment Calibration (PAC), along with a new to-be-released dataset, to detect contaminated data and diminish the contamination effect. PAC extends the popular MIA (Membership Inference Attack) from the machine learning community by forming a more global target of detecting training data, to clarify invisible training data. As a pioneering work, PAC is very much plug-and-play and can be integrated with most (if not all) current white- and black-box LLMs. In extensive experiments, PAC outperforms existing methods by at least 4.5% on data contamination detection across 4 dataset formats and more than 10 base LLMs. Besides, our application in real-world scenarios highlights the prominent presence of contamination and related issues.
- 2024.findings-acl.644
+ 2024.findings-acl.644
ye-etal-2024-data
10.18653/v1/2024.findings-acl.644
@@ -15121,7 +15121,7 @@
Yang Feng, Institute of Computing Technology, Chinese Academy of Sciences
10862-10884
Although Large Language Models (LLMs) have demonstrated impressive text generation capabilities, they are easily misled by untruthful contexts provided by users or knowledge augmentation tools, leading to hallucinations. To prevent LLMs from being misled by untruthful context while still taking advantage of knowledge augmentation, we propose Truth-Aware Context Selection (TACS), a lightweight method to adaptively recognize and mask untruthful context from the inputs. TACS begins by performing truth detection on the input context, leveraging the parameterized knowledge within the LLM. Subsequently, it constructs a corresponding attention mask based on the truthfulness of each position, selecting the truthful context and discarding the untruthful context. Additionally, we introduce a new evaluation metric, Disturbance Adaption Rate, to further study the LLMs’ ability to accept truthful information and resist untruthful information. Experimental results indicate that TACS can effectively filter untruthful context and significantly improve the overall quality of LLMs’ responses when presented with misleading information.
- 2024.findings-acl.645
+ 2024.findings-acl.645
yu-etal-2024-truth
10.18653/v1/2024.findings-acl.645
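A simplified sketch of the selection step: score each context sentence with a truth detector (stubbed below) and keep only the passing positions. In TACS itself this becomes an attention mask over token positions, built from the LLM's own representations, rather than plain text filtering:

```python
def truth_score(sentence: str) -> float:
    """Stand-in for TACS's truth detector; a toy rule for illustration."""
    return 0.0 if "flat" in sentence else 0.9

def select_truthful_context(sentences, threshold=0.5):
    # Positions failing the check would be masked out of attention in TACS;
    # dropping them keeps this sketch self-contained.
    return [s for s in sentences if truth_score(s) >= threshold]

context = ["The Earth is flat.",
           "Water boils at 100 degrees Celsius at sea level."]
print(select_truthful_context(context))
```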
@@ -15134,7 +15134,7 @@
Deyi Xiong, Tianjin University
10885-10897
Large language models (LLMs) exhibit outstanding performance in machine translation via in-context learning. In contrast to sentence-level translation, document-level translation (DOCMT) by LLMs based on in-context learning faces two major challenges: firstly, document translations generated by LLMs are often incoherent; secondly, the length of demonstration for in-context learning is usually limited. To address these issues, we propose a Context-Aware Prompting method (CAP), which enables LLMs to generate more accurate, cohesive, and coherent translations via in-context learning. CAP takes into account multi-level attention, selects the most relevant sentences to the current one as context, and then generates a summary from these collected sentences. Subsequently, sentences most similar to the summary are retrieved from the datastore as demonstrations, which effectively guide LLMs in generating cohesive and coherent translations. We conduct extensive experiments across various DOCMT tasks, and the results demonstrate the effectiveness of our approach, particularly in zero pronoun translation (ZPT) and literary translation tasks.
- 2024.findings-acl.646
+ 2024.findings-acl.646
cui-etal-2024-efficiently
10.18653/v1/2024.findings-acl.646
@@ -15148,7 +15148,7 @@
Wanxiang Che, Harbin Institute of Technology
10898-10910
Nowadays, data augmentation through synthetic data has been widely used in the field of Grammatical Error Correction (GEC) to alleviate the problem of data scarcity. However, these synthetic data are mainly used in the pre-training phase rather than the data-limited fine-tuning phase due to inconsistent error distributions and noisy labels. In this paper, we propose a synthetic data construction method based on contextual augmentation, which can ensure an efficient augmentation of the original data with a more consistent error distribution. Specifically, we combine rule-based substitution with model-based generation, using the generation model to generate a richer context for the extracted error patterns. Besides, we also propose a relabeling-based data cleaning method to mitigate the effects of noisy labels in synthetic data. Experiments on CoNLL14 and BEA19-Test show that our proposed augmentation method consistently and substantially outperforms strong baselines and achieves the state-of-the-art level with only a small amount of synthetic data.
- 2024.findings-acl.647
+ 2024.findings-acl.647
wang-etal-2024-improving-grammatical
10.18653/v1/2024.findings-acl.647
@@ -15160,7 +15160,7 @@
Junbo Zhao, Zhejiang University
10911-10921
In the current landscape of large language models (LLMs), the process of instruction tuning serves as an essential step. Considering the high computing power overhead, data-efficient instruction tuning was proposed to reduce the training data size in this process, aiming at selecting high-quality instructional data. Nevertheless, we argue that most current data-efficient instruction-tuning methods are highly dependent on the quality of the original instruction-tuning dataset. When it comes to datasets synthesized by LLMs, a common scenario in this field, dirty samples will even be selected with a higher probability than other samples. To address these challenges, we utilized external knowledge (relevant examples or paragraphs) to evaluate those samples synthesized by LLMs with an in-context-based relative predictive entropy. Based on the new metric, we propose a framework, dubbed RECOST, which integrates external-knowledge-base re-ranking and diversity-consistent sampling into a single pipeline. Through extensive experiments on several synthetic datasets (Alpaca and Alpaca-gpt4), we demonstrate the effectiveness of our method and achieve even better results with only 1% of the full dataset.
- 2024.findings-acl.648
+ 2024.findings-acl.648
zhang-etal-2024-recost
10.18653/v1/2024.findings-acl.648
@@ -15172,7 +15172,7 @@
Alexander Fraser, Technical University of Munich
10922-10943
Cross-lingual alignment, the meaningful similarity of representations across languages in multilingual language models, has been an active field of research in recent years. We survey the literature of techniques to improve cross-lingual alignment, providing a taxonomy of methods and summarising insights from throughout the field. We present different understandings of cross-lingual alignment and their limitations. We provide a qualitative summary of results from a number of surveyed papers. Finally, we discuss how these insights may be applied not only to encoder models, where this topic has been heavily studied, but also to encoder-decoder or even decoder-only models, and argue that an effective trade-off between language-neutral and language-specific information is key.
- 2024.findings-acl.649
+ 2024.findings-acl.649
hammerl-etal-2024-understanding
10.18653/v1/2024.findings-acl.649
@@ -15184,7 +15184,7 @@
Defu Lian, University of Science and Technology of China
10944-10959
Lifelong prompt tuning has significantly advanced parameter-efficient lifelong learning with its efficiency and minimal storage demands on various tasks. Our empirical studies, however, highlight certain transferability constraints in the current methodologies: a universal algorithm that guarantees consistent positive transfer across all tasks is currently unattainable, especially when dealing with dissimilar tasks that may engender negative transfer. Identifying the misalignment between algorithm selection and task specificity as the primary cause of negative transfer, we present the Similarity Heuristic Lifelong Prompt Tuning (SHLPT) framework. This innovative strategy partitions tasks into two distinct subsets by harnessing a learnable similarity metric, thereby facilitating fruitful transfer from tasks regardless of their similarity or dissimilarity. Additionally, SHLPT incorporates a parameter pool to combat catastrophic forgetting effectively. Our experiments show that SHLPT outperforms state-of-the-art techniques in lifelong learning benchmarks and demonstrates robustness against negative transfer in diverse task sequences.
- 2024.findings-acl.650
+ 2024.findings-acl.650
wu-etal-2024-mitigate
10.18653/v1/2024.findings-acl.650
@@ -15202,7 +15202,7 @@
Yang Liu
10960-10977
While large language models (LLMs) have demonstrated considerable capabilities across various natural language tasks, they often fall short of the performance achieved by domain-specific state-of-the-art models. One potential approach to enhance the domain-specific capabilities of LLMs involves fine-tuning them using corresponding datasets. However, this method can be both resource- and time-intensive, and it is not applicable to closed-source commercial LLMs. In this paper, we propose Preference Adaptation for Enhancing Domain-specific Abilities of LLMs (PANDA), a method designed to augment the domain-specific capabilities of LLMs by leveraging insights from the response preferences of expert models, without requiring fine-tuning. Our experimental results reveal that PANDA significantly enhances the domain-specific ability of LLMs on text classification and interactive decision tasks. Moreover, an LLM with PANDA even outperforms the expert model it learned from on 4 tasks of ScienceWorld. This finding highlights the potential of exploring tuning-free approaches to achieve weak-to-strong generalization.
- 2024.findings-acl.651
+ 2024.findings-acl.651
liu-etal-2024-panda
10.18653/v1/2024.findings-acl.651
@@ -15217,7 +15217,7 @@
Tomasz Kajdanowicz, Wroclaw University of Science and Technology
10978-10996
Advancements in AI and natural language processing have revolutionized machine-human language interactions, with question answering (QA) systems playing a pivotal role. The knowledge base question answering (KBQA) task, utilizing structured knowledge graphs (KG), allows for handling extensive knowledge-intensive questions. However, a significant gap exists in KBQA datasets, especially for low-resource languages. Many existing construction pipelines for these datasets are outdated and inefficient in their use of human labor, and modern assistive tools like Large Language Models (LLMs) are not utilized to reduce the workload. To address this, we have designed and implemented a modern, semi-automated approach for creating datasets, encompassing tasks such as KBQA, Machine Reading Comprehension (MRC), and Information Retrieval (IR), tailored explicitly for low-resource environments. We executed this pipeline and introduced the PUGG dataset, the first Polish KBQA dataset, and novel datasets for MRC and IR. Additionally, we provide a comprehensive implementation, insightful findings, detailed statistics, and an evaluation of baseline models.
- 2024.findings-acl.652
+ 2024.findings-acl.652
sawczyn-etal-2024-developing
10.18653/v1/2024.findings-acl.652
@@ -15232,7 +15232,7 @@
Xiao Huang, The Hong Kong Polytechnic University
10997-11008
Generating accurate SQL queries for user questions (text-to-SQL) has been a long-standing challenge, since it requires a deep understanding of both the user’s question and the corresponding database schema in order to retrieve the desired content accurately. Existing methods rely on the comprehensive capability of large language models (LLMs) to generate the SQL. However, some necessary knowledge is not explicitly included in the database schema and user question, nor has it been learned by LLMs. Thus, the generated SQL for knowledge-insufficient questions may be inaccurate, negatively influencing the performance and robustness of text-to-SQL models. To address this challenge, we propose the Knowledge-to-SQL framework, which employs a tailored Data Expert LLM (DELLM) to provide helpful knowledge for all text-to-SQL models. Specifically, we introduce the detailed implementation of DELLM regarding table reading and the basic fine-tuning process. We further propose a Preference Learning via Database Feedback (PLDBF) strategy, refining the DELLM to generate more helpful knowledge for LLMs. Extensive experiments verify that DELLM can enhance the state-of-the-art approaches for text-to-SQL tasks. The corresponding code of DELLM is released for further research.
- 2024.findings-acl.653
+ 2024.findings-acl.653
hong-etal-2024-knowledge
10.18653/v1/2024.findings-acl.653
@@ -15247,7 +15247,7 @@
Masao Utiyama, National Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology
11009-11018
Minimum Bayes risk (MBR) decoding achieved state-of-the-art translation performance by using COMET, a neural metric that has a high correlation with human evaluation. However, MBR decoding requires quadratic time since it computes the expected score between a translation hypothesis and all reference translations. We propose centroid-based MBR (CBMBR) decoding to improve the speed of MBR decoding. Our method clusters the reference translations in the feature space, and then calculates the score using the centroids of each cluster. The experimental results show that our CBMBR not only improved the decoding speed of the expected score calculation 5.7 times, but also outperformed vanilla MBR decoding in translation quality by up to 0.5 COMET in the WMT’22 En↔Ja, En↔De, En↔Zh, and WMT’23 En↔Ja translation tasks.
- 2024.findings-acl.654
+ 2024.findings-acl.654
deguchi-etal-2024-centroid
10.18653/v1/2024.findings-acl.654
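The clustering step is ordinary k-means over reference embeddings; a sketch with cosine similarity standing in for the COMET utility:

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
hyps = rng.normal(size=(16, 32))   # embeddings of 16 hypotheses (toy)
refs = rng.normal(size=(64, 32))   # embeddings of 64 pseudo-references (toy)

# Cluster the references and keep only the k centroids.
k = 8
centroids = KMeans(n_clusters=k, n_init=10, random_state=0).fit(refs).cluster_centers_

def utility(a, b):
    # Cosine similarity as a stand-in for a COMET-style utility.
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# Expected utility against k centroids instead of all references:
# O(len(hyps) * k) utility calls rather than O(len(hyps) * len(refs)).
scores = [np.mean([utility(h, c) for c in centroids]) for h in hyps]
print("selected hypothesis:", int(np.argmax(scores)))
```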
@@ -15265,7 +15265,7 @@
Yao-Chung Fan, National Chung Hsing University
11019-11029
In this paper, we tackle the task of distractor generation (DG) for multiple-choice questions. Our study introduces two key designs. First, we propose the concept of retrieval augmented pretraining, which involves refining the language model pretraining to align it more closely with the downstream task of DG. Second, we explore the integration of knowledge graphs and language models to further enhance the performance of DG. Our study unveils promising directions for further development in DG by showcasing the efficacy of knowledge augmentation and task-specific pretraining. These findings demonstrate the potential for leveraging both strategies to enhance the quality and performance of DG systems.
- 2024.findings-acl.655
+ 2024.findings-acl.655
yu-etal-2024-enhancing
10.18653/v1/2024.findings-acl.655
@@ -15277,7 +15277,7 @@
Debasis Ganguly, University of Glasgow
11030-11047
In recent years, research shows that neural ranking models (NRMs) substantially outperform their lexical counterparts in text retrieval. In traditional search pipelines, a combination of features leads to well-defined behaviour. However, as neural approaches become increasingly prevalent as the final scoring component of engines or as standalone systems, their robustness to malicious text and, more generally, semantic perturbation needs to be better understood. We posit that the transformer attention mechanism can induce exploitable defects in search models through sensitivity to token position within a sequence, leading to an attack that could generalise beyond a single query or topic. We demonstrate such defects by showing that non-relevant text–such as promotional content–can be easily injected into a document without adversely affecting its position in search results. Unlike previous gradient-based attacks, we demonstrate the existence of these biases in a query-agnostic fashion. In doing so, without the knowledge of topicality, we can still reduce the negative effects of non-relevant content injection by controlling injection position. Our experiments are conducted with simulated on-topic promotional text automatically generated by prompting LLMs with topical context from target documents. We find that contextualisation of a non-relevant text further reduces negative effects whilst likely circumventing existing content filtering mechanisms. In contrast, lexical models are found to be more resilient to such content injection attacks. We then investigate a simple yet effective compensation for the weaknesses of the NRMs in search, validating our hypotheses regarding transformer bias.
- 2024.findings-acl.656
+ 2024.findings-acl.656
parry-etal-2024-exploiting
10.18653/v1/2024.findings-acl.656
@@ -15290,7 +15290,7 @@
Raja Giryes, Tel Aviv University
11048-11064
Web-scale training on paired text-image data is becoming increasingly central to multimodal learning, but is challenged by the highly noisy nature of datasets in the wild. Standard data filtering approaches succeed in removing mismatched text-image pairs, but permit semantically related but highly abstract or subjective text. These approaches lack the fine-grained ability to isolate the most concrete samples that provide the strongest signal for learning in a noisy dataset. In this work, we propose a new metric, Image Caption Concreteness (ICC), that evaluates caption text without an image reference to measure its concreteness and relevancy for use in multimodal learning. Our unsupervised approach leverages strong foundation models for measuring visual-semantic information loss in multimodal representations. We demonstrate that this strongly correlates with human evaluation of concreteness in both single-word and caption-level texts. Moreover, we show that curation using ICC complements existing approaches: It succeeds in selecting the highest quality samples from multimodal web-scale datasets to allow for efficient training in resource-constrained settings.
- 2024.findings-acl.657
+ 2024.findings-acl.657
yanuka-etal-2024-icc
10.18653/v1/2024.findings-acl.657
@@ -15306,7 +15306,7 @@
Haobo Wang, Zhejiang University
11065-11082
Within the evolving landscape of deep learning, the dilemma of data quantity and quality has been a long-standing problem. The recent advent of Large Language Models (LLMs) offers a data-centric solution to alleviate the limitations of real-world data with synthetic data generation. However, current investigations into this field lack a unified framework and mostly stay on the surface. Therefore, this paper provides an organization of relevant studies based on a generic workflow of synthetic data generation. By doing so, we highlight the gaps within existing research and outline prospective avenues for future study. This work aims to shepherd the academic and industrial communities towards deeper, more methodical inquiries into the capabilities and applications of LLMs-driven synthetic data generation.
- 2024.findings-acl.658
+ 2024.findings-acl.658
long-etal-2024-llms
10.18653/v1/2024.findings-acl.658
@@ -15319,7 +15319,7 @@
Ryan Cotterell, Swiss Federal Institute of Technology
11083-11094
A language model may be viewed as a Σ-valued stochastic process for some alphabet Σ. However, in some pathological situations, such a stochastic process may “leak” probability mass onto the set of infinite strings and hence is not equivalent to the conventional view of a language model as a distribution over ordinary (finite) strings. Such ill-behaved language processes are referred to as *non-tight* in the literature. In this work, we study conditions of tightness through the lens of stochastic processes. In particular, by regarding the EOS symbol as marking a stopping time and using results from martingale theory, we give characterizations of tightness that generalize our previous work [(Du et al. 2023)](https://arxiv.org/abs/2212.10502).
- 2024.findings-acl.659
+ 2024.findings-acl.659
du-etal-2024-language
10.18653/v1/2024.findings-acl.659
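For readers unfamiliar with the term, tightness has a direct statement: a language model p over alphabet Σ is tight when no probability mass escapes to infinite strings, i.e.

```latex
\text{tight:}\quad \sum_{x \in \Sigma^*} p(x) = 1,
\qquad\qquad
\text{non-tight:}\quad \sum_{x \in \Sigma^*} p(x) < 1,
```

with the missing mass in the non-tight case residing on infinite sequences in Σ^∞.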
@@ -15330,7 +15330,7 @@
Jaewoong Cho, KRAFTON
11095-11111
Recent advancements in large language models (LLMs) have remarkably enhanced performances on a variety of tasks in multiple languages. However, tokenizers in LLMs trained primarily on English-centric corpora often overly fragment a text into character or Unicode-level tokens in non-Roman alphabetic languages, leading to inefficient text generation. We introduce a simple yet effective framework to accelerate text generation in such languages. Our approach involves employing a new language model head with a vocabulary set tailored to a specific target language for a pre-trained LLM. This is followed by fine-tuning the new head while incorporating a verification step to ensure the model’s performance is preserved. We show that this targeted fine-tuning, while freezing other model parameters, effectively reduces token fragmentation for the target language. Our extensive experiments demonstrate that the proposed framework increases the generation speed by a factor of 1.7 while maintaining the performance of pre-trained multilingual models on target monolingual tasks.
- 2024.findings-acl.660
+ 2024.findings-acl.660
hong-etal-2024-accelerating
10.18653/v1/2024.findings-acl.660
@@ -15342,7 +15342,7 @@
Koichi Takeda, Nagoya University
11112-11118
In a semantic frame resource such as FrameNet, the definition sentence of a frame is essential for humans to understand the meaning of the frame intuitively. Recently, several attempts have been made to induce semantic frames from large corpora, but the cost of creating the definition sentences for such frames is significant. In this paper, we address a new task of generating frame definitions from a set of frame-evoking words. Specifically, given a cluster of frame-evoking words and associated exemplars induced as the same semantic frame, we utilize a large language model to generate frame definitions. We demonstrate that incorporating frame element reasoning as chain-of-thought can enhance the inclusion of correct frame elements in the generated definitions.
- 2024.findings-acl.661
+ 2024.findings-acl.661
han-etal-2024-definition
10.18653/v1/2024.findings-acl.661
@@ -15356,7 +15356,7 @@
Tat-Seng Chua, National University of Singapore
11119-11129
Generative retrieval is a promising new paradigm in text retrieval that generates identifier strings of relevant passages as the retrieval target. This paradigm leverages powerful generative language models, distinct from traditional sparse or dense retrieval methods. In this work, we identify a viable direction to further enhance generative retrieval via distillation and propose a feasible framework, named DGR. DGR utilizes sophisticated ranking models, such as the cross-encoder, in a teacher role to supply a passage rank list, which captures the varying relevance degrees of passages instead of binary hard labels; subsequently, DGR employs a specially designed distilled RankNet loss to optimize the generative retrieval model, considering the passage rank order provided by the teacher model as labels. This framework only requires an additional distillation step to enhance current generative retrieval systems and does not add any burden to the inference stage. We conduct experiments on four public datasets, and the results indicate that DGR achieves state-of-the-art performance among the generative retrieval methods. Additionally, DGR demonstrates exceptional robustness and generalizability with various teacher models and distillation losses.
- 2024.findings-acl.662
+ 2024.findings-acl.662
li-etal-2024-distillation
10.18653/v1/2024.findings-acl.662
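DGR's distillation objective is described as a specially designed RankNet variant; below is a plain pairwise RankNet-style sketch of the core idea, with function and variable names of my own choosing:

```python
import torch
import torch.nn.functional as F

def ranknet_distill_loss(student_scores: torch.Tensor) -> torch.Tensor:
    """Pairwise RankNet-style loss over passages sorted by the teacher.

    student_scores: (n,) scores the generative retriever assigns to n
    passages, ordered from most to least relevant according to the teacher
    ranker; for every pair (i, j) with i above j, push score_i over score_j.
    """
    n = student_scores.shape[0]
    i, j = torch.triu_indices(n, n, offset=1)  # all pairs with i ranked above j
    return -F.logsigmoid(student_scores[i] - student_scores[j]).mean()

# Toy usage: scores for 4 teacher-ranked passages.
scores = torch.tensor([0.2, 1.3, 0.1, -0.4], requires_grad=True)
loss = ranknet_distill_loss(scores)
loss.backward()
print(loss.item())
```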
@@ -15364,12 +15364,12 @@
ToxVidLM: A Multimodal Framework for Toxicity Detection in Code-Mixed Videos
Krishanu Maity
- A.s. Poornash
+ Poornash Sangeetha
Sriparna Saha, Indian Institute of Technology Patna, India
Pushpak Bhattacharyya, Indian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology
11130-11142
In an era of rapidly evolving internet technology, the surge in multimodal content, including videos, has expanded the horizons of online communication. However, the detection of toxic content in this diverse landscape, particularly in low-resource code-mixed languages, remains a critical challenge. While substantial research has addressed toxic content detection in textual data, the realm of video content, especially in non-English languages, has been relatively underexplored. This paper addresses this research gap by introducing a benchmark dataset, the first of its kind, consisting of 931 videos with 4021 code-mixed Hindi-English utterances collected from YouTube. Each utterance within this dataset has been meticulously annotated for toxicity, severity, and sentiment labels. We have developed an advanced Multimodal Multitask framework built for Toxicity detection in Video Content by leveraging Language Models (LMs), crafted for the primary objective along with the additional tasks of conducting sentiment and severity analysis. ToxVidLM incorporates three key modules – the Encoder module, Cross-Modal Synchronization module, and Multitask module – crafting a generic multimodal LM customized for intricate video classification tasks. Our experiments reveal that incorporating multiple modalities from the videos substantially enhances the performance of toxic content detection by achieving an Accuracy and Weighted F1 score of 94.29% and 94.35%, respectively.
- 2024.findings-acl.663
+ 2024.findings-acl.663
maity-etal-2024-toxvidlm
10.18653/v1/2024.findings-acl.663
@@ -15387,7 +15387,7 @@
Yang Liu
11143-11156
Large Language Models (LLMs) have witnessed remarkable advancements in recent years, prompting the exploration of tool learning, which integrates LLMs with external tools to address diverse real-world challenges. Assessing the capability of LLMs to utilise tools necessitates large-scale and stable benchmarks. However, previous works relied on either hand-crafted online tools with limited scale, or large-scale real online APIs suffering from instability of API status. To address this problem, we introduce StableToolBench, a benchmark evolving from ToolBench, proposing a virtual API server and a stable evaluation system. The virtual API server contains a caching system and API simulators that complement each other to alleviate changes in API status. Meanwhile, the stable evaluation system designs solvable pass and win rates using GPT-4 as the automatic evaluator to eliminate randomness during evaluation. Experimental results demonstrate the stability of StableToolBench, and further discuss the effectiveness of the API simulators, the caching system, and the evaluator system.
- 2024.findings-acl.664
+ 2024.findings-acl.664
guo-etal-2024-stabletoolbench
10.18653/v1/2024.findings-acl.664
@@ -15404,7 +15404,7 @@
Bing Qin, Harbin Institute of Technology
11157-11176
Emotional Intelligence (EI), consisting of emotion perception, emotion cognition and emotion expression, plays a critical role in improving the user interaction experience for current large language model (LLM) based conversational general AI assistants. Previous works mainly focus on raising their emotion perception ability via naive fine-tuning on EI-related classification or regression tasks. However, this leads to incomplete enhancement of EI and catastrophic forgetting of general intelligence (GI). To this end, we first introduce EiBench, a large-scale collection of EI-related tasks in the text-to-text format with task instructions that covers all three aspects of EI, which lays a solid foundation for the comprehensive EI enhancement of LLMs. Then a novel Modular Emotional Intelligence enhancement method (**MoEI**), consisting of Modular Parameter Expansion and intra-inter modulation, is proposed to comprehensively enhance the EI of LLMs without compromising their GI. Extensive experiments on two representative LLM-based assistants, Flan-T5 and LLaMA-2-Chat, demonstrate the effectiveness of MoEI in improving EI while maintaining GI.
- 2024.findings-acl.665
+ 2024.findings-acl.665
zhao-etal-2024-matter
10.18653/v1/2024.findings-acl.665
@@ -15420,7 +15420,7 @@
EdwardChoiKorea Advanced Institute of Science and Technology
11177-11213
To reliably deploy Large Language Models (LLMs) in a specific country, they must possess an understanding of the nation’s culture and basic knowledge. To this end, we introduce National Alignment, which measures the alignment between an LLM and a targeted country from two aspects: social value alignment and common knowledge alignment. We constructed KorNAT, the first benchmark that measures national alignment between LLMs and South Korea. KorNAT contains 4K and 6K multiple-choice questions for social value and common knowledge, respectively. To attain an appropriately aligned ground truth for the social value dataset, we conducted a large-scale public survey with 6,174 South Koreans. For common knowledge, we created the data based on South Korean textbooks and GED exams. Our dataset creation process is meticulously designed based on statistical sampling theory, and we also introduce metrics to measure national alignment, including three variations of social value alignment. We tested seven LLMs and found that only a few models passed our reference score, indicating that there is room for improvement. Our dataset has received government approval following an assessment by a government-affiliated organization dedicated to evaluating dataset quality.
- 2024.findings-acl.666
+ 2024.findings-acl.666
lee-etal-2024-kornat
10.18653/v1/2024.findings-acl.666
@@ -15434,7 +15434,7 @@
SamratMondal
11214-11226
The mining of adverse drug events (ADEs) is pivotal in pharmacovigilance, enhancing patient safety by identifying potential risks associated with medications, facilitating early detection of adverse events, and guiding regulatory decision-making. Traditional ADE detection methods are reliable but slow, not easily adaptable to large-scale operations, and offer limited information. With the exponential increase in data sources like social media content, biomedical literature, and Electronic Medical Records (EMR), extracting relevant ADE-related information from these unstructured texts is imperative. Previous ADE mining studies have focused on text-based methodologies, overlooking visual cues, limiting contextual comprehension, and hindering accurate interpretation. To address this gap, we present a MultiModal Adverse Drug Event (MMADE) detection dataset, merging ADE-related textual information with visual aids. Additionally, we introduce a framework that leverages the capabilities of LLMs and VLMs for ADE detection by generating detailed descriptions of medical images depicting ADEs, aiding healthcare professionals in visually identifying adverse events. Using our MMADE dataset, we showcase the significance of integrating visual cues from images to enhance overall performance. This approach holds promise for patient safety, ADE awareness, and healthcare accessibility, paving the way for further exploration in personalized healthcare.
- 2024.findings-acl.667
+ 2024.findings-acl.667
sahoo-etal-2024-enhancing
10.18653/v1/2024.findings-acl.667
@@ -15447,7 +15447,7 @@
SaranaNutanong
11227-11239
Determining sentence pair similarity is crucial for various NLP tasks. The task is typically evaluated on a continuous semantic textual similarity (STS) scale from 0 to 5. However, based on a linguistic observation in STS annotation guidelines, we found that a score in the range [4,5] indicates an upper-range sample, while the rest are lower-range samples. This necessitates a new approach that treats the upper-range and lower-range classes separately. In this paper, we introduce a novel embedding space decomposition method called MixSP, utilizing a Mixture of Specialized Projectors designed to distinguish and rank upper-range and lower-range samples accurately. The experimental results demonstrate that MixSP significantly decreases the representation overlap between upper-range and lower-range classes while outperforming competitors on STS and zero-shot benchmarks.
- 2024.findings-acl.668
+ 2024.findings-acl.668
ponwitayarat-etal-2024-space
10.18653/v1/2024.findings-acl.668
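The MixSP entry above describes routing sentence pairs to range-specialized projectors. Below is a minimal, untrained sketch of that routing idea in Python; the router, projector shapes, and 0.5 cutoff are all invented for illustration and are not the paper's architecture.

```python
# Illustrative sketch of a "Mixture of Specialized Projectors" for STS:
# a router assigns a sentence pair to an upper-range or lower-range
# projector, and similarity is computed inside the chosen subspace.
# All weights are random stand-ins; nothing here is trained.
import numpy as np

rng = np.random.default_rng(0)
DIM, PROJ_DIM = 8, 4

W_upper = rng.normal(size=(PROJ_DIM, DIM))  # projector specialized for scores in [4, 5]
W_lower = rng.normal(size=(PROJ_DIM, DIM))  # projector for the remaining (lower) range
w_router = rng.normal(size=2 * DIM)         # linear router over the concatenated pair

def cosine(u: np.ndarray, v: np.ndarray) -> float:
    return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))

def score_pair(x: np.ndarray, y: np.ndarray) -> float:
    # The router decides which specialized projector handles this pair.
    p_upper = 1.0 / (1.0 + np.exp(-(w_router @ np.concatenate([x, y]))))
    W = W_upper if p_upper > 0.5 else W_lower
    return cosine(W @ x, W @ y)

x, y = rng.normal(size=DIM), rng.normal(size=DIM)
print(f"similarity in selected subspace: {score_pair(x, y):+.3f}")
```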
@@ -15458,7 +15458,7 @@
SaraTonelli
11240-11247
Research on abusive language detection and content moderation is crucial to combat online harm. However, current limitations set by regulatory bodies and social media platforms can make it difficult to share collected data. We address this challenge by exploring the possibility of replacing existing English datasets for abusive language detection with synthetic data obtained by rewriting original texts with an instruction-based generative model. We show that such data can be effectively used to train a classifier whose performance is in line with, and sometimes better than, that of a classifier trained on the original data. Training with synthetic data also appears to improve robustness in a cross-dataset setting. A manual inspection of the generated data confirms that rewriting makes it impossible to retrieve the original texts online.
- 2024.findings-acl.669
+ 2024.findings-acl.669
casula-etal-2024-dont
10.18653/v1/2024.findings-acl.669
@@ -15470,7 +15470,7 @@
YutakaMatsuoThe University of Tokyo
11248-11259
This paper investigates how machine translation for low-resource languages can be improved by incorporating information from bilingual lexicons during training, mainly for translation between Mandarin and Formosan languages, which are all moribund or critically endangered; we also show that our techniques work for translation between Spanish and Nahuatl, a language pair from completely different language families. About 70% of the approximately 7,000 languages of the world have data in the form of lexicons, a valuable resource for improving low-resource language translation. We collect a dataset of parallel data and bilingual lexicons between Mandarin and 16 different Formosan languages and examine three main approaches: (1) simply using lexical data as additional parallel data, (2) generating pseudo-parallel sentence data for training by replacing words in the original parallel sentences using the lexicon, and (3) a combination of (1) and (2). All three approaches yield gains in both BLEU and chrF scores, and we found that (3) provided the most gains, followed by (1) and then (2), a pattern we observed both for translation between Mandarin and the Formosan languages and for Spanish-Nahuatl. With technique (3), we saw an average increase of 5.55 BLEU points and 10.33 chrF points.
- 2024.findings-acl.670
+ 2024.findings-acl.670
zheng-etal-2024-improving-low
10.18653/v1/2024.findings-acl.670
@@ -15487,7 +15487,7 @@
TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne
11260-11285
As the capabilities of large language models (LLMs) continue to advance, evaluating their performance is becoming more important and more challenging. This paper aims to address this issue for Mandarin Chinese in the form of CMMLU, a comprehensive Chinese benchmark that covers various subjects, including natural sciences, social sciences, engineering, and the humanities. We conduct a thorough evaluation of more than 20 contemporary multilingual and Chinese LLMs, assessing their performance across different subjects and settings. The results reveal that most existing LLMs struggle to achieve an accuracy of even 60%, which is the pass mark for Chinese exams. This highlights that there is substantial room for improvement in the capabilities of LLMs. Additionally, we conduct extensive experiments to identify factors impacting the models’ performance and propose directions for enhancing LLMs. CMMLU fills the gap in evaluating the knowledge and reasoning capabilities of large language models for Chinese.
- 2024.findings-acl.671
+ 2024.findings-acl.671
li-etal-2024-cmmlu
10.18653/v1/2024.findings-acl.671
@@ -15496,12 +15496,12 @@
Prometheus-Vision: Vision-Language Model as a Judge for Fine-Grained Evaluation
SeongyunLee
SeungoneKimCarnegie Mellon University
- Sue HyunParkKorea Advanced Institute of Science & Technology
+ SueParkKorea Advanced Institute of Science & Technology
GeewookKimNAVER Cloud and KAIST
MinjoonSeoTwelve Labs and Korea Advanced Institute of Science and Technology
11286-11315
Assessing long-form responses generated by Vision-Language Models (VLMs) is challenging. It not only requires checking whether the VLM follows the given instruction but also verifying whether the text output is properly grounded on the given image. Inspired by the recent approach of evaluating LMs with LMs, in this work, we propose to evaluate VLMs with VLMs. For this purpose, we present a new feedback dataset called the Perception Collection, encompassing 15K customized score rubrics that users might care about during assessment. Using the Perception Collection, we train Prometheus-Vision, the first open-source VLM evaluator model that can understand the user-defined score criteria during evaluation. Prometheus-Vision shows the highest Pearson correlation with human evaluators and GPT-4V among open-source models, showing its effectiveness for transparent and accessible evaluation of VLMs. We open-source our code, dataset, and model.
- 2024.findings-acl.672
+ 2024.findings-acl.672
lee-etal-2024-prometheus
10.18653/v1/2024.findings-acl.672
@@ -15515,7 +15515,7 @@
FuliFengUniversity of Science and Technology of China
11316-11360
The rapid advancement of Large Language Models (LLMs) in the realm of mathematical reasoning necessitates comprehensive evaluations to gauge progress and inspire future directions. Existing assessments predominantly focus on problem-solving from the examinee perspective, overlooking the dual perspective of the examiner regarding error identification and correction. From the examiner perspective, we define four evaluation tasks for error identification and correction along with a new dataset with annotated error types and steps. We also design diverse prompts to thoroughly evaluate eleven representative LLMs. Our principal findings indicate that GPT-4 outperforms all models, while the open-source model LLaMA-2-7B demonstrates abilities comparable to the closed-source models GPT-3.5 and Gemini Pro. Notably, calculation error proves the most challenging error type. Moreover, prompting LLMs with the error types can improve the average correction accuracy by 47.9%. These results reveal potential directions for developing the mathematical reasoning abilities of LLMs. Our code and dataset are available at https://github.com/LittleCirc1e/EIC.
- 2024.findings-acl.673
+ 2024.findings-acl.673
li-etal-2024-evaluating-mathematical
10.18653/v1/2024.findings-acl.673
@@ -15526,7 +15526,7 @@
Fabio MassimoZanzottoUniversity of Rome Tor Vergata
11361-11374
Neural network pruning has become increasingly crucial due to the complexity of these models and their widespread use in various fields. Existing pruning algorithms often suffer from limitations such as architecture specificity, excessive complexity and reliance on demanding calculations, rendering them impractical for real-world applications. This paper introduces KEN: a straightforward, universal and unstructured pruning algorithm based on Kernel Density Estimation (KDE). KEN aims to construct optimized transformers by selectively preserving the most significant parameters while restoring others to their pre-training state. This strategy preserves model performance while enabling storage of only the optimized subnetwork, leading to substantial memory savings. Extensive evaluations across seven different LLMs demonstrate that KEN achieves performance equal to or better than the original unpruned versions, with a minimum parameter reduction of 25%. Furthermore, in-depth comparisons with established pruning and PEFT algorithms confirm KEN's effectiveness. We further introduce KENviz, an explainable tool that visualizes the optimized model composition achieved by KEN from different points of view.
- 2024.findings-acl.674
+ 2024.findings-acl.674
mastromattei-zanzotto-2024-less
10.18653/v1/2024.findings-acl.674
@@ -15538,7 +15538,7 @@
XueqiCheng, Chinese Academy of Sciences
11375-11388
Large Language Models (LLMs) have been found to have difficulty knowing they do not possess certain knowledge and tend to provide specious answers in such cases. Retrieval Augmentation (RA) has been extensively studied to mitigate LLMs’ hallucinations. However, due to the extra overhead and the unassured quality of retrieval, it may not be optimal to conduct RA all the time. A straightforward idea is to conduct retrieval only when LLMs are uncertain about a question. This motivates us to enhance the LLMs’ ability to perceive their knowledge boundaries to help RA. In this paper, we first quantitatively measure this ability of LLMs and confirm their overconfidence. Then, we study how LLMs’ certainty about a question correlates with their dependence on external retrieved information. We propose several methods to enhance LLMs’ perception of knowledge boundaries and show that they are effective in reducing overconfidence. Additionally, equipped with these methods, LLMs can achieve comparable or even better performance with RA using far fewer retrieval calls.
- 2024.findings-acl.675
+ 2024.findings-acl.675
ni-etal-2024-llms
10.18653/v1/2024.findings-acl.675
@@ -15555,7 +15555,7 @@
JingBoZhuNortheastern University
11389-11403
Alignment training is crucial for enabling large language models (LLMs) to cater to human intentions and preferences. It is typically performed in two stages with different objectives: instruction-following alignment and human-preference alignment. However, aligning LLMs with these objectives in sequence suffers from an inherent problem: the objectives may conflict, and the LLMs cannot be guaranteed to align well with both the instructions and human preferences simultaneously. In response, we propose in this work a Hybrid Alignment Training (Hbat) approach, based on alternating alignment and modified elastic weight consolidation methods. The basic idea is to alternate between different objectives during alignment training, so that better collaboration can be achieved between the two alignment tasks. We experiment with Hbat on summarization and dialogue tasks. Experimental results show that the proposed Hbat can significantly outperform all baselines. Notably, Hbat yields consistent performance gains over traditional two-stage alignment training when using both proximal policy optimization and direct preference optimization.
- 2024.findings-acl.676
+ 2024.findings-acl.676
wang-etal-2024-hybrid
10.18653/v1/2024.findings-acl.676
@@ -15571,7 +15571,7 @@
RuiYanRenmin University of China
11404-11415
Speculative decoding has emerged as a promising technique to accelerate the inference of Large Language Models (LLMs) by employing a small language model to draft a hypothesis sequence, which is then validated by the LLM. The effectiveness of this approach heavily relies on the balance between performance and efficiency of the draft model. In our research, we focus on enhancing the proportion of draft tokens that are accepted into the final output by generating multiple hypotheses instead of just one. This allows the LLM more options to choose from and lets it select the longest sequence that meets its standards. Our analysis reveals that hypotheses produced by the draft model share many common token sequences, suggesting a potential for optimizing computation. Leveraging this observation, we introduce an innovative approach utilizing a directed acyclic graph (DAG) to manage the drafted hypotheses. This structure enables us to efficiently predict and merge recurring token sequences, vastly reducing the computational demands of the draft model. We term this approach Graph-structured Speculative Decoding (GSD). We apply GSD across a range of LLMs, including a 70-billion-parameter LLaMA-2 model, and observe a remarkable speedup of 1.70× to 1.94×, significantly surpassing standard speculative decoding.
- 2024.findings-acl.677
+ 2024.findings-acl.677
gong-etal-2024-graph
10.18653/v1/2024.findings-acl.677
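The GSD entry above rests on the observation that draft hypotheses share token prefixes. The toy below counts how many draft-token evaluations prefix merging saves, using a plain trie in place of the paper's DAG; the hypotheses are fabricated and no actual draft model is run.

```python
# Illustrative sketch of the idea behind graph-structured drafting:
# multiple draft hypotheses often share token prefixes, so storing them
# in a prefix trie lets the draft model score each shared token once.
# This toy only counts the saving; it does not run a real draft model.
hypotheses = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "cat", "sat", "on", "a", "rug"],
    ["the", "cat", "slept", "all", "day"],
]

trie = {}
nodes = 0
for hyp in hypotheses:
    node = trie
    for tok in hyp:
        if tok not in node:
            node[tok] = {}
            nodes += 1          # a token scored only once in the trie
        node = node[tok]

flat = sum(len(h) for h in hypotheses)   # tokens scored if drafted independently
print(f"independent drafting: {flat} token evaluations")
print(f"trie-merged drafting: {nodes} token evaluations")
```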
@@ -15583,7 +15583,7 @@
LydiaChenDelft University of Technology
11416-11436
As large language models (LLMs) are increasingly used for text generation tasks, it is critical to audit their usage, govern their applications, and mitigate their potential harms. Existing watermark techniques have proven effective at embedding single human-imperceptible and machine-detectable patterns without significantly affecting generated text quality and semantics. However, the efficiency of watermark detection, i.e., the minimum number of tokens required to assert detection with significance and robustness against post-editing, is still debatable. In this paper, we propose Duwak, which fundamentally enhances the efficiency and quality of watermarking by embedding dual secret patterns in both the token probability distribution and the sampling scheme. To mitigate the expression degradation caused by biasing toward certain tokens, we design a contrastive search to watermark the sampling scheme, which minimizes token repetition and enhances diversity. We theoretically explain the interdependency of the two watermarks within Duwak. We evaluate Duwak extensively on Llama2 and Vicuna under various post-editing attacks, against four state-of-the-art watermarking techniques and combinations of them. Our results show that Duwak-marked text achieves the highest watermarked text quality at the lowest required token count for detection, up to 70% fewer tokens than existing approaches, especially under paraphrasing.
- 2024.findings-acl.678
+ 2024.findings-acl.678
zhu-etal-2024-duwak
10.18653/v1/2024.findings-acl.678
@@ -15599,7 +15599,7 @@
LizhuangMaDept. of Computer Sci. & Eng., Shanghai Jiao Tong University
11437-11452
The rapid advancement of Large Language Models (LLMs) has brought about remarkable generative capabilities but also raised concerns about their potential misuse. While strategies like supervised fine-tuning and reinforcement learning from human feedback have enhanced their safety, these methods primarily focus on natural languages, which may not generalize to other domains. This paper introduces CodeAttack, a framework that transforms natural language inputs into code inputs, presenting a novel environment for testing the safety generalization of LLMs. Our comprehensive studies on state-of-the-art LLMs including GPT-4, Claude-2, and Llama-2 series reveal a new and universal safety vulnerability of these models against code input: CodeAttack bypasses the safety guardrails of all models more than 80% of the time. We find that a larger distribution gap between CodeAttack and natural language leads to weaker safety generalization, such as encoding natural language input with data structures. Furthermore, we give our hypotheses about the success of CodeAttack: the misaligned bias acquired by LLMs during code training, prioritizing code completion over avoiding the potential safety risk. Finally, we analyze potential mitigation measures. These findings highlight new safety risks in the code domain and the need for more robust safety alignment algorithms to match the code capabilities of LLMs.
- 2024.findings-acl.679
+ 2024.findings-acl.679
ren-etal-2024-codeattack
10.18653/v1/2024.findings-acl.679
@@ -15614,7 +15614,7 @@
YujiuYangGraduate School at Shenzhen,Tsinghua University
11453-11464
While large language models (LLMs) have achieved impressive performance across diverse tasks, recent studies showcase that causal LLMs suffer from the “reversal curse”. A typical example is that a model which knows “A’s father is B” is unable to reason that “B’s child is A”. This limitation poses a challenge to the advancement of artificial general intelligence (AGI), as it suggests a gap in the models’ ability to comprehend and apply bidirectional reasoning. In this paper, we first conduct a substantial evaluation and identify that the root cause of the reversal curse lies in the different word order between the training and inference stages, namely, the poor ability of causal language models to predict antecedent words within the training data. Accordingly, permutation on the training data is considered a potential solution, since this can make the model predict antecedent words or tokens. However, previous permutation methods may disrupt complete phrases or entities, thereby posing challenges for the model to comprehend and learn from the training data. To address this issue, we propose Semantic-aware Permutation Training (SPT), which segments the training sentences into semantic units (i.e., entities or phrases) with an assistant language model and permutes these units before feeding them into the model. Extensive experiments demonstrate that SPT effectively mitigates the reversal curse, since the performance on reversed questions approximates that on forward ones, and significantly advances the performance of existing works.
- 2024.findings-acl.680
+ 2024.findings-acl.680
guo-etal-2024-mitigating
10.18653/v1/2024.findings-acl.680
@@ -15629,7 +15629,7 @@
ZhongqiangHuangAlibaba Group
11465-11480
Pre-trained speech models, such as wav2vec 2.0, have significantly advanced speech-related tasks, including speech recognition and translation. However, their applicability in streaming scenarios is limited because these models are trained on complete utterances, leading to a mismatch with incremental streaming inputs. This paper identifies three critical design aspects within the architecture of wav2vec 2.0 and proposes a novel model, wav2vec-S, which incorporates simple modifications to ensure consistent speech representations during both training and inference phases for streaming speech inputs. Furthermore, we demonstrate that wav2vec-S models can be efficiently adapted from pre-trained wav2vec 2.0 models through continued pre-training and effectively finetuned to meet various latency requirements in downstream applications. Experiments on speech recognition and translation tasks show that wav2vec-S outperforms strong baseline models and achieves a superior balance between quality and latency.
- 2024.findings-acl.681
+ 2024.findings-acl.681
fu-etal-2024-wav2vec
10.18653/v1/2024.findings-acl.681
@@ -15642,7 +15642,7 @@
Balaji VasanSrinivasanAdobe Research
11481-11495
With the advancement of generative artificial intelligence (AI), contextual question answering has become extremely relevant. Attributing model generations to the input source document is essential to ensure trustworthiness and reliability. We observe that when large language models (LLMs) are used for contextual question answering, the output answer often consists of text copied verbatim from the input prompt, linked together with “glue text” generated by the LLM. Motivated by this, we propose that LLMs have an inherent awareness of where the text was copied from, likely captured in the hidden states of the LLM. We introduce a novel method for attribution in contextual question answering, leveraging the hidden state representations of LLMs. Our approach bypasses the need for extensive model retraining and retrieval model overhead, offering granular attributions and preserving the quality of generated answers. Our experimental results demonstrate that our method performs on par with or better than GPT-4 at identifying verbatim copied segments in LLM generations and at attributing these segments to their source. Importantly, our method shows robust performance across various LLM architectures, highlighting its broad applicability. Additionally, we present Verifiability-granular, an attribution dataset which has token-level annotations for LLM generations in the contextual question answering setup.
- 2024.findings-acl.682
+ 2024.findings-acl.682
phukan-etal-2024-peering
10.18653/v1/2024.findings-acl.682
@@ -15656,7 +15656,7 @@
Seong JoonOhParameter Lab and Eberhard-Karls-Universität Tübingen
11496-11517
Large Language Model (LLM) services and models often come with legal rules on *who* can use them and *how* they must use them. Assessing the compliance of the released LLMs is crucial, as these rules protect the interests of the LLM contributor and prevent misuse. In this context, we describe the novel fingerprinting problem of Black-box Identity Verification (BBIV). The goal is to determine whether a third-party application uses a certain LLM through its chat function. We propose a method called Targeted Random Adversarial Prompt (TRAP) that identifies the specific LLM in use. We repurpose adversarial suffixes, originally proposed for jailbreaking, to get a pre-defined answer from the target LLM, while other models give random answers. TRAP detects the target LLMs with over 95% true positive rate at under 0.2% false positive rate even after a single interaction. TRAP remains effective even if the LLM has minor changes that do not significantly alter the original function.
- 2024.findings-acl.683
+ 2024.findings-acl.683
gubri-etal-2024-trap
10.18653/v1/2024.findings-acl.683
@@ -15669,7 +15669,7 @@
SumaBhatUniversity of Illinois, Urbana Champaign
11518-11531
Recent advancements in joint speech-text pre-training have significantly advanced the processing of natural language. However, a key limitation is their reliance on parallel speech-text data, posing challenges due to data accessibility. Addressing this, our paper introduces an innovative framework for jointly performing speech and text processing without parallel corpora during pre-training, requiring them only downstream. Utilizing pre-trained unimodal models, we extract distinct representations for speech and text, aligning them effectively in a newly defined space using a multi-level contrastive learning mechanism. A unique swap reconstruction mechanism enhances the alignment and is followed by fusion via a multi-head mechanism, seamlessly merging modality-invariant and modality-specific representations. Testing on emotion recognition (an SLU task) and idiom usage detection (an NLU task) demonstrates robust performance, with commendable robustness to noise in text or speech data.
- 2024.findings-acl.684
+ 2024.findings-acl.684
zhou-etal-2024-clasp
10.18653/v1/2024.findings-acl.684
@@ -15682,7 +15682,7 @@
WeimingLuZhejiang University
11532-11547
Theory of Mind (ToM), the cognitive ability to reason about the mental states of ourselves and others, is the foundation of social interaction. Although ToM comes naturally to humans, it poses a significant challenge to even the most advanced Large Language Models (LLMs). Due to the complex logical chains in ToM reasoning, especially in higher-order ToM questions, simply utilizing reasoning methods like Chain of Thought (CoT) will not improve the ToM capabilities of LLMs. We present TimeToM, which constructs a temporal space and uses it as the foundation to improve the ToM capabilities of LLMs in multiple scenarios. Specifically, within the temporal space, we construct a Temporal Belief State Chain (TBSC) for each character and, inspired by the cognition perspective of the social world model, we divide the TBSC into self-world beliefs and social world beliefs, aligning with first-order ToM (first-order beliefs) and higher-order ToM (higher-order beliefs) questions, respectively. Moreover, we design a novel tool-belief solver that, by considering belief communication between characters in the temporal space, can transform a character’s higher-order beliefs into another character’s first-order beliefs during the belief communication period.
- 2024.findings-acl.685
+ 2024.findings-acl.685
hou-etal-2024-timetom
10.18653/v1/2024.findings-acl.685
@@ -15695,7 +15695,7 @@
SaranaNutanong
11548-11563
NLU models have achieved promising results on standard benchmarks. Despite state-of-the-art accuracy, analysis reveals that many models make predictions using annotation bias rather than the properties we intend the model to learn. Consequently, these models perform poorly on out-of-distribution datasets. Recent advances in bias mitigation show that annotation bias can be alleviated through fine-tuning with debiasing objectives. In this paper, we apply causal mediation analysis to gauge how much each model component mediates annotation biases. Using the knowledge from the causal analysis, we improve the model’s robustness against annotation bias through two bias mitigation methods: causal-grounded masking and gradient unlearning. The causal analysis reveals that biases are concentrated in specific components, even after employing other training-time debiasing techniques. Manipulating these components by masking out neurons’ activations or by updating specific weight blocks demonstrably improves robustness against annotation artifacts.
- 2024.findings-acl.686
+ 2024.findings-acl.686
sae-lim-etal-2024-identifying
10.18653/v1/2024.findings-acl.686
@@ -15706,7 +15706,7 @@
MariyaTonevaMax Planck Institute for Software Systems
11564-11584
The rapid growth in natural language processing (NLP) research has led to numerous new models, outpacing our understanding of how they compare to established ones. One major reason for this difficulty is saturating benchmarks, which may not well reflect differences in model performance in the wild. In this work, we introduce a novel framework to compare two NLP models by revealing their shared invariance to interpretable input perturbations targeting a specific linguistic capability. Via experiments on models from the same and different architecture families, this framework offers insights into how changes in models (e.g., distillation, size increase) affect linguistic capabilities. Furthermore, our framework enables evaluation of invariances between commercial black-box models (e.g., the InstructGPT family) and models that are better understood (e.g., GPT-2). Across experiments, we observe that large language models share many invariances encoded by models of various sizes, whereas the invariances encoded by large models are only shared by other large models. Possessing a wide variety of invariances may be key to the recent successes of large language models, and our framework can shed light on the types of invariances retained or emerging in new models. We make the code publicly available.
- 2024.findings-acl.687
+ 2024.findings-acl.687
rawal-toneva-2024-perturbed
10.18653/v1/2024.findings-acl.687
@@ -15722,7 +15722,7 @@
KanLi
11585-11596
Stochastic sampling strategies such as top-k and top-p have been widely used in dialogue generation tasks. However, in an open-domain chatting system there are two different conversation scenarios, i.e., chit-chat and knowledge-based question answering. In the former, response diversity is essential due to the one-to-many nature of dialogue. The latter, on the other hand, requires less randomness, given that a stochastic decoding strategy entails the risk of generating incorrect information. As a result, an adaptive and flexible decoding strategy is needed to cope with these two scenarios simultaneously. To this end, we propose the dynamic decoding strategy (DDS), which can adjust the decoding space w.r.t. different contexts. In DDS, both sequence-level and token-level adaptive search can be achieved to adjust the decoding process in a unified framework. Besides, our adaptive algorithm can not only be used during model inference but can also be applied during the model training stage to further enhance performance. Comprehensive experiments indicate that the proposed decoding strategy consistently improves the performance of pre-trained dialogue models when coupled with four widely used stochastic decoding algorithms.
- 2024.findings-acl.688
+ 2024.findings-acl.688
li-etal-2024-dynamic
10.18653/v1/2024.findings-acl.688
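The DDS entry above calls for widening or narrowing the decoding space by context. A hedged sketch of one such token-level mechanism follows: the nucleus size p is tied to the entropy of the next-token distribution. The entropy heuristic and all constants are illustrative assumptions, not the paper's actual criterion.

```python
# Toy token-level adaptive nucleus (top-p) sampling: the nucleus size is
# derived from the entropy of the next-token distribution. The mapping
# from entropy to p is a made-up stand-in for the paper's mechanism.
import numpy as np

rng = np.random.default_rng(0)

def adaptive_p(probs: np.ndarray, p_min: float = 0.2, p_max: float = 0.95) -> float:
    ent = -np.sum(probs * np.log(probs + 1e-12))
    return p_min + (p_max - p_min) * ent / np.log(len(probs))  # normalized entropy

def top_p_sample(probs: np.ndarray, p: float) -> int:
    order = np.argsort(probs)[::-1]
    keep = order[np.cumsum(probs[order]) <= p]
    if keep.size == 0:
        keep = order[:1]                      # always keep at least the argmax
    return int(rng.choice(keep, p=probs[keep] / probs[keep].sum()))

sharp = np.array([0.90, 0.05, 0.03, 0.02])    # fact-like: low entropy
flat = np.array([0.30, 0.28, 0.22, 0.20])     # chit-chat-like: high entropy
for name, dist in [("sharp", sharp), ("flat", flat)]:
    p = adaptive_p(dist)
    print(f"{name}: p={p:.2f}, sampled token id {top_p_sample(dist, p)}")
```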
@@ -15739,7 +15739,7 @@
RuifengXuHarbin Institute of Technology
11597-11613
End-to-end argumentation mining (AM) aims to extract the argumentation structure, including argumentation components and their argumentation relations, from text. Recent developments in end-to-end AM models have demonstrated significant progress by redefining the AM task as a sequence generation task, exhibiting simplicity and competitive performance. Nevertheless, these models overlook the integration of supplementary discourse structure information, a crucial factor for comprehending argumentation structures, resulting in suboptimal outcomes. In this study, we propose the DENIM framework, which generates discourse-structure-aware prefixes for each layer of the generation model. These prefixes imbue the generation-based AM model with discourse structures, thereby augmenting the overall generation process. Moreover, we introduce a multi-task prompt coupled with a three-step decoding strategy, aiming to optimize the efficiency and effectiveness of argumentation structure decoding. Extensive experiments and analyses on two benchmark datasets show that DENIM achieves state-of-the-art performance on both AM benchmarks.
- 2024.findings-acl.689
+ 2024.findings-acl.689
sun-etal-2024-discourse
10.18653/v1/2024.findings-acl.689
@@ -15755,7 +15755,7 @@
KanLi
11614-11627
The guidance from capability evaluations has greatly propelled the progress of human society and the development of Artificial Intelligence. However, as LLMs evolve, it becomes challenging to construct evaluation benchmarks with accurate labels for SuperLLMs whose capabilities approach or even surpass those of humans. To credibly conduct poor-supervised evaluation without accurate labels, we first prove that the consistency between the model under evaluation and a reference model, when their prediction distributions are independent and the sample size is infinite, can equivalently assess the true capabilities of the model to be evaluated. However, using either humans or LLMs as the reference model cannot sufficiently meet these conditions, for which we propose the PEEM algorithm. By treating all models under evaluation as reference models, PEEM alternately optimizes model weights and filters reference models based on the EM algorithm to maximally alleviate the insufficiency of the conditions. Comprehensive experiments across 3 types of tasks with 16 mainstream LLMs validate the efficiency, universality, and effectiveness of PEEM. More generally, PEEM has advanced the evaluation paradigm from human-centric to human&model-centric, alleviating the limitations of human capabilities for evaluating SuperLLMs.
- 2024.findings-acl.690
+ 2024.findings-acl.690
yuan-etal-2024-poor
10.18653/v1/2024.findings-acl.690
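The PEEM entry above alternates between weighting models and re-deriving a reference from them. The toy below caricatures this with a generic agreement-EM loop over fabricated predictions: a weighted vote forms a provisional consensus, and each model is re-weighted by its agreement with it. The paper's actual weighting and reference-filtering steps will differ.

```python
# Toy EM-style mutual evaluation: models are scored by agreement with a
# weighted consensus, which is itself recomputed from those scores.
# Predictions are fabricated; this approximates, not reproduces, PEEM.
import numpy as np

rng = np.random.default_rng(0)
truth = rng.integers(0, 4, size=200)              # hidden "true" labels
preds = np.stack([
    np.where(rng.random(200) < acc, truth, rng.integers(0, 4, size=200))
    for acc in (0.9, 0.7, 0.5, 0.3)               # four models of varying skill
])

weights = np.ones(len(preds)) / len(preds)
for _ in range(10):
    consensus = np.array([                        # E-step: weighted vote per item
        np.bincount(col, weights=weights, minlength=4).argmax()
        for col in preds.T
    ])
    agreement = (preds == consensus).mean(axis=1) # M-step: agreement as weight
    weights = agreement / agreement.sum()

print("estimated weights:", np.round(weights, 3))
print("true accuracies:  ", np.round((preds == truth).mean(axis=1), 3))
```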
@@ -15769,7 +15769,7 @@
ZhaopengTuTencent AI Lab
11628-11638
Neural machine translation (NMT) systems often produce inadequate translations for named entities. In this study, we conducted preliminary experiments to examine the factors affecting the translation accuracy of named entities, specifically focusing on their translation difficulty and context diversity. Based on our observations, we propose a novel data augmentation strategy to enhance the accuracy of named entity translation. The main concept behind our approach is to increase both the context diversity and translation probability for the targeted named entity pair. To achieve this, we construct additional samples for named entities that exhibit high translation difficulty or low context diversity and use the augmented training data to re-train the final translation model. Furthermore, we propose an entity-aware machine translation metric that prefers the translation output to generate more accurate named entities. Our experimental results demonstrate significant improvements over the baseline in terms of general translation performance and named entity translation accuracy across various test sets, such as WMT news translation and terminology test sets.
- 2024.findings-acl.691
+ 2024.findings-acl.691
liang-etal-2024-addressing
10.18653/v1/2024.findings-acl.691
@@ -15784,7 +15784,7 @@
DaxinJiangMicrosoft
11639-11651
To improve the performance of the dual-encoder retriever, one effective approach is knowledge distillation from the cross-encoder ranker. Existing works prepare training instances by pairing each query with one positive and a batch of negatives. However, most hard negatives mined by advanced dense retrieval methods are still too trivial for the teacher to distinguish, preventing the teacher from transferring abundant dark knowledge to the student through its soft label. To alleviate this issue, we propose Adam, a knowledge distillation framework that can better transfer the dark knowledge held in the teacher with adaptive dark examples. Different from previous works that only rely on one positive and hard negatives as candidate passages, we create dark examples that all have moderate relevance to the query by strengthening negatives and masking positives in the discrete space. Furthermore, as the quality of knowledge held in different training instances varies as measured by the teacher’s confidence score, we propose a self-paced distillation strategy that adaptively concentrates on a subset of high-quality instances to conduct our dark-example-based knowledge distillation to help the student learn better. We conduct experiments on two widely-used benchmarks and verify the effectiveness of our method.
- 2024.findings-acl.692
+ 2024.findings-acl.692
tao-etal-2024-adam
10.18653/v1/2024.findings-acl.692
@@ -15797,7 +15797,7 @@
JieZhou
11652-11663
Large language models (LLMs) are capable of performing conditional sequence generation tasks, such as translation or summarization, through instruction fine-tuning. The fine-tuning data is generally sequentially concatenated from a specific task instruction, an input sentence, and the corresponding response. Considering the locality modeled by the self-attention mechanism of LLMs, these models face the risk of instruction forgetting when generating responses for long input sentences. To mitigate this issue, we propose enhancing the instruction-following capability of LLMs by shifting the position of task instructions after the input sentences. Theoretical analysis suggests that our straightforward method can alter the model’s learning focus, thereby emphasizing the training of instruction-following capabilities. Concurrently, experimental results demonstrate that our approach consistently outperforms traditional settings across various model scales (1B / 7B / 13B) and different sequence generation tasks (translation and summarization), without any additional data or annotation costs. Notably, our method significantly improves the zero-shot performance on conditional sequence generation, e.g., up to 9.7 BLEU points on WMT zero-shot translation tasks. Further analysis reveals that our method can significantly improve the traditional model’s instruction-following ability by 1x over the traditional approach.
- 2024.findings-acl.693
+ 2024.findings-acl.693
liu-etal-2024-instruction
10.18653/v1/2024.findings-acl.693
@@ -15811,7 +15811,7 @@
ZenglinXuFudan University
11664-11674
Sparse models, including sparse Mixture-of-Experts (MoE) models, have emerged as an effective approach for scaling Transformer models. However, they often suffer from computational inefficiency, since a significant number of parameters are unnecessarily involved in computations by multiplying values by zero or low activation values. To address this issue, we present XMoE, a novel MoE design intended to enhance both the efficacy and efficiency of sparse MoE models. XMoE leverages small experts and a threshold-based router to enable tokens to selectively engage only essential parameters. Our extensive experiments on language modeling and machine translation tasks demonstrate that XMoE enhances model performance and can decrease the computation load at MoE layers by over 50% without sacrificing performance. Furthermore, we present the versatility of XMoE by applying it to dense models, enabling sparse computation during inference. We provide a comprehensive analysis and make our code available at https://anonymous.4open.science/r/XMoE.
- 2024.findings-acl.694
+ 2024.findings-acl.694
yang-etal-2024-xmoe
10.18653/v1/2024.findings-acl.694
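The XMoE entry above hinges on a threshold-based router. A minimal sketch of that routing step follows: softmax gate scores, keep experts above a threshold, renormalize, and combine only the survivors. The gate form, threshold value, and expert shapes are invented for illustration.

```python
# Toy threshold-based MoE router: a token visits only the experts whose
# gate probability clears a threshold, so most experts stay idle.
import numpy as np

rng = np.random.default_rng(0)
DIM, N_EXPERTS, THRESHOLD = 8, 8, 0.15

experts = [rng.normal(size=(DIM, DIM)) for _ in range(N_EXPERTS)]  # small experts
W_gate = rng.normal(size=(N_EXPERTS, DIM))

def moe_forward(x: np.ndarray) -> np.ndarray:
    logits = W_gate @ x
    gates = np.exp(logits - logits.max())
    gates /= gates.sum()                      # softmax gate over experts
    active = np.flatnonzero(gates > THRESHOLD)
    if active.size == 0:                      # fall back to the best expert
        active = np.array([gates.argmax()])
    w = gates[active] / gates[active].sum()   # renormalize surviving gates
    return sum(wi * (experts[i] @ x) for wi, i in zip(w, active))

x = rng.normal(size=DIM)
print("output norm:", round(float(np.linalg.norm(moe_forward(x))), 3))
```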
@@ -15823,7 +15823,7 @@
JieZhou
11675-11687
Recently, DeepNorm scaled Transformers to extreme depths (i.e., 1000 layers) and revealed the promising potential of deep scaling. To stabilize the training of deep models, DeepNorm attempts to constrain the model update to a constant value. Although applying such a constraint can benefit the early stage of model training, it may lead to undertrained models during the whole training procedure. In this paper, we propose BranchNorm, which dynamically rescales the non-residual branch of the Transformer in accordance with the training period. BranchNorm not only theoretically stabilizes training with smooth gradient norms at the early stage, but also encourages better convergence in the subsequent training stage. Experimental results on multiple translation tasks demonstrate that BranchNorm achieves a better trade-off between training stability and convergence performance.
- 2024.findings-acl.695
+ 2024.findings-acl.695
liu-etal-2024-branchnorm
10.18653/v1/2024.findings-acl.695
@@ -15838,7 +15838,7 @@
HongpingZhi
11688-11699
Question answering over temporal knowledge graphs (TKGQA) is an emerging topic that has attracted increasing interest since it considers the dynamic knowledge in the world. Several datasets along with model developments have been proposed in the TKGQA research field. However, existing studies generally focus on fact-centered reasoning, with limited attention to temporal reasoning. To tackle the intricate and comprehensive nature of temporal reasoning, we propose a new TKGQA dataset, MusTQ, which contains 666K multi-step temporal reasoning questions as well as a TKG. The multi-step temporal reasoning is established based on six basic temporal reasoning types derived from a well-established measure theory. Using MusTQ, we evaluate previous TKGQA methods and find that they typically fall short in multi-step temporal reasoning. Furthermore, we propose a TKGQA model, MusTKGQA, which enhances multi-step reasoning ability with an entity-time attention mechanism and an optimized temporal knowledge graph representation. Extensive experiments on MusTQ show that our model achieves state-of-the-art multi-step temporal reasoning performance.
- 2024.findings-acl.696
+ 2024.findings-acl.696
zhang-etal-2024-mustq
10.18653/v1/2024.findings-acl.696
@@ -15852,7 +15852,7 @@
JackHesselSamaya AI
11700-11726
Effective interlocutors account for the uncertain goals, beliefs, and emotions of others. But even the best human conversationalist cannot perfectly anticipate the trajectory of a dialogue. How well can language models represent inherent uncertainty in conversations? We propose FortUne Dial, an expansion of the long-standing “conversation forecasting” task: instead of just accuracy, evaluation is conducted with uncertainty-aware metrics, effectively enabling abstention on individual instances. We study two ways in which language models potentially represent outcome uncertainty (internally, using scores and directly, using tokens) and propose fine-tuning strategies to improve calibration of both representations. Experiments on eight difficult negotiation corpora demonstrate that our proposed fine-tuning strategies (a traditional supervision strategy and an off-policy reinforcement learning strategy) can calibrate smaller open-source models to compete with pre-trained models 10x their size.
- 2024.findings-acl.697
+ 2024.findings-acl.697
sicilia-etal-2024-deal
10.18653/v1/2024.findings-acl.697
@@ -15869,7 +15869,7 @@
Ho-KinTangHarbin Institute of Technology
11727-11742
Fine-tuning pre-trained language models, particularly large language models, demands extensive computing resources and can result in varying performance outcomes across different domains and datasets. This paper examines the approach of integrating multiple models from diverse training scenarios into a unified model. This unified model excels across various data domains and exhibits the ability to generalize well on out-of-domain data. We propose a knowledge fusion method named Evolver, inspired by evolutionary algorithms, which does not need further training or additional training data. Specifically, our method involves aggregating the weights of different language models into a population and subsequently generating offspring models through mutation and crossover operations. These offspring models are then evaluated against their parents, allowing for the preservation of those models that show enhanced performance on development datasets. Importantly, our model evolving strategy can be seamlessly integrated with existing model merging frameworks, offering a versatile tool for model enhancement. Experimental results on mainstream language models (i.e., encoder-only, decoder-only, encoder-decoder) reveal that Evolver outperforms previous state-of-the-art models by large margins.
- 2024.findings-acl.698
+ 2024.findings-acl.698
du-etal-2024-knowledge
10.18653/v1/2024.findings-acl.698
@@ -15883,7 +15883,7 @@
NavidRekabsazThomson Reuters
11743-11776
Multi-task learning (MTL) has shown considerable practical benefits, particularly when using language models (LMs). While this is commonly achieved by learning tasks under a joint optimization procedure, some methods, such as AdapterFusion, divide the problem into two stages: (i) task learning, where knowledge specific to a task is encapsulated within sets of parameters (e.g., adapters), and (ii) transfer, where this already learned knowledge is leveraged for a target task. This separation of concerns provides numerous benefits (e.g., promoting reusability). However, current two-stage MTL introduces a substantial number of additional parameters. We address this issue by leveraging the usefulness of linearly scaling the output representations of source adapters for transfer learning. We introduce ScaLearn, a simple and highly parameter-efficient two-stage MTL method that capitalizes on the knowledge of the source tasks by learning a minimal set of scaling parameters that enable effective transfer to a target task. Our experiments on three benchmarks (GLUE, SuperGLUE, and HumSet) and two encoder LMs show that ScaLearn consistently outperforms strong baselines with a small number of transfer parameters (~0.35% of those of AdapterFusion). Remarkably, we observe that ScaLearn maintains its strong abilities even when further reducing parameters, achieving competitive results with only 8 transfer parameters per target task. Our proposed approach thus demonstrates the power of simple scaling as a promise for more efficient task transfer. Our code is available at https://github.com/CPJKU/ScaLearn.
- 2024.findings-acl.699
+ 2024.findings-acl.699
frohmann-etal-2024-scalearn
10.18653/v1/2024.findings-acl.699
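The ScaLearn entry above learns only scaling coefficients over frozen source adapters. The sketch below mimics this with frozen random matrices and a plain least-squares gradient loop over the scales; both the synthetic target task and the update rule are stand-in assumptions, not the paper's training setup.

```python
# Toy two-stage transfer by output scaling: only the per-source scales
# are trainable, mimicking the tiny transfer-parameter budget.
import numpy as np

rng = np.random.default_rng(0)
DIM, N_SOURCES, LR = 8, 3, 0.005

adapters = [rng.normal(size=(DIM, DIM)) for _ in range(N_SOURCES)]  # frozen source adapters
scales = np.zeros(N_SOURCES)                                        # the only trainable parameters

X = rng.normal(size=(64, DIM))
Y = X @ adapters[1].T          # pretend the target task matches source adapter 1

for _ in range(300):
    P = sum(s * (X @ A.T) for s, A in zip(scales, adapters))  # combined transfer output
    R = P - Y
    grad = np.array([2 * np.mean(np.sum(R * (X @ A.T), axis=1)) for A in adapters])
    scales -= LR * grad

print("learned scales:", np.round(scales, 2))  # expected to approach [0, 1, 0]
```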
@@ -15894,7 +15894,7 @@
Yun-NungChenDepartment of Computer Science and Informational Engineering, National Taiwan University
11777-11788
For dialogue systems, the utilization of multimodal dialogue responses, as opposed to relying solely on text-only responses, offers the capability to describe different concepts through various modalities. This enhances the effectiveness of communication and elevates the overall conversational experience. However, current methods for dialogue-to-image retrieval are constrained by the capabilities of the pre-trained vision language models (VLMs). They struggle to accurately extract key information from conversations and are unable to handle long-turn conversations. In this paper, we leverage the reasoning capabilities of large language models (LLMs) to predict the potential features that may be present in the images to be shared, based on the dialogue context. This approach allows us to obtain succinct and precise descriptors, thereby improving the performance of text-image retrieval. Experimental results show that our method significantly outperforms previous approaches in terms of Recall@k.
- 2024.findings-acl.700
+ 2024.findings-acl.700
kao-chen-2024-visualizing
10.18653/v1/2024.findings-acl.700
@@ -15916,7 +15916,7 @@
MaosongSun
11789-11804
Scientific data visualization plays a crucial role in research by enabling the direct display of complex information and assisting researchers in identifying implicit patterns. Despite its importance, the use of Large Language Models (LLMs) for scientific data visualization remains rather unexplored. In this study, we introduce MatPlotAgent, an efficient model-agnostic LLM agent framework designed to automate scientific data visualization tasks. Leveraging the capabilities of both code LLMs and multi-modal LLMs, MatPlotAgent consists of three core modules: query understanding, code generation with iterative debugging, and a visual feedback mechanism for error correction. To address the lack of benchmarks in this field, we present MatPlotBench, a high-quality benchmark consisting of 100 human-verified test cases. Additionally, we introduce a scoring approach that utilizes GPT-4V for automatic evaluation. Experimental results demonstrate that MatPlotAgent can improve the performance of various LLMs, including both commercial and open-source models. Furthermore, the proposed evaluation method shows a strong correlation with human-annotated scores.
- 2024.findings-acl.701
+ 2024.findings-acl.701
yang-etal-2024-matplotagent
10.18653/v1/2024.findings-acl.701
@@ -15929,7 +15929,7 @@
XiangyunKong
11805-11816
Continual few-shot relation extraction (CFRE) aims to continually learn new relations with limited samples. However, current methods neglect the instability of embeddings in the process of different task training, which leads to serious catastrophic forgetting. In this paper, we propose the concept of the following degree from the perspective of instability to analyze catastrophic forgetting and design a novel method based on adaptive gradient correction and knowledge decomposition to alleviate catastrophic forgetting. Specifically, the adaptive gradient correction algorithm is designed to limit the instability of embeddings, which adaptively constrains the current gradient to be orthogonal to the embedding space learned from previous tasks. To reduce the instability between samples and prototypes, the knowledge decomposition module decomposes knowledge into general and task-related knowledge from the perspective of model architecture, which is asynchronously optimized during training. Experimental results on two standard benchmarks show that our method outperforms the state-of-the-art CFRE model and effectively improves the following degree of embeddings.
- 2024.findings-acl.702
+ 2024.findings-acl.702
hu-etal-2024-continual
10.18653/v1/2024.findings-acl.702
@@ -15949,7 +15949,7 @@
DeyiXiongTianjin University
11817-11837
What would a large language model (LLM) respond in an ethically relevant context? In this paper, we curate CMoralEval, a large benchmark for the morality evaluation of Chinese LLMs. The data sources of CMoralEval are two-fold: 1) a Chinese TV program discussing Chinese moral norms with stories from society and 2) a collection of Chinese moral anomies from various newspapers and academic papers on morality. With these sources, we aim to create a moral evaluation dataset characterized by diversity and authenticity. We develop a morality taxonomy and a set of fundamental moral principles that are not only rooted in traditional Chinese culture but also consistent with contemporary societal norms. To facilitate efficient construction and annotation of instances in CMoralEval, we establish a platform with AI-assisted instance generation to streamline the annotation process. This helps us curate CMoralEval, which encompasses both explicit moral scenarios (14,964 instances) and moral dilemma scenarios (15,424 instances), each with instances from different data sources. We conduct extensive experiments with CMoralEval to examine a variety of Chinese LLMs. Experimental results demonstrate that CMoralEval is a challenging benchmark for Chinese LLMs.
- 2024.findings-acl.703
+ 2024.findings-acl.703
yu-etal-2024-cmoraleval
10.18653/v1/2024.findings-acl.703
@@ -15961,7 +15961,7 @@
IvanTitovUniversity of Edinburgh and University of Amsterdam
11838-11853
Large-scale deployment of generative AI tools often depends on costly API calls to a Large Language Model (LLM) to fulfil user queries, a process that also exposes the request stream to external providers. To curtail the frequency of these calls, one can employ a smaller local language model (a student), which is continuously trained on the responses of the LLM. This student gradually gains proficiency in independently handling an increasing number of user requests, a process we term neural caching. The crucial element in neural caching is a policy that decides which requests should be processed by the student alone and which should be redirected to the LLM, subsequently aiding the student’s learning. In this study, we focus on classification tasks, and we consider a range of classic Active Learning-based selection criteria as the policy. Our experiments suggest that Margin Sampling and Query by Committee bring consistent benefits over other policies and baselines across tasks and budgets.
- 2024.findings-acl.704
+ 2024.findings-acl.704
ramirez-etal-2024-cache
10.18653/v1/2024.findings-acl.704
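The neural caching entry above turns on a routing policy; the sketch below implements the Margin Sampling variant in its simplest form: a wide top-two margin means the student answers, a narrow margin means the request is escalated to the LLM and banked as training data. The threshold and the stand-in student/LLM functions are invented for illustration.

```python
# Toy neural-caching policy with Margin Sampling: low-margin (uncertain)
# student predictions are escalated to the expensive LLM, whose answers
# are collected for further student training.
import numpy as np

rng = np.random.default_rng(0)
MARGIN_THRESHOLD = 0.2

def student_probs(query: str) -> np.ndarray:
    p = rng.random(3)                  # stand-in for a small classifier
    return p / p.sum()

def call_llm(query: str) -> int:
    return 0                           # stand-in for a costly API call

training_buffer, llm_calls = [], 0
for query in [f"request-{i}" for i in range(10)]:
    probs = student_probs(query)
    top2 = np.sort(probs)[-2:]
    if top2[1] - top2[0] >= MARGIN_THRESHOLD:
        label = int(probs.argmax())    # student is confident enough
    else:
        label = call_llm(query)        # escalate and learn from the answer
        training_buffer.append((query, label))
        llm_calls += 1
    # ... serve `label` to the user ...

print(f"LLM calls: {llm_calls}/10, new training examples: {len(training_buffer)}")
```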
@@ -15973,7 +15973,7 @@
ChristinaLiomaUniversity of Copenhagen
11854-11879
Explainable AI methods facilitate the understanding of model behaviour, yet small, imperceptible perturbations to inputs can vastly distort explanations. As these explanations are typically evaluated holistically, before model deployment, it is difficult to assess when a particular explanation is trustworthy. Some studies have tried to create confidence estimators for explanations, but none have investigated an existing link between uncertainty and explanation quality. We artificially simulate epistemic uncertainty in text input by introducing noise at inference time. In this large-scale empirical study, we insert different levels of noise perturbations and measure the effect on the output of pre-trained language models and different uncertainty metrics. Realistic perturbations have minimal effect on performance and explanations, yet masking has a drastic effect. We find that high uncertainty doesn’t necessarily imply low explanation plausibility; the correlation between the two metrics can be moderately positive when noise is exposed during the training process. This suggests that noise-augmented models may be better at identifying salient tokens when uncertain. Furthermore, when predictive and epistemic uncertainty measures are over-confident, the robustness of a saliency map to perturbation can indicate model stability issues. Integrated Gradients shows the overall greatest robustness to perturbation while still showing model-specific patterns in performance; however, this phenomenon is limited to smaller Transformer-based language models.
- 2024.findings-acl.705
+ 2024.findings-acl.705
marjanovic-etal-2024-investigating
10.18653/v1/2024.findings-acl.705
@@ -15988,7 +15988,7 @@
MinZhangHarbin Institute of Technology, Shenzhen
11880-11891
Text ranking is a critical task in information retrieval. Recent advances in pre-trained language models (PLMs), especially large language models (LLMs), present new opportunities for applying them to text ranking. While supervised fine-tuning (SFT) with ranking data has been widely explored to better align PLMs with text ranking goals, previous studies have focused primarily on encoder-only and encoder-decoder PLMs. Research on leveraging decoder-only LLMs for text ranking remains scarce. An exception to this is RankLLaMA, which uses direct SFT to explore LLaMA’s potential for text ranking. In this work, we propose a two-stage progressive paradigm to better adapt LLMs to text ranking. First, we conduct continual pre-training (CPT) of LLMs on a large weakly-supervised corpus. Second, we perform SFT, and propose an improved optimization strategy building upon RankLLaMA. Our experimental results on multiple benchmarks show that our approach outperforms previous methods in both in-domain and out-domain scenarios.
- 2024.findings-acl.706
+ 2024.findings-acl.706
zhang-etal-2024-two
10.18653/v1/2024.findings-acl.706
@@ -16003,7 +16003,7 @@
MarcoGueriniFondazione Bruno Kessler
11892-11907
Automatic methods for generating and gathering linguistic data have proven effective for fine-tuning Language Models (LMs) in languages less resourced than English. Still, while there has been emphasis on data quantity, less attention has been given to its quality. In this work, we investigate the impact of human intervention on machine-generated data when fine-tuning dialogical models. In particular, we study (1) whether post-edited dialogues exhibit higher perceived quality compared to the originals that were automatically generated; (2) whether fine-tuning with post-edited dialogues results in noticeable differences in the generated outputs; and (3) whether post-edited dialogues influence the outcomes when considering the parameter size of the LMs. To this end we created HED-IT, a large-scale dataset where machine-generated dialogues are paired with the version post-edited by humans. Using both the edited and unedited portions of HED-IT, we fine-tuned three different sizes of an LM. Results from both human and automatic evaluation show that the differing quality of training data is clearly perceived and that it also has an impact on the models trained on such data. Additionally, our findings indicate that larger models are less sensitive to data quality, whereas it has a crucial impact on smaller models. These results enhance our comprehension of the impact of human intervention on training data in the development of high-quality LMs.
- 2024.findings-acl.707
+ 2024.findings-acl.707
occhipinti-etal-2024-fine
10.18653/v1/2024.findings-acl.707
@@ -16017,7 +16017,7 @@
LeSunInstitute of Software, Chinese Academy of Sciences
11908-11922
Query expansion (QE) is a critical component in the open-domain question answering (OpenQA) pipeline, enhancing the retrieval performance by broadening the scope of queries with additional relevant texts. However, existing methods like GAR and EAR rely heavily on supervised training and often struggle to maintain effectiveness across domains and datasets. Meanwhile, although large language models (LLMs) have demonstrated QE capability for information retrieval (IR) tasks, their application in OpenQA is hindered by the inadequate analysis of a query’s informational needs and the lack of quality control for generated QEs, failing to meet the unique requirements of OpenQA. To bridge this gap, we propose a novel LLM-based QE approach named AGR for the OpenQA task, leveraging a three-step prompting strategy. AGR begins with an analysis of the query, followed by the generation of answer-oriented expansions, and culminates with a refinement process for better query formulation. Extensive experiments on four OpenQA datasets reveal that AGR not only rivals in-domain supervised methods in retrieval accuracy, but also outperforms state-of-the-art baselines in out-domain zero-shot scenarios. Moreover, it exhibits enhanced performance in end-to-end QA evaluations, underscoring the superiority of AGR for OpenQA.
- 2024.findings-acl.708
+ 2024.findings-acl.708
chen-etal-2024-analyze
10.18653/v1/2024.findings-acl.708
@@ -16037,7 +16037,7 @@
ShinjiWatanabeCarnegie Mellon University
11923-11938
The Spoken Language Understanding Evaluation (SLUE) suite of benchmark tasks was recently introduced to address the need for open resources and benchmarking of complex spoken language understanding (SLU) tasks, including both classification and sequence generation tasks, on natural speech. The benchmark has demonstrated preliminary success in using pre-trained speech foundation models (SFM) for these SLU tasks. However, the community still lacks a fine-grained understanding of the comparative utility of different SFMs. Inspired by this, we ask: which SFMs offer the most benefits for these complex SLU tasks, and what is the most effective approach for incorporating these SFMs? To answer this, we perform an extensive evaluation of multiple supervised and self-supervised SFMs using several evaluation protocols: (i) frozen SFMs with a lightweight prediction head, (ii) frozen SFMs with a complex prediction head, and (iii) fine-tuned SFMs with a lightweight prediction head. Although the supervised SFMs are pre-trained on much more speech recognition data (with labels), they do not always outperform self-supervised SFMs; the latter tend to perform at least as well as, and sometimes better than, supervised SFMs, especially on the sequence generation tasks in SLUE. While there is no universally optimal way of incorporating SFMs, the complex prediction head gives the best performance for most tasks, although it increases the inference time. We also introduce an open-source toolkit and performance leaderboard, SLUE-PERB, for these tasks and modeling strategies.
- 2024.findings-acl.709
+ 2024.findings-acl.709
arora-etal-2024-evaluation
10.18653/v1/2024.findings-acl.709
@@ -16050,7 +16050,7 @@
JieZhou
11939-11951
Recent research has shown a weak correlation between n-gram-based metrics and human evaluations in the machine translation task, particularly when evaluating large language models (LLMs). Additionally, the data leakage risk in LLMs may cause an overestimation problem when evaluating LLMs on downstream tasks. In this work, we identify the limited diversity of references as the primary cause for the inferior performance of n-gram-based metrics and the overestimation problem. To address this issue, we propose to utilize multiple references generated by LLMs, coupled with an effective selection strategy focused on accuracy and diversity, to improve the alignment between automatic metrics and human evaluations. We validate our approach on the WMT22 Metrics benchmark with 4 languages and observe a maximum accuracy gain of 9.5% in F200spBLEU, which makes it on par with computationally expensive neural-based metrics. We also show that using multiple references with n-gram-based metrics significantly alleviates the overestimation problem when evaluating LLMs with data leakage. Further analysis explores the factors that affect the quality of generated references, offering insights into data synthesis by LLMs.
- 2024.findings-acl.710
+ 2024.findings-acl.710
zeng-etal-2024-towards
10.18653/v1/2024.findings-acl.710
@@ -16067,7 +16067,7 @@
PaulaButteryUniversity of Cambridge
11952-11967
Thanks to recent advances in generative AI, we are able to prompt large language models (LLMs) to produce texts which are fluent and grammatical. In addition, it has been shown that we can elicit attempts at grammatical error correction (GEC) from LLMs when prompted with ungrammatical input sentences. We evaluate how well LLMs can perform at GEC by measuring their performance on established benchmark datasets. We go beyond previous studies, which only examined GPT* models on a selection of English GEC datasets, by evaluating seven open-source and three commercial LLMs on four established GEC benchmarks. We investigate model performance and report results against individual error types. Our results indicate that LLMs do not always outperform supervised English GEC models except in specific contexts – namely commercial LLMs on benchmarks annotated with fluency corrections as opposed to minimal edits. We find that several open-source models outperform commercial ones on minimal edit benchmarks, and that in some settings zero-shot prompting is just as competitive as few-shot prompting.
- 2024.findings-acl.711
+ 2024.findings-acl.711
davis-etal-2024-prompting
10.18653/v1/2024.findings-acl.711
@@ -16079,7 +16079,7 @@
PhilippSchaerTH Köln - University of Applied Sciences
11968-11989
Evaluation of text simplification currently focuses on the difference between a source text and its simplified variant. Datasets for this evaluation are based on a specific topic and a group of readers for whom the text is simplified. The broad applicability of text simplification and the specifics that come with intended target audiences (e.g., children compared to adult non-experts) are disregarded. An explainable assessment of the overall simplicity of text is missing. This work is BenchmArking Text Simplicity (BATS): we provide an explainable method to assess practical and concrete rules from the literature describing features of simplicity and complexity of text. Our experiments on 15 datasets for text simplification highlight differences in the features that are important in different domains of text and for different intended target audiences.
- 2024.findings-acl.712
+ 2024.findings-acl.712
kreutz-etal-2024-bats
10.18653/v1/2024.findings-acl.712
@@ -16093,7 +16093,7 @@
JuliaNeidhardtTechnische Universität Wien
11990-12001
Model interpretability in toxicity detection greatly profits from token-level annotations. However, currently, such annotations are only available in English. We introduce a dataset annotated for offensive language detection sourced from a news forum, notable for its incorporation of the Austrian German dialect, comprising 4,562 user comments. In addition to binary offensiveness classification, we identify spans within each comment constituting vulgar language or representing targets of offensive statements. We evaluate fine-tuned Transformer models as well as large language models in a zero- and few-shot fashion. The results indicate that while fine-tuned models excel in detecting linguistic peculiarities such as vulgar dialect, large language models demonstrate superior performance in detecting offensiveness in AustroTox.
- 2024.findings-acl.713
+ 2024.findings-acl.713
pachinger-etal-2024-austrotox
10.18653/v1/2024.findings-acl.713
@@ -16106,7 +16106,7 @@
EddieYangUniversity of California, San Diego
12002-12027
Experimental methods for estimating the impacts of text on human evaluation have been widely used in the social sciences. However, researchers in experimental settings are usually limited to testing a small number of pre-specified text treatments. While efforts to mine unstructured texts for features that causally affect outcomes have been ongoing in recent years, these models have primarily focused on the topics or specific words of text, which may not always be the mechanism of the effect. We connect these efforts with NLP interpretability techniques and present a method for flexibly discovering clusters of similar text phrases that are predictive of human reactions to texts using convolutional neural networks. When used in an experimental setting, this method can identify text treatments and their effects under certain assumptions. We apply the method to two data sets. The first enables direct validation of the model’s ability to detect phrases known to cause the outcome. The second demonstrates its ability to flexibly discover text treatments with varying textual structures. In both cases, the model learns a greater variety of text treatments compared to benchmark methods, and these text features quantitatively meet or exceed the ability of benchmark methods to predict the outcome.
- 2024.findings-acl.714
+ 2024.findings-acl.714
ayers-etal-2024-discovering
10.18653/v1/2024.findings-acl.714
@@ -16122,7 +16122,7 @@
JuanziLi
12028-12038
Event extraction (EE) is a critical task in natural language processing, yet deploying a practical EE system remains challenging. On one hand, powerful large language models (LLMs) currently show poor performance because the EE task is more complex than other tasks. On the other hand, state-of-the-art (SOTA) small language models (SLMs) for EE tasks are typically developed through fine-tuning, lack flexibility, and have considerable room for improvement. We propose an approach, **L**LMs-as-**C**orrector for **E**vent **E**xtraction (**LC4EE**), aiming to leverage the superior extraction capability of SLMs and the instruction-following ability of LLMs to construct a robust and highly available EE system. By utilizing LLMs to identify and correct errors in SLMs’ predictions based on automatically generated feedback information, EE performance can be improved significantly. Experimental results on the representative datasets ACE2005 and MAVEN-Arg for Event Detection (ED) and EE tasks validate the effectiveness of our method.
- 2024.findings-acl.715
+ 2024.findings-acl.715
zhu-etal-2024-lc4ee
10.18653/v1/2024.findings-acl.715
@@ -16137,7 +16137,7 @@
GeLiPeking University Shenzhen Graduate School
12039-12050
Recent statements about the impressive capabilities of large language models (LLMs) are usually supported by evaluating on open-access benchmarks. Considering the vast size and wide-ranging sources of LLMs’ training data, that data could explicitly or implicitly include test data, leading to LLMs being more susceptible to data contamination. However, due to the opacity of training data, the black-box access of models, and the rapid growth of synthetic training data, detecting and mitigating data contamination for LLMs faces significant challenges. In this paper, we propose CDD, which stands for Contamination Detection via output Distribution for LLMs. CDD necessitates only the sampled texts to detect data contamination, by identifying the peakedness of LLM’s output distribution. To mitigate the impact of data contamination in evaluation, we also present TED: Trustworthy Evaluation via output Distribution, based on the correction of LLM’s output distribution. To facilitate this study, we introduce two benchmarks, i.e., DETCON and COMIEVAL, for data contamination detection and contamination mitigation evaluation tasks. Extensive experimental results show that CDD achieves the average relative improvements of 21.8%-30.2% over other contamination detection approaches in terms of Accuracy, F1 Score, and AUC metrics, and can effectively detect implicit contamination. TED substantially mitigates performance improvements up to 66.9% attributed to data contamination across various contamination setups. In real-world applications, we reveal that ChatGPT exhibits a high potential to suffer from data contamination on the HumanEval benchmark.
- 2024.findings-acl.716
+ 2024.findings-acl.716
dong-etal-2024-generalization
10.18653/v1/2024.findings-acl.716
@@ -16148,7 +16148,7 @@
SunitaSarawagiIIT Bombay
12051-12064
Maximizing the likelihood of the next token is an established, statistically sound objective for pre-training language models. In this paper we show that we can train better models faster by pre-aggregating the corpus with a collapsed n-gram distribution. Previous studies have proposed corpus-level n-gram statistics as a regularizer; however, the construction and querying of such n-grams, if done naively, prove to be costly and significantly impede training speed, thereby limiting their application in modern large language model pre-training. We introduce an alternative compact representation of the next token distribution that, in expectation, aligns with the complete n-gram distribution while markedly reducing variance across mini-batches compared to the standard next-token loss. Empirically, we demonstrate that both the n-gram regularized model and our approximation yield substantial improvements in model quality and convergence rate compared to existing methods. Furthermore, our approximation facilitates scalability of gains to larger datasets and models compared to the straightforward n-gram regularization method.
- 2024.findings-acl.717
+ 2024.findings-acl.717
sathe-sarawagi-2024-efficient
10.18653/v1/2024.findings-acl.717
@@ -16160,7 +16160,7 @@
HaoXuJilin University
12065-12074
The ancestors of the Chinese character – the ancient characters from about 1300 BC to 200 BC – are not fixed in their writing glyphs. At the same or different points in time, one character can possess multiple glyphs that differ in shape or radicals. Nearly half of the ancient glyphs have not yet been deciphered. This paper proposes an innovative task of ancient Chinese glyph identification, which aims at inferring the Chinese character label for unknown ancient Chinese glyphs that are not in the training set, based on image and radical information. Specifically, we construct a Chinese glyph knowledge graph (CGKG) associating glyphs in different historical periods according to their radical semantics, and propose a multimodal Chinese glyph identification framework (MCGI) fusing visual, textual, and graph data. Experiments on a real Chinese glyph dataset spanning over 1000 years demonstrate the effectiveness of our method and report the potential of each modality on this task. This provides a preliminary reference for automatic ancient Chinese character decipherment at the glyph level.
- 2024.findings-acl.718
+ 2024.findings-acl.718
chi-etal-2024-ancient
10.18653/v1/2024.findings-acl.718
@@ -16175,7 +16175,7 @@
PushpakBhattacharyyaIndian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology
12075-12097
LLMs have demonstrated remarkable capability for understanding semantics, but their understanding of pragmatics is not well studied. To this end, we release a Pragmatics Understanding Benchmark (PUB) dataset consisting of fourteen tasks in four pragmatics phenomena, namely: Implicature, Presupposition, Reference, and Deixis. We curate high-quality test sets for each task, consisting of Multiple Choice Question Answers (MCQA). PUB includes a total of 28k data points, of which 6.1k are newly annotated. We evaluate nine models varying in the number of parameters and type of training. Our study reveals several key observations about the pragmatic capabilities of LLMs: 1. chat-fine-tuning strongly benefits smaller models, 2. large base models are competitive with their chat-fine-tuned counterparts, 3. there is a huge variance in performance across different pragmatics phenomena, and 4. there is a noticeable performance gap between human and model capabilities. We hope that PUB will enable comprehensive evaluation of LLMs’ pragmatic reasoning capabilities.
- 2024.findings-acl.719
+ 2024.findings-acl.719
sravanthi-etal-2024-pub
10.18653/v1/2024.findings-acl.719
@@ -16187,7 +16187,7 @@
ZixingZhangHunan University
12098-12110
This paper introduces EmoTransKG, an innovative Emotion Knowledge Graph (EKG) that establishes connections and transformations between emotions across diverse open-textual events. Compared to existing EKGs, which primarily focus on linking emotion keywords to related terms or on assigning sentiment dimension ratings to emotion words by humans, EmoTransKG aims to represent the general knowledge involved in emotion transformation. Specifically, in conversations, successive emotions expressed by a single speaker are temporally considered as the head and tail entities, with open-text utterances (events) occurring between them representing the relation. To explore the knowledge of emotion transformations described in EmoTransKG, we develop a Transformer-based translational model called EmoTransNet, which predictively trains tail entities by interpreting the relation as an operation that transforms the source emotion into the target emotion. Particularly, our designed EmoTransNet serves as a plug-in module that seamlessly integrates with any conversational emotion recognition (CER) models for emotion retrofitting. Experimental results on two CER datasets demonstrate that the incorporation of EmoTransNet with baseline models results in substantial improvements, and the qualitative visualization of entities and relations clearly illustrates their unique roles in emotion transformations. These experiments confirm the quality and effectiveness of EmoTransKG.
- 2024.findings-acl.720
+ 2024.findings-acl.720
zhao-etal-2024-emotranskg
10.18653/v1/2024.findings-acl.720
@@ -16200,7 +16200,7 @@
LeiLiSchool of Computer Science, Carnegie Mellon University
12111-12130
Large Language Models (LLMs) often show strong performance on English tasks, while exhibiting limitations on other languages. What is an LLM’s multilingual capability when it is trained only on certain languages? The underlying mechanism remains unclear. This study endeavors to examine the multilingual capability of LLMs from the vocabulary sharing perspective by conducting an exhaustive analysis across 101 languages. Through the investigation of the performance gap before and after embedding fine-tuning, we discovered four distinct quadrants. By delving into each quadrant we provide actionable and efficient guidelines for tuning these languages. Extensive experiments reveal that existing LLMs possess multilingual capabilities that surpass our expectations, and we can significantly improve the multilingual performance of LLMs based on these attributes of each quadrant.
- 2024.findings-acl.721
+ 2024.findings-acl.721
yuan-etal-2024-vocabulary
10.18653/v1/2024.findings-acl.721
@@ -16213,7 +16213,7 @@
YueZhangWestlake University
12131-12145
While supervised fine-tuning (SFT) has been a straightforward approach for tailoring the output of a foundation large language model (LLM) to specific preferences, concerns have been raised about the depth of this alignment, with some critiques suggesting it is merely “superficial”. We critically examine this hypothesis within the scope of cross-lingual generation tasks, proposing that the effectiveness of SFT may be constrained by its reliance on prior tokens to guide cross-lingual generation. Based on this crucial insight, and in response to the challenges posed by the costly and limited availability of non-English data for SFT, we introduce a novel training-free alignment method named PreTTY, which employs minimal task-related prior tokens to bridge the foundation LLM and the SFT LLM, achieving comparable performance without training. Experiments on machine translation and part-of-speech tagging across seven languages demonstrate the efficacy of PreTTY in cross-lingual settings. Remarkably, by initiating the decoding process with only one or two prior tokens, foundation LLMs can attain up to 98% of the performance metrics of their SFT counterparts. This method presents a cost-effective alternative to traditional SFT and advances the democratization of multilingual LLMs.
- 2024.findings-acl.722
+ 2024.findings-acl.722
zhan-etal-2024-prefix
10.18653/v1/2024.findings-acl.722
@@ -16229,7 +16229,7 @@
ShuangyongSong
12146-12158
Hierarchical text classification aims at categorizing texts into a multi-tiered tree-structured hierarchy of labels. Existing methods pay more attention to capturing hierarchy-aware text features by exploiting explicit parent-child relationships, while interactions between peer labels are rarely taken into account, resulting in severe label confusion within each layer. In this work, we propose a novel Dual Prompt Tuning (DPT) method, which emphasizes identifying discrimination among peer labels by performing contrastive learning on each hierarchical layer. We design an innovative hand-crafted prompt containing slots for both positive and negative label predictions to cooperate with contrastive learning. In addition, we introduce a label hierarchy self-sensing auxiliary task to ensure cross-layer label consistency. Extensive experiments demonstrate that DPT achieves significant improvements and outperforms the current state-of-the-art methods on BGC and RCV1-V2 benchmark datasets.
- 2024.findings-acl.723
+ 2024.findings-acl.723
xiong-etal-2024-dual
10.18653/v1/2024.findings-acl.723
@@ -16241,7 +16241,7 @@
EdoardoPontiUniversity of Edinburgh
12159-12173
Multilingual Large Language Models (LLMs) achieve remarkable levels of zero-shot cross-lingual transfer performance. We speculate that this is predicated on their ability to align languages without explicit supervision from parallel sentences. While representations of translationally equivalent sentences in different languages are known to be similar after convergence, it remains unclear how such cross-lingual alignment emerges during pre-training of LLMs. Our study leverages intrinsic probing techniques, which identify which subsets of neurons encode linguistic features, to correlate the degree of cross-lingual neuron overlap with the zero-shot cross-lingual transfer performance for a given model. In particular, we rely on checkpoints of BLOOM, a multilingual autoregressive LLM, across different training steps and model scales. We observe a high correlation between neuron overlap and downstream performance, which supports our hypothesis on the conditions leading to effective cross-lingual transfer. Interestingly, we also detect a degradation of both implicit alignment and multilingual abilities in certain phases of the pre-training process, providing new insights into the multilingual pretraining dynamics.
- 2024.findings-acl.724
+ 2024.findings-acl.724
wang-etal-2024-probing-emergence
10.18653/v1/2024.findings-acl.724
@@ -16256,7 +16256,7 @@
HaoranZheng
12174-12185
This study introduces the Semantic Textual Similarity Pseudo-Label Semi-Supervised Clustering (STSPL-SSC) framework. The STSPL-SSC framework is designed to tackle the prevalent issue of scarce labeled data by combining a Semantic Textual Similarity Pseudo-Label Generation process with a Robust Contrastive Learning module. The process begins with employing k-means clustering on embeddings for initial pseudo-label allocation. Then we use a Semantic Text Similarity-enhanced module to supervise the secondary clustering of pseudo-labels using labeled data to better align with the real clustering centers. Subsequently, an Adaptive Optimal Transport (AOT) approach fine-tunes the pseudo-labels. Finally, a Robust Contrastive Learning module is employed to foster the learning of classification and instance-level distinctions, aiding clusters to better separate. Experiments conducted on multiple real-world datasets demonstrate that with just one label per class, clustering performance can be significantly improved, outperforming state-of-the-art models with an increase of 1-6% in both accuracy and normalized mutual information, approaching the results of fully-labeled classification.
- 2024.findings-acl.725
+ 2024.findings-acl.725
nie-etal-2024-stspl
10.18653/v1/2024.findings-acl.725
@@ -16272,7 +16272,7 @@
DeyiXiongTianjin University
12186-12215
Increasing the number of parameters in large language models (LLMs) usually improves performance in downstream tasks but raises compute and memory costs, making deployment difficult in resource-limited settings. Quantization techniques, which reduce the bits needed for model weights or activations with minimal performance loss, have become popular due to the rise of LLMs. However, most quantization studies use pre-trained LLMs, and the impact of quantization on instruction-tuned LLMs and the relationship between perplexity and benchmark performance of quantized LLMs are not well understood. Evaluation of quantized LLMs is often limited to language modeling and a few classification tasks, leaving their performance on other benchmarks unclear. To address these gaps, we propose a structured evaluation framework consisting of three critical dimensions: (1) knowledge & capacity, (2) alignment, and (3) efficiency, and conduct extensive experiments across ten diverse benchmarks. Our experimental results indicate that LLMs with 4-bit quantization can retain performance comparable to their non-quantized counterparts, and perplexity can serve as a proxy metric for quantized LLMs on most benchmarks. Furthermore, quantized LLMs with larger parameter scales can outperform smaller LLMs. Despite the memory savings achieved through quantization, it can also slow down the inference speed of LLMs. Consequently, substantial engineering efforts and hardware support are imperative to achieve a balanced optimization of decoding speed and memory consumption in the context of quantized LLMs.
- 2024.findings-acl.726
+ 2024.findings-acl.726
jin-etal-2024-comprehensive
10.18653/v1/2024.findings-acl.726
@@ -16287,7 +16287,7 @@
JingBoZhuNortheastern University
12216-12228
Neural Machine Translation (NMT) encounters challenges when translating in new domains and low-resource languages. To address these issues, researchers have proposed methods to integrate additional knowledge into NMT, such as translation memories (TMs). However, finding TMs that closely match the input sentence remains challenging, particularly in specific domains. On the other hand, monolingual data is widely accessible in most languages, and back-translation is seen as a promising approach for utilizing target language data. Nevertheless, it still necessitates additional training. In this paper, we introduce Pseudo-kNN-MT, a variant of k-nearest neighbor machine translation (kNN-MT) that utilizes target language data by constructing a pseudo datastore. Furthermore, we investigate the utility of large language models (LLMs) for the kNN component. Experimental results demonstrate that our approach exhibits strong domain adaptation capability in both high-resource and low-resource machine translation. Notably, LLMs are found to be beneficial for robust NMT systems.
- 2024.findings-acl.727
+ 2024.findings-acl.727
reheman-etal-2024-exploiting
10.18653/v1/2024.findings-acl.727
@@ -16299,7 +16299,7 @@
GabriellaKazaiAmazon
12229-12272
An important requirement for the reliable deployment of pre-trained large language models (LLMs) is the well-calibrated quantification of the uncertainty in their outputs. While the likelihood of predicting the next token is a practical surrogate of the data uncertainty learned during training, model uncertainty, which arises from a lack of knowledge acquired during training, is challenging to estimate. Prior efforts to quantify uncertainty of neural networks require specific architectures or (re-)training strategies, which are impractical to apply to LLMs with several billion parameters, or for black-box models where the architecture and parameters are not available. In this paper, we propose Bayesian Prompts Ensembles (BayesPE), a novel approach to effectively obtain well-calibrated uncertainty for the output of pre-trained LLMs. BayesPE computes output probabilities through a weighted ensemble of different, but semantically equivalent, task instruction prompts. The relative weights of the different prompts in the ensemble are estimated through approximate Bayesian variational inference over a small labeled validation set. We demonstrate that BayesPE approximates a Bayesian input layer for the LLM, providing a lower bound on the expected model error. In our extensive experiments, we show that BayesPE achieves significantly superior uncertainty calibration compared to several baselines over a range of natural language classification tasks, both in zero- and few-shot settings.
- 2024.findings-acl.728
+ 2024.findings-acl.728
tonolini-etal-2024-bayesian
10.18653/v1/2024.findings-acl.728
@@ -16310,7 +16310,7 @@
Zhen-HuaLingUniversity of Science and Technology of China
12273-12287
Automated audio captioning (AAC) aims to generate descriptions based on audio input, attracting exploration of emerging audio language models (ALMs). However, current evaluation metrics only provide a single score to assess the overall quality of captions without characterizing the nuanced difference by systematically going through an evaluation checklist. To this end, we propose the explainable and multi-factor audio captioning evaluation (X-ACE) paradigm. X-ACE identifies four main factors that constitute the majority of audio features, specifically sound event, source, attribute and relation. To assess a given caption from an ALM, it is firstly transformed into an audio graph, where each node denotes an entity in the caption and corresponds to a factor. On the one hand, graph matching is conducted from part to whole for a holistic assessment. On the other hand, the nodes contained within each factor are aggregated to measure the factor-level performance. The pros and cons of an ALM can be explicitly and clearly demonstrated through X-ACE, pointing out the direction for further improvements. Experiments show that X-ACE exhibits better correlation with human perception and can detect mismatches sensitively.
- 2024.findings-acl.729
+ 2024.findings-acl.729
wang-etal-2024-x
10.18653/v1/2024.findings-acl.729
@@ -16323,7 +16323,7 @@
ShumingShiTencent AI Lab
12288-12304
As humans, we consistently interact with our peers and receive feedback in the form of natural language. This language feedback allows us to maintain appropriate behavior, and rectify potential errors. The question arises naturally: can we use language feedback to align large language models (LLMs)? In contrast to previous research that aligns LLMs with scalar rewards, we present the first systematic exploration of alignment through the lens of language feedback (i.e., judgment). We start with an in-depth investigation of potential methods that can be adapted for aligning LLMs with judgments, revealing that these methods cannot fully capitalize on judgments. To facilitate more effective utilization of judgments, we propose a novel framework, Contrastive Unlikelihood Training (CUT), that allows for fine-grained inappropriate content detection and correction based on judgments. Our results show that, with merely 1317 off-the-shelf judgment data, CUT can beat the 175B DaVinci003 and surpass the best baseline by 50.84 points on AlpacaEval using LLaMA2-13b. CUT can also align LLMs in an iterative fashion using up-to-date model-specific judgments, improving performance from 81.09 to 91.68 points on AlpacaEval using LLaMA2-chat-13b. Further analysis suggests that judgments hold greater potential in LLM alignment than rewards.
- 2024.findings-acl.730
+ 2024.findings-acl.730
xu-etal-2024-reasons
10.18653/v1/2024.findings-acl.730
@@ -16339,7 +16339,7 @@
RuifengXuHarbin Institute of Technology
12305-12322
Argumentative Essay Generation (AEG) is a challenging task in computational argumentation, where detailed logical reasoning and effective rhetorical skills are essential. Previous methods on argument generation typically involve planning prior to generation. However, the planning strategies in these methods overlook the exploration of the logical reasoning process. Inspired by argument structure-related theories, we propose an argumentative planning strategy for prompting large language models (LLMs) to generate high-quality essays. This strategy comprises two stages: (1) Sketch planning, which creates a rough outline of the essay, and (2) Dialectical planning, which refines the outline through critical self-reflection. Such a planning strategy enables LLMs to write argumentative essays that are more logical, diverse, and persuasive. Furthermore, due to the scarcity of existing AEG datasets, we construct three new datasets. These datasets are from two domains: exam essays and news editorials, covering both Chinese and English. Automatic and manual evaluation on four datasets show that our method can generate more dialectical and persuasive essays with higher diversity compared to several strong baselines.
- 2024.findings-acl.731
+ 2024.findings-acl.731
he-etal-2024-decomposing
10.18653/v1/2024.findings-acl.731
@@ -16350,7 +16350,7 @@
PreslavNakovMohamed bin Zayed University of Artificial Intelligence
12323-12334
Recognizing fallacies is crucial for ensuring the quality and validity of arguments across various domains. However, computational fallacy recognition faces challenges due to the diverse genres, domains, and types of fallacies found in datasets. This leads to a highly multi-class, and even multi-label, setup with substantial class imbalance. In this study, we aim to enhance existing models for fallacy recognition by incorporating additional context and by leveraging large language models to generate synthetic data, thus increasing the representation of the infrequent classes. We experiment with GPT3.5 to generate synthetic examples and we examine the impact of prompt settings for this. Moreover, we explore zero-shot and few-shot scenarios to evaluate the effectiveness of using the generated examples for training smaller models within a unified fallacy recognition framework. Furthermore, we analyze the overlap between the synthetic data and existing fallacy datasets. Finally, we investigate the usefulness of providing supplementary context for detecting fallacy types that need such context, e.g., diversion fallacies. Our evaluation results demonstrate consistent improvements across fallacy types, datasets, and generators. The code and the synthetic datasets are all publicly available.
- 2024.findings-acl.732
+ 2024.findings-acl.732
alhindi-etal-2024-large
10.18653/v1/2024.findings-acl.732
@@ -16362,7 +16362,7 @@
PetrSojkaFaculty of Informatics, Masaryk University
12335-12352
Many recent language models (LMs) are capable of in-context learning (ICL), manifested in the LMs’ ability to perform a new task solely from natural-language instruction. Previous work curating in-context learners assumes that ICL emerges from a vast over-parametrization or the scale of multi-task training. However, recent theoretical work attributes the ICL ability to concept-dependent training data and creates functional in-context learners even in small-scale, synthetic settings. In this work, we practically explore this newly identified axis of ICL quality. We propose Concept-aware Training (CoAT), a framework for constructing training scenarios that make it beneficial for the LM to learn to utilize the analogical reasoning concepts from demonstrations. We find that by using CoAT, pre-trained transformers can learn to better utilise new latent concepts from demonstrations and that such ability makes ICL more robust to the functional deficiencies of the previous models. Finally, we show that concept-aware in-context learners are much more effective in in-context learning a majority of unseen tasks compared to traditional instruction tuning, and fare comparably also to previous in-context learners trained in large-scale multitask learning requiring magnitudes of more training data.
- 2024.findings-acl.733
+ 2024.findings-acl.733
stefanik-etal-2024-concept
10.18653/v1/2024.findings-acl.733
@@ -16373,7 +16373,7 @@
KokilJaidkaNational University of Singapore
12353-12360
Supervised machine-learning models for predicting user behavior offer a challenging classification problem with lower average prediction performance scores than other text classification tasks. This study evaluates multi-task learning frameworks grounded in Cognitive Appraisal Theory to predict user behavior as a function of users’ self-expression and psychological attributes. Our experiments show that users’ language and traits improve predictions above and beyond models predicting only from text. Our findings highlight the importance of integrating psychological constructs into NLP to enhance the understanding and prediction of user actions. We close with a discussion of the implications for future applications of large language models for computational psychology.
- 2024.findings-acl.734
+ 2024.findings-acl.734
yeo-etal-2024-beyond
10.18653/v1/2024.findings-acl.734
@@ -16385,7 +16385,7 @@
WeiLuSingapore University of Technology and Design
12361-12372
In non-autoregressive translation (NAT), directed acyclic Transformers (DAT) have demonstrated their ability to achieve comparable performance to the autoregressive Transformers. In this paper, we first show that DAT is essentially a fully connected left-to-right Hidden Markov Model (HMM), with the source and target sequences being observations and the token positions being latent states. Even though generative models like HMM do not suffer from label bias in traditional task settings (e.g., sequence labeling), we argue here that the left-to-right HMM in NAT may still encounter this issue due to the missing observations at the inference stage. To combat label bias, we propose two constrained HMMs: 1) Adaptive Window HMM, which explicitly balances the number of outgoing transitions at different states; 2) Bi-directional HMM, i.e., a combination of left-to-right and right-to-left HMMs, whose uni-directional components can implicitly regularize each other’s biases via shared parameters. Experimental results on WMT’14 EnDe and WMT’17 ZhEn demonstrate that our methods can achieve better or comparable performance to the original DAT using various decoding methods. We also demonstrate that our methods effectively reduce the impact of label bias.
- 2024.findings-acl.735
+ 2024.findings-acl.735
li-etal-2024-non
10.18653/v1/2024.findings-acl.735
@@ -16401,7 +16401,7 @@
RuifengXuHarbin Institute of Technology
12373-12387
Stance detection is a challenging task that aims to identify public opinion from social media platforms with respect to specific targets. Previous work on stance detection largely focused on pure texts. In this paper, we study multi-modal stance detection for tweets consisting of texts and images, which are prevalent in today’s fast-growing social media platforms where people often post multi-modal messages. To this end, we create five new multi-modal stance detection datasets of different domains based on Twitter, in which each example consists of a text and an image. In addition, we propose a simple yet effective Targeted Multi-modal Prompt Tuning framework (TMPT), where target information is leveraged to learn multi-modal stance features from textual and visual modalities. Experimental results on our five benchmark datasets show that the proposed TMPT achieves state-of-the-art performance in multi-modal stance detection.
- 2024.findings-acl.736
+ 2024.findings-acl.736
liang-etal-2024-multi
10.18653/v1/2024.findings-acl.736
@@ -16413,7 +16413,7 @@
LuWangNortheastern University, Northeastern University and University of Michigan
12388-12400
Large language models (LLMs) can generate long-form and coherent text, yet they often hallucinate facts, which undermines their reliability. To mitigate this issue, inference-time methods steer LLM representations toward the “truthful directions” previously learned for truth elicitation. However, applying these truthful directions with the same intensity fails to generalize across different query contexts. We propose LITO, a Learnable Intervention method for Truthfulness Optimization that automatically identifies the optimal intervention intensity tailored to each specific context. LITO explores a sequence of model generations based on increasing levels of intervention intensities. It selects the most accurate response or refuses to answer when the predictions are highly uncertain. Experiments on multiple LLMs and question-answering datasets demonstrate that LITO improves truthfulness while preserving task accuracy. The adaptive nature of LITO counters the limitations of one-size-fits-all intervention methods, maximizing truthfulness by reflecting the model’s internal knowledge only when it is confident. Our code is available at https://github.com/launchnlp/LITO.
- 2024.findings-acl.737
+ 2024.findings-acl.737
fatahi-bayat-etal-2024-enhanced
10.18653/v1/2024.findings-acl.737
@@ -16429,7 +16429,7 @@
DongYuTencent AI Lab
12401-12430
In the past year, MultiModal Large Language Models (MM-LLMs) have undergone substantial advancements, augmenting off-the-shelf LLMs to support MM inputs or outputs via cost-effective training strategies. The resulting models not only preserve the inherent reasoning and decision-making capabilities of LLMs but also empower a diverse range of MM tasks. In this paper, we provide a comprehensive survey aimed at facilitating further research of MM-LLMs. Initially, we outline general design formulations for model architecture and training pipeline. Subsequently, we introduce a taxonomy encompassing 126 MM-LLMs, each characterized by its specific formulations. Furthermore, we review the performance of selected MM-LLMs on mainstream benchmarks and summarize key training recipes to enhance the potency of MM-LLMs. Finally, we explore promising directions for MM-LLMs while concurrently maintaining a [real-time tracking website](https://mm-llms.github.io/) for the latest developments in the field. We hope that this survey contributes to the ongoing advancement of the MM-LLMs domain.
- 2024.findings-acl.738
+ 2024.findings-acl.738
zhang-etal-2024-mm
10.18653/v1/2024.findings-acl.738
@@ -16457,7 +16457,7 @@
JieFuHong Kong University of Science and Technology
12431-12446
The advancement of large language models (LLMs) has enhanced the ability to generalize across a wide range of unseen natural language processing (NLP) tasks through instruction-following. Yet, their effectiveness often diminishes in low-resource languages like Chinese, exacerbated by biased evaluations from data leakage, casting doubt on their true generalizability to new linguistic territories. In response, we introduce the Chinese Instruction-Following Benchmark (**CIF-Bench**), designed to evaluate the zero-shot generalizability of LLMs to the Chinese language. CIF-Bench comprises 150 tasks and 15,000 input-output pairs, developed by native speakers to test complex reasoning and Chinese cultural nuances across 20 categories. To mitigate data contamination, we release only half of the dataset publicly, with the remainder kept private, and introduce diversified instructions to minimize score variance, totaling 45,000 data instances. Our evaluation of 28 selected LLMs reveals a noticeable performance gap, with the best model scoring only 52.9%, highlighting the limitations of LLMs in less familiar language and task contexts. This work not only uncovers the current limitations of LLMs in handling Chinese language tasks but also sets a new standard for future LLM generalizability research, pushing towards the development of more adaptable, culturally informed, and linguistically diverse models.
- 2024.findings-acl.739
+ 2024.findings-acl.739
li-etal-2024-cif
10.18653/v1/2024.findings-acl.739
@@ -16471,7 +16471,7 @@
OlivierPietquinCohere and Earth Species Project
12447-12472
While reinforcement learning (RL) has been proven essential for tuning large language models (LLMs), it can lead to reward over-optimization (ROO). Existing approaches address ROO by adding KL regularization, requiring computationally expensive hyperparameter tuning. Additionally, KL regularization focuses solely on regularizing the language policy, neglecting a potential source of regularization: the reward function itself. Inspired by demonstration-guided RL, we here introduce the Reward Calibration from Demonstration (RCfD), which leverages human demonstrations and a reward model to recalibrate the reward objective. Formally, given a prompt, the RCfD objective minimizes the distance between the demonstrations’ and LLM’s rewards rather than directly maximizing the reward function. This objective shift avoids incentivizing the LLM to exploit the reward model and promotes more natural and diverse language generation. We show the effectiveness of RCfD in three RL language tasks, where it achieves comparable performance to carefully tuned baselines while mitigating ROO.
- 2024.findings-acl.740
+ 2024.findings-acl.740
rita-etal-2024-countering
10.18653/v1/2024.findings-acl.740
@@ -16483,7 +16483,7 @@
AlineVillavicencioUniversity of Exeter and University of Sheffield
12473-12485
Accurately modeling idiomatic or non-compositional language has been a longstanding challenge in Natural Language Processing (NLP). This is partly because these expressions do not derive their meanings solely from their constituent words, but also due to the scarcity of relevant data resources, and their impact on the performance of downstream tasks such as machine translation and simplification. In this paper we propose an approach to model idiomaticity effectively, using a triplet loss that incorporates the asymmetric contribution of component words to an idiomatic meaning, and training language models with adaptive contrastive learning and resampling miners to build an idiomaticity-aware learning objective. Our proposed method is evaluated on a SemEval challenge and outperforms previous alternatives significantly in many metrics.
- 2024.findings-acl.741
+ 2024.findings-acl.741
he-etal-2024-enhancing
10.18653/v1/2024.findings-acl.741
@@ -16496,7 +16496,7 @@
XipengQiuFudan University
12486-12502
Large language models have achieved remarkable success, but their extensive parameter size necessitates substantial memory for training, thereby setting a high threshold. While the recently proposed low-memory optimization (LOMO) reduces memory footprint, its optimization technique, akin to stochastic gradient descent, is sensitive to hyper-parameters and exhibits suboptimal convergence, failing to match the performance of the prevailing optimizer for large language models, AdamW. Through analysis of the Adam optimizer, we found that, compared to momentum, the adaptive learning rate is more critical for bridging the gap. Building on this insight, we introduce the low-memory optimization with adaptive learning rate (AdaLomo), which offers an adaptive learning rate for each parameter and exhibits superior convergence performance compared to LOMO theoretically. To maintain memory efficiency, we employ non-negative matrix factorization for the second-order moment estimation. Additionally, we suggest the use of a grouped update normalization to stabilize convergence. Our experiments with instruction-tuning and further pre-training demonstrate that AdaLomo achieves results on par with AdamW, while significantly reducing memory requirements, thereby lowering the hardware barrier to training large language models. The code is accessible at https://github.com/OpenLMLab/LOMO.
- 2024.findings-acl.742
+ 2024.findings-acl.742
lv-etal-2024-adalomo
10.18653/v1/2024.findings-acl.742
@@ -16510,7 +16510,7 @@
ZhiguoWang
12503-12525
Current knowledge editing approaches struggle to effectively propagate updates to interconnected facts. In this work, we delve into the barriers that hinder the appropriate propagation of updated knowledge within these models for accurate reasoning. To support our analysis, we introduce a novel reasoning-based benchmark, ReCoE (Reasoning-based Counterfactual Editing dataset), which covers six common reasoning schemes in the real world. We conduct an extensive analysis of existing knowledge editing techniques, including input-augmentation, finetuning, and locate-and-edit methods. We found that all model editing methods exhibit notably low performance on this dataset, especially within certain reasoning schemes. Our analysis of the chain-of-thought responses from edited models indicates that, while the models effectively update individual facts, they struggle to recall these facts in reasoning tasks. Moreover, locate-and-edit methods severely deteriorate the models’ language modeling capabilities, leading to poor perplexity and logical coherence in their outputs.
- 2024.findings-acl.743
+ 2024.findings-acl.743
hua-etal-2024-propagation
10.18653/v1/2024.findings-acl.743
@@ -16524,7 +16524,7 @@
MariaLiakataQueen Mary University London
12526-12537
With the rise of social media platforms, longitudinal language modelling has received much attention in recent years, especially in downstream tasks such as mental health monitoring of individuals where modelling linguistic content in a temporal fashion is crucial. A key limitation in existing work is how to effectively model temporal sequences within Transformer-based language models. In this work we address this challenge by introducing a novel approach for predicting ‘Moments of Change’ (MoC) in the mood of online users, by simultaneously considering user linguistic and time-aware context. A Hawkes process-inspired transformation layer is applied over the proposed architecture to model the influence of time on users’ posts – capturing both their immediate and historical dynamics. We perform experiments on the two existing datasets for the MoC task and showcase clear performance gains when leveraging the proposed layer. Our ablation study reveals the importance of considering temporal dynamics in detecting subtle and rare mood changes. Our results indicate that considering linguistic and temporal information in a hierarchical manner provides valuable insights into the temporal dynamics of modelling user generated content over time, with applications in mental health monitoring.
- 2024.findings-acl.744
+ 2024.findings-acl.744
hills-etal-2024-exciting
10.18653/v1/2024.findings-acl.744
@@ -16540,7 +16540,7 @@
XiaodanLiang
12538-12559
Understanding and following natural language instructions while navigating through complex, real-world environments poses a significant challenge for general-purpose robots. These environments often include obstacles and pedestrians, making it essential for autonomous agents to possess the capability of self-corrected planning to adjust their actions based on feedback from the surroundings. However, the majority of existing vision-and-language navigation (VLN) methods primarily operate in less realistic simulator settings and do not incorporate environmental feedback into their decision-making processes. To address this gap, we introduce a novel zero-shot framework called CorNav, utilizing a large language model for decision-making and comprising two key components: 1) incorporating environmental feedback for refining future plans and adjusting its actions, and 2) multiple domain experts for parsing instructions, scene understanding, and refining predicted actions. In addition to the framework, we develop a 3D simulator that renders realistic scenarios using Unreal Engine 5. To evaluate the effectiveness and generalization of navigation agents in a zero-shot multi-task setting, we create a benchmark called NavBench. Our empirical study involves deploying 7 baselines across four tasks, i.e., goal-conditioned navigation given a specific object category, goal-conditioned navigation given simple instructions, finding abstract objects based on high-level instructions, and step-by-step instruction following. Extensive experiments demonstrate that CorNav consistently outperforms all baselines by a significant margin across all tasks. On average, CorNav achieves a success rate of 28.1%, surpassing the best baseline’s performance of 20.5%.
- 2024.findings-acl.745
+ 2024.findings-acl.745
liang-etal-2024-cornav
10.18653/v1/2024.findings-acl.745
@@ -16563,7 +16563,7 @@
ChenghuaLinUniversity of Manchester
12560-12574
Multi-modal information retrieval (MMIR) is a rapidly evolving field where significant progress has been made through advanced representation learning and cross-modality alignment research, particularly in image-text pairing. However, current benchmarks for evaluating MMIR performance on image-text pairings overlook the scientific domain, which has a notable gap with the generic data since the caption of scientific charts and tables usually describes the analysis of experimental results or scientific principles in contrast to human activity or scenery depicted in generic images. To bridge this gap, we develop a scientific domain-specific MMIR benchmark (SciMMIR) by leveraging open-access research paper corpora to extract data relevant to the scientific domain. This benchmark comprises 530K meticulously curated image-text pairs, extracted from figures and tables with detailed captions from scientific documents. We further annotate the image-text pairs with a two-level subset-subcategory hierarchy to facilitate a more comprehensive evaluation of the baselines. We conduct zero-shot and fine-tuned evaluations on prominent multi-modal image-captioning and visual language models, such as CLIP, BLIP, and BLIP-2. Our findings offer critical insights for MMIR in the scientific domain, including the impact of pre-training and fine-tuning settings and the effects of different visual and textual encoders.
- 2024.findings-acl.746
+ 2024.findings-acl.746
wu-etal-2024-scimmir
10.18653/v1/2024.findings-acl.746
@@ -16574,7 +16574,7 @@
YiannisAloimonosUniversity of Maryland, College Park
12575-12584
Videos are more informative than images because they capture the dynamics of the scene. By representing motion in videos, we can capture dynamic activities. In this work, we introduce GPT-4 generated motion descriptions that capture fine-grained motion descriptions of activities and apply them to three action datasets. We evaluated several video-text models on the task of retrieval of motion descriptions. We found that they fall far behind human expert performance on two action datasets, raising the question of whether video-text models understand motion in videos. To address it, we introduce a method of improving motion understanding in video-text models by utilizing motion descriptions. This method proves to be effective on two action datasets for the motion description retrieval task. The results draw attention to the need for quality captions involving fine-grained motion information in existing datasets and demonstrate the effectiveness of the proposed pipeline in understanding fine-grained motion during video-text retrieval.
- 2024.findings-acl.747
+ 2024.findings-acl.747
devaraj-etal-2024-diving
10.18653/v1/2024.findings-acl.747
@@ -16587,7 +16587,7 @@
StephenBachComputer Science Department, Brown University and Snorkel AI
12585-12611
We introduce Bonito, an open-source model for conditional task generation that converts unannotated text into task-specific training datasets for instruction tuning. We aim to enable zero-shot task adaptation of large language models on users’ specialized, private data. We train Bonito by fine-tuning a pretrained large language model on a new large-scale dataset with 1.65M examples created by remixing existing instruction tuning datasets into meta-templates. The meta-templates for a dataset produce training examples where the input is the unannotated text and the task attribute and the output consists of the instruction and the response. We use Bonito to generate synthetic tasks for seven datasets from specialized domains with unannotated text across three task types—yes-no question answering, extractive question answering, and natural language inference—and adapt language models. We show that Bonito significantly improves the average performance of pretrained and instruction tuned models over the de facto self supervised baseline. For example, adapting Mistral-Instruct-v2 and instruction tuned variants of Mistral and Llama2 with Bonito improves the strong zero-shot performance by 22.1 F1 points whereas the next word prediction objective undoes some of the benefits of instruction tuning and reduces the average performance by 0.8 F1 points. We conduct additional experiments with Bonito to understand the effects of the domain, the size of the training set, and the choice of alternative synthetic task generators. Overall, we show that learning with synthetic instruction tuning datasets is an effective way to adapt language models to new domains. The model, dataset, and code are available at https://github.com/BatsResearch/bonito.
- 2024.findings-acl.748
+ 2024.findings-acl.748
nayak-etal-2024-learning
10.18653/v1/2024.findings-acl.748
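To make the meta-template idea in the abstract above concrete, here is a minimal sketch of how an existing annotated pair might be remixed so that the input carries the unannotated text plus a task attribute and the output carries the instruction and response. The tag tokens and function names are invented for illustration; the real templates live in the BatsResearch/bonito repository.

```python
# Hypothetical sketch of a Bonito-style meta-template (invented names).
from dataclasses import dataclass

@dataclass
class TrainingExample:
    model_input: str   # unannotated text plus a task-type attribute
    model_output: str  # instruction paired with its response

def apply_meta_template(text: str, task_type: str,
                        instruction: str, response: str) -> TrainingExample:
    """Remix an annotated pair into the meta-template format: the model
    learns to map (text, task type) -> (instruction, response)."""
    return TrainingExample(
        model_input=f"<|task|> {task_type} <|context|> {text}",
        model_output=f"<|instruction|> {instruction} <|response|> {response}",
    )

example = apply_meta_template(
    text="The mitochondria is the powerhouse of the cell.",
    task_type="yes-no question answering",
    instruction="Is the mitochondria the powerhouse of the cell?",
    response="Yes.",
)
print(example.model_input)
print(example.model_output)
```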
@@ -16602,7 +16602,7 @@
DimitraVergyri
12612-12627
Paraphrasing of offensive content is a better alternative to content removal and helps improve civility in a communication environment. Supervised paraphrasers, however, rely heavily on large quantities of labelled data to help preserve meaning and intent. They also often retain a large portion of the offensiveness of the original content, which raises questions about their overall usability. In this paper we aim to assist practitioners in developing usable paraphrasers by exploring In-Context Learning (ICL) with large language models (LLMs), i.e., using a limited number of input-label demonstration pairs to guide the model in generating desired outputs for specific queries. Our study focuses on key factors such as the number and order of demonstrations, exclusion of prompt instruction, and reduction in measured toxicity. We perform a principled evaluation on three datasets, including our proposed Context-Aware Polite Paraphrase (CAPP) dataset, comprising dialogue-style rude utterances, polite paraphrases, and additional dialogue context. We evaluate our approach using four closed source and one open source LLM. Our results reveal that ICL is comparable to supervised methods in generation quality, while being qualitatively better by 25% on human evaluation and attaining lower toxicity by 76%. Also, ICL-based paraphrasers only show a slight reduction in performance even with just 10% training data.
- 2024.findings-acl.749
+ 2024.findings-acl.749
som-etal-2024-demonstrations
10.18653/v1/2024.findings-acl.749
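The abstract's key knobs (number and order of demonstrations, and whether to include a prompt instruction) reduce to straightforward prompt assembly. A minimal sketch with invented demonstrations; the real study uses curated CAPP data and several LLM APIs:

```python
# Assemble an in-context learning prompt from demonstration pairs.
def build_icl_prompt(demonstrations, query, instruction=None):
    parts = []
    if instruction:                      # the study also ablates dropping this
        parts.append(instruction)
    for rude, polite in demonstrations:  # count and order are the key factors
        parts.append(f"Rude: {rude}\nPolite: {polite}")
    parts.append(f"Rude: {query}\nPolite:")
    return "\n\n".join(parts)

demos = [
    ("Stop wasting my time.", "I would appreciate it if we could stay on topic."),
    ("That idea is useless.", "I think that idea may need some refinement."),
]
print(build_icl_prompt(demos, "Nobody asked for your opinion.",
                       instruction="Rewrite the rude utterance politely."))
```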
@@ -16615,7 +16615,7 @@
RitwikBanerjeeState University of New York, Stony Brook
12628-12643
Whataboutism, a potent tool for disrupting narratives and sowing distrust, remains under-explored in quantitative NLP research. Moreover, past work has not distinguished its use as a strategy for misinformation and propaganda from its use as a tool for pragmatic and semantic framing. We introduce new datasets from Twitter/X and YouTube, revealing overlaps as well as distinctions between whataboutism, propaganda, and the tu quoque fallacy. Furthermore, drawing on recent work in linguistic semantics, we differentiate the ‘what about’ lexical construct from whataboutism. Our experiments bring to light unique challenges in its accurate detection, prompting the introduction of a novel method using attention weights for negative sample mining. We report significant improvements of 4% and 10% over previous state-of-the-art methods in our Twitter and YouTube collections, respectively.
- 2024.findings-acl.750
+ 2024.findings-acl.750
phi-etal-2024-paying
10.18653/v1/2024.findings-acl.750
@@ -16626,7 +16626,7 @@
JamesThorneKAIST
12644-12669
This paper investigates the inherent knowledge in language models from the perspective of epistemological holism. The purpose of this paper is to explore whether LLMs exhibit characteristics consistent with epistemological holism. These characteristics suggest that core knowledge, such as commonsense, general, and specific knowledge, each plays a specific role, serving as the foundation of our knowledge system and being difficult to revise. To assess these traits related to holism, we created a scientific reasoning dataset and examined the epistemology of language models through three tasks: Abduction, Revision, and Argument Generation. In the abduction task, the language models explained situations while avoiding revising the core knowledge. However, in other tasks, the language models were revealed not to distinguish between core and peripheral knowledge, showing an incomplete alignment with holistic knowledge principles.
- 2024.findings-acl.751
+ 2024.findings-acl.751
kim-thorne-2024-epistemology
10.18653/v1/2024.findings-acl.751
@@ -16637,7 +16637,7 @@
NicholasAsherCNRS
12670-12687
Despite great performance on many tasks, language models (LMs) still struggle with reasoning, sometimes providing responses that cannot possibly be true because they stem from logical incoherence. We call such responses strong hallucinations and prove that they follow from an LM’s computation of its internal representations for logical operators and outputs from those representations. Focusing on negation, we provide a novel solution in which negation is treated not as another element of a latent representation, but as an operation over an LM’s latent representations that constrains how they may evolve. We show that our approach improves model performance in cloze prompting and natural language inference tasks with negation without requiring training on sparse negative data.
- 2024.findings-acl.752
+ 2024.findings-acl.752
bhar-asher-2024-strong
10.18653/v1/2024.findings-acl.752
@@ -16649,7 +16649,7 @@
ChenghuaLinUniversity of Manchester
12688-12701
Automatic evaluation of generated textual content presents an ongoing challenge within the field of NLP. Given the impressive capabilities of modern language models (LMs) across diverse NLP tasks, there is a growing trend to employ these models in creating innovative evaluation metrics for automated assessment of generation tasks. This paper investigates a pivotal question: Do language model-driven evaluation metrics inherently exhibit bias favoring texts generated by the same underlying language model? Specifically, we assess whether prominent LM-based evaluation metrics (e.g. BARTScore, T5Score, and GPTScore) demonstrate a favorable bias toward their respective underlying LMs in the context of summarization tasks. Our findings unveil a latent bias, particularly pronounced when such evaluation metrics are used in a reference-free manner without leveraging gold summaries. These results underscore that assessments provided by generative evaluation models can be influenced by factors beyond the inherent text quality, highlighting the necessity of developing more reliable evaluation protocols in the future.
- 2024.findings-acl.753
+ 2024.findings-acl.753
liu-etal-2024-llms-narcissistic
10.18653/v1/2024.findings-acl.753
@@ -16663,7 +16663,7 @@
JakobFoersterUniversity of Oxford, University of Oxford
12702-12716
Benchmarks have been essential for driving progress in machine learning. A better understanding of LLM capabilities on real world tasks is vital for safe development. Designing adequate LLM benchmarks is challenging: data from real-world tasks is hard to collect, public availability of static evaluation data results in test data contamination and benchmark overfitting, and periodically generating new evaluation data is tedious and may result in temporally inconsistent results. We introduce HelloFresh, based on continuous streams of real-world data generated by intrinsically motivated human labelers. It covers recent events from X (formerly Twitter) community notes and edits of Wikipedia pages, mitigating the risk of test data contamination and benchmark overfitting. Any X user can propose an X note to add additional context to a misleading post (formerly tweet); if the community classifies it as helpful, it is shown with the post. Similarly, Wikipedia relies on community-based consensus, allowing users to edit articles or revert edits made by other users. Verifying whether an X note is helpful or whether a Wikipedia edit should be accepted are hard tasks that require grounding by querying the web. We backtest state-of-the-art LLMs supplemented with simple web search access and find that HelloFresh yields a temporally consistent ranking. To enable continuous evaluation on HelloFresh, we host a public leaderboard and periodically updated evaluation data at https://tinyurl.com/hello-fresh-LLM.
- 2024.findings-acl.754
+ 2024.findings-acl.754
franzmeyer-etal-2024-hellofresh
10.18653/v1/2024.findings-acl.754
@@ -16677,7 +16677,7 @@
ChittaBaralArizona State University
12717-12733
This study explores the sycophantic tendencies of Large Language Models (LLMs), where these models tend to provide answers that match what users want to hear, even if they are not entirely correct. The motivation behind this exploration stems from the common behavior observed in individuals searching the internet for facts with partial or misleading knowledge. Similar to using web search engines, users may recall fragments of misleading keywords and submit them to an LLM, hoping for a comprehensive response. Our empirical analysis of several LLMs shows the potential danger of these models amplifying misinformation when presented with misleading keywords. Additionally, we thoroughly assess four existing hallucination mitigation strategies to reduce LLMs’ sycophantic behavior. Our experiments demonstrate the effectiveness of these strategies for generating factually correct statements. Furthermore, our analyses delve into knowledge-probing experiments on factual keywords and different categories of sycophancy mitigation.
- 2024.findings-acl.755
+ 2024.findings-acl.755
rrv-etal-2024-chaos
10.18653/v1/2024.findings-acl.755
@@ -16690,18 +16690,18 @@
KyuminLeeWorcester Polytechnic Institute
12734-12751
With the capabilities of understanding and executing natural language instructions, large language models (LLMs) can potentially act as a powerful tool for textual data augmentation. However, the quality of augmented data depends heavily on the augmentation instructions provided, and the effectiveness can fluctuate across different downstream tasks. While manually crafting and selecting instructions can offer some improvement, this approach faces scalability and consistency issues in practice due to the diversity of downstream tasks. In this work, we address these limitations by proposing a new solution, which can automatically generate a large pool of augmentation instructions and select the most suitable task-informed instructions, thereby empowering LLMs to create high-quality augmented data for different downstream tasks. Empirically, the proposed approach consistently generates augmented data with better quality compared to non-LLM and LLM-based data augmentation methods, leading to the best performance on 26 few-shot learning tasks sourced from a wide range of application domains.
- 2024.findings-acl.756
+ 2024.findings-acl.756
li-etal-2024-empowering
10.18653/v1/2024.findings-acl.756
Choose Your Transformer: Improved Transferability Estimation of Transformer Models on Classification Tasks
- LukasGarbas
+ LukasGarbaciauskas
MaxPlonerHumboldt Universität Berlin
AlanAkbikHumboldt Universität Berlin
12752-12768
There currently exists a multitude of pre-trained transformer language models (LMs) that are readily available. From a practical perspective, this raises the question of which pre-trained LM will perform best if fine-tuned for a specific downstream NLP task. However, exhaustively fine-tuning all available LMs to determine the best-fitting model is computationally infeasible. To address this problem, we present an approach that inexpensively estimates a ranking of the expected performance of a given set of candidate LMs for a given task. Following a layer-wise representation analysis, we extend existing approaches such as H-score and LogME by aggregating representations across all layers of the transformer model. We present an extensive analysis of 20 transformer LMs, 6 downstream NLP tasks, and various estimators (linear probing, kNN, H-score, and LogME). Our evaluation finds that averaging the layer representations significantly improves the Pearson correlation coefficient between the true model ranks and the estimate, increasing from 0.58 to 0.86 for LogME and from 0.65 to 0.88 for H-score.
- 2024.findings-acl.757
+ 2024.findings-acl.757
garbaciauskas-etal-2024-choose
10.18653/v1/2024.findings-acl.757
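The core move in the abstract above, aggregating representations across all layers before scoring, can be sketched in a few lines. Linear probing stands in for LogME and H-score here, and the random arrays are placeholders for real hidden states, so the printed numbers carry no meaning:

```python
# Sketch: score a candidate LM with a cheap transfer estimator computed on
# representations averaged across ALL layers rather than just the last one.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
n_examples, n_layers, dim = 200, 12, 64
# hidden_states[i, l] ~ layer-l representation of example i (placeholder data)
hidden_states = rng.normal(size=(n_examples, n_layers, dim))
labels = rng.integers(0, 2, size=n_examples)

def probe_score(features: np.ndarray, labels: np.ndarray) -> float:
    """Cheap transferability estimate: cross-validated linear-probe accuracy."""
    return cross_val_score(LogisticRegression(max_iter=1000),
                           features, labels, cv=3).mean()

last_layer_score = probe_score(hidden_states[:, -1, :], labels)
layer_avg_score = probe_score(hidden_states.mean(axis=1), labels)
print(f"last layer: {last_layer_score:.3f}  layer average: {layer_avg_score:.3f}")
```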
@@ -16717,7 +16717,7 @@
NanyunPengUniversity of California, Los Angeles
12769-12781
Event linking connects event mentions in text with relevant nodes in a knowledge base (KB). Prior research in event linking has mainly borrowed methods from entity linking, overlooking the distinct features of events. Compared to the extensively explored entity linking task, events have more complex structures and can be more effectively distinguished by examining their associated arguments. Moreover, the information-rich nature of events leads to the scarcity of event KBs. This emphasizes the need for event linking models to identify and classify event mentions not in the KB as “out-of-KB,” an area that has received limited attention. In this work, we tackle these challenges by introducing an argument-aware approach. First, we improve event linking models by augmenting input text with tagged event argument information, facilitating the recognition of key information about event mentions. Subsequently, to help the model handle “out-of-KB” scenarios, we synthesize out-of-KB training examples from in-KB instances through controlled manipulation of event arguments. Our experiment across two test datasets showed significant enhancements in both in-KB and out-of-KB scenarios, with a notable 22% improvement in out-of-KB evaluations.
- 2024.findings-acl.758
+ 2024.findings-acl.758
hsu-etal-2024-argument
10.18653/v1/2024.findings-acl.758
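Both ideas in the abstract above, tagging argument spans in the input and synthesizing out-of-KB negatives by perturbing arguments, are simple text operations. A hedged sketch with an invented example and tag scheme, not the authors' code:

```python
# Sketch of argument-aware input tagging and out-of-KB negative synthesis.
import random

random.seed(0)

def tag_arguments(text: str, arguments: dict) -> str:
    """Wrap each argument span with a role tag so the linker can see it."""
    for role, span in arguments.items():
        text = text.replace(span, f"<{role}> {span} </{role}>")
    return text

def make_out_of_kb_example(arguments: dict, replacement_pool: dict) -> dict:
    """Perturb one argument so the mention no longer matches any KB node."""
    corrupted = dict(arguments)
    role = random.choice(list(corrupted))
    corrupted[role] = random.choice(replacement_pool[role])
    return corrupted

mention = "Armstrong landed on the Moon in 1969."
args = {"agent": "Armstrong", "location": "the Moon", "time": "1969"}
print(tag_arguments(mention, args))
print(make_out_of_kb_example(args, {"agent": ["Aldrin"],
                                    "location": ["Mars"],
                                    "time": ["1972"]}))
```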
@@ -16733,7 +16733,7 @@
TomasPfisterGoogle
12782-12803
Grounded generation aims to equip language models (LMs) with the ability to produce more credible and accountable responses by accurately citing verifiable sources. However, existing methods, by either feeding LMs with raw or preprocessed materials, remain prone to errors. To address this, we introduce CaLM, a novel verification framework. CaLM leverages the insight that a robust grounded response should be consistent with information derived solely from its cited sources. Our framework empowers smaller LMs, which rely less on parametric memory and excel at processing relevant information given a query, to validate the output of larger LMs. Larger LM responses that closely align with the smaller LMs’ output, which relies exclusively on cited documents, are verified. Responses showing discrepancies are iteratively refined through a feedback loop. Experiments on three open-domain question-answering datasets demonstrate significant performance gains of 1.5% to 7% absolute average without any required model fine-tuning.
- 2024.findings-acl.759
+ 2024.findings-acl.759
hsu-etal-2024-calm
10.18653/v1/2024.findings-acl.759
@@ -16751,7 +16751,7 @@
HengJiUniversity of Illinois, Urbana-Champaign
12804-12825
Event extraction has gained considerable interest due to its wide-ranging applications. However, recent studies draw attention to evaluation issues, suggesting that reported scores may not accurately reflect the true performance. In this work, we identify and address evaluation challenges, including inconsistency due to varying data assumptions or preprocessing steps, the insufficiency of current evaluation frameworks that may introduce dataset or data split bias, and the low reproducibility of some previous approaches. To address these challenges, we present TextEE, a standardized, fair, and reproducible benchmark for event extraction. TextEE comprises standardized data preprocessing scripts and splits for 16 datasets spanning eight diverse domains and includes 14 recent methodologies, conducting a comprehensive benchmark reevaluation. We also evaluate five varied large language models on our TextEE benchmark and demonstrate how they struggle to achieve satisfactory performance. Inspired by our reevaluation results and findings, we discuss the role of event extraction in the current NLP era, as well as future challenges and insights derived from TextEE. We believe TextEE, the first standardized comprehensive benchmarking tool, will significantly facilitate future event extraction research.
- 2024.findings-acl.760
+ 2024.findings-acl.760
huang-etal-2024-textee
10.18653/v1/2024.findings-acl.760
@@ -16765,7 +16765,7 @@
HannaWallachMicrosoft
12826-12833
This paper examines the experiences of African American Language (AAL) speakers when using language technologies. Previous work has used quantitative methods to uncover performance disparities between AAL speakers and White Mainstream English speakers when using language technologies, but has not sought to understand the impacts of these performance disparities on AAL speakers. Through interviews with 19 AAL speakers, we focus on understanding such impacts in a contextualized and human-centered manner. We find that AAL speakers often undertake invisible labor of adapting their speech patterns to successfully use language technologies, and they make connections between failures of language technologies for AAL speakers and a lack of inclusion of AAL speakers in language technology design processes and datasets. Our findings suggest that NLP researchers and practitioners should invest in developing contextualized and human-centered evaluations of language technologies that seek to understand the impacts of performance disparities on speakers of underrepresented languages and language varieties.
- 2024.findings-acl.761
+ 2024.findings-acl.761
cunningham-etal-2024-understanding
10.18653/v1/2024.findings-acl.761
@@ -16781,7 +16781,7 @@
XiangYueCarnegie Mellon University
12834-12859
The introduction of large language models has significantly advanced code generation. However, open-source models often lack the execution capabilities and iterative refinement of advanced systems like the GPT-4 Code Interpreter. To address this, we introduce OpenCodeInterpreter, a family of open-source code systems designed for generating, executing, and iteratively refining code. Supported by Code Feedback, a dataset featuring 68K multi-turn interactions, OpenCodeInterpreter integrates execution and human feedback for dynamic code refinement. Our comprehensive evaluation of OpenCodeInterpreter across key benchmarks such as HumanEval, MBPP, and their enhanced versions from EvalPlus reveals its exceptional performance. Notably, OpenCodeInterpreter-33B achieves an accuracy of 83.2 (76.4) on the average (and plus versions) of HumanEval and MBPP, closely rivaling GPT-4’s 84.2 (76.2) and further elevates to 91.6 (84.6) with synthesized human feedback from GPT-4. OpenCodeInterpreter bridges the gap between open-source code generation models and proprietary systems like GPT-4 Code Interpreter.
- 2024.findings-acl.762
+ 2024.findings-acl.762
zheng-etal-2024-opencodeinterpreter
10.18653/v1/2024.findings-acl.762
@@ -16793,7 +16793,7 @@
DiyiYangStanford University
12860-12877
Information Retrieval (IR) systems are designed to deliver relevant content, but traditional systems may not optimize rankings for fairness, neutrality, or the balance of ideas. Consequently, IR can often introduce indexical biases, or biases in the positional order of documents. Although indexical bias can demonstrably affect people’s opinion, voting patterns, and other behaviors, these issues remain understudied as the field lacks reliable metrics and procedures for automatically measuring indexical bias. Towards this end, we introduce the PAIR framework, which supports automatic bias audits for ranked documents or entire IR systems. After introducing DUO, the first general-purpose automatic bias metric, we run an extensive evaluation of 8 IR systems on a new corpus of 32k synthetic and 4.7k natural documents, with 4k queries spanning 1.4k controversial issue topics. A human behavioral study validates our approach, showing that our bias metric can help predict when and how indexical bias will shift a reader’s opinion.
- 2024.findings-acl.763
+ 2024.findings-acl.763
ziems-etal-2024-measuring
10.18653/v1/2024.findings-acl.763
@@ -16814,7 +16814,7 @@
MagedAl-shaibaniKing Fahad University of Petroleum and Minerals
12878-12901
Instruction tuning has emerged as a prominent methodology for teaching Large Language Models (LLMs) to follow instructions. However, current instruction datasets predominantly cater to English or are derived from English-dominated LLMs, leading to inherent biases toward Western culture. This bias negatively impacts non-English languages such as Arabic and the unique culture of the Arab region. This paper addresses this limitation by introducing CIDAR, the first open Arabic instruction-tuning dataset culturally aligned by native Arabic speakers. CIDAR contains 10,000 instruction and output pairs that represent the Arab region. We discuss the cultural relevance of CIDAR via analysis of and comparison to a few models fine-tuned on other datasets. Our experiments indicate that models fine-tuned on CIDAR achieve better cultural alignment compared to those fine-tuned on 30x more data.
- 2024.findings-acl.764
+ 2024.findings-acl.764
alyafeai-etal-2024-cidar
10.18653/v1/2024.findings-acl.764
@@ -16833,7 +16833,7 @@
CurtisLanglotzStanford University
12902-12915
In order to enable extraction of structured clinical data from unstructured radiology reports, we introduce RadGraph-XL, a large-scale, expert-annotated dataset for clinical entity and relation extraction. RadGraph-XL consists of 2,300 radiology reports, which are annotated with over 410,000 entities and relations by board-certified radiologists. Whereas previous approaches focus solely on chest X-rays, RadGraph-XL includes data from four anatomy-modality pairs: chest CT, abdomen/pelvis CT, brain MR, and chest X-rays. Then, in order to automate structured information extraction, we use RadGraph-XL to train transformer-based models for clinical entity and relation extraction. Our evaluations include comprehensive ablation studies as well as an expert reader study that evaluates trained models on out-of-domain data. Results demonstrate that our model surpasses the performance of previous methods by up to 52% and notably outperforms GPT-4 in this domain. We release RadGraph-XL as well as our trained model to foster further innovation and research in structured clinical information extraction.
- 2024.findings-acl.765
+ 2024.findings-acl.765
delbrouck-etal-2024-radgraph
10.18653/v1/2024.findings-acl.765
@@ -16844,7 +16844,7 @@
GaneshRamakrishnanIndian Institute of Technology Bombay, Indian Institute of Technology Bombay
12916-12934
Instruction Tuning involves finetuning a language model on a collection of instruction-formatted datasets in order to enhance the generalizability of the model to unseen tasks. Studies have shown the importance of balancing different task proportions during finetuning, but finding the right balance remains challenging. Unfortunately, there’s currently no systematic method beyond manual tuning or relying on practitioners’ intuition. In this paper, we introduce SMART (Submodular data Mixture strAtegy for instRuction Tuning) — a novel data mixture strategy which makes use of a submodular function to assign importance scores to tasks which are then used to determine the mixture weights. Given a fine-tuning budget, SMART redistributes the budget among tasks and selects non-redundant samples from each task. Experimental results demonstrate that SMART significantly outperforms traditional methods such as examples proportional mixing and equal mixing. Furthermore, SMART facilitates the creation of data mixtures based on a few representative subsets of tasks alone and through task pruning analysis, we reveal that in a limited budget setting, allocating budget among a subset of representative tasks yields superior performance compared to distributing the budget among all tasks. The code for reproducing our results is open-sourced at https://github.com/kowndinya-renduchintala/SMART.
- 2024.findings-acl.766
+ 2024.findings-acl.766
renduchintala-etal-2024-smart
10.18653/v1/2024.findings-acl.766
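As a rough illustration of turning a submodular function into task importance scores, here is a greedy facility-location sketch over placeholder task embeddings, where normalized marginal gains become mixture weights. The actual SMART formulation differs in its details; everything below is invented for illustration:

```python
# Greedy facility-location gains as task importance scores (toy sketch).
import numpy as np

rng = np.random.default_rng(0)
tasks = ["qa", "nli", "summarization", "paraphrase"]
emb = rng.normal(size=(4, 8))          # placeholder task embeddings
sim = emb @ emb.T                      # task-task similarity

def gain(selected: list, candidate: int) -> float:
    """Facility-location marginal gain of adding `candidate`."""
    cover = sim[:, selected].max(axis=1) if selected else np.zeros(len(sim))
    return float(np.maximum(sim[:, candidate], cover).sum() - cover.sum())

selected, gains = [], {}
for _ in tasks:                        # greedy selection records each gain
    best = max((i for i in range(len(tasks)) if i not in selected),
               key=lambda i: gain(selected, i))
    gains[best] = gain(selected, best)
    selected.append(best)

total = sum(gains.values())
weights = {tasks[i]: g / total for i, g in gains.items()}
print(weights)                         # mixture weights from importance scores
```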
@@ -16860,7 +16860,7 @@
KhyathiChandu
12935-12948
Selective prediction minimizes incorrect predictions from vision-language models (VLMs) by allowing them to abstain from answering when uncertain. However, when deploying a vision-language system with low tolerance for inaccurate predictions, selective prediction may be over-cautious and abstain too frequently, even on many correct predictions. We introduce ReCoVERR, an inference-time algorithm to reduce the over-abstention of a selective vision-language system without increasing the error rate of the system’s predictions. When the VLM makes a low-confidence prediction, instead of abstaining, ReCoVERR tries to find relevant clues in the image that provide additional evidence for the prediction. ReCoVERR uses an LLM to pose related questions to the VLM, collects high-confidence evidence, and if enough evidence confirms the prediction, the system makes a prediction instead of abstaining. ReCoVERR enables three VLMs (BLIP2, InstructBLIP and LLaVA-1.5) to answer up to 20% more questions on the VQAv2 and A-OKVQA tasks without decreasing system accuracy, thus improving overall system reliability. Our code is available at https://github.com/tejas1995/ReCoVERR.
- 2024.findings-acl.767
+ 2024.findings-acl.767
srinivasan-etal-2024-selective
10.18653/v1/2024.findings-acl.767
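The control flow the abstract describes is compact: answer directly when confident, otherwise gather evidence before deciding between answering and abstaining. A sketch with stand-in model calls and an invented confidence threshold, not the released implementation:

```python
# Sketch of a ReCoVERR-style answer-or-abstain loop (placeholder models).
def vlm_answer(question: str) -> tuple[str, float]:
    return "a red bus", 0.55          # (prediction, confidence) placeholder

def ask_followup(question: str) -> float:
    return 0.9                        # confidence that a clue holds (placeholder)

def recoverr_style_predict(question, tau=0.8, evidence_needed=2):
    answer, conf = vlm_answer(question)
    if conf >= tau:
        return answer                 # confident enough: answer directly
    clues = ["Is the vehicle large?", "Does it have many windows?"]
    support = sum(1 for c in clues if ask_followup(c) >= tau)
    # answer instead of abstaining only if enough high-confidence evidence
    return answer if support >= evidence_needed else "ABSTAIN"

print(recoverr_style_predict("What vehicle is in the image?"))
```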
@@ -16871,7 +16871,7 @@
ConstantineLignosBrandeis University
12949-12956
We investigate ways of using monolingual data in both the source and target languages for improving low-resource machine translation. As a case study, we experiment with translation from Finnish to Northern Sámi. Our experiments show that while conventional backtranslation remains a strong contender, using synthetic target-side data when training backtranslation models can be helpful as well. We also show that monolingual data can be used to train a language model which can act as a regularizer without any augmentation of parallel data.
- 2024.findings-acl.768
+ 2024.findings-acl.768
saleva-lignos-2024-language
10.18653/v1/2024.findings-acl.768
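For reference, conventional backtranslation, the strong contender mentioned above, amounts to pairing authentic target-side monolingual sentences with machine-translated synthetic sources. A toy sketch with a placeholder reverse-direction model; the Sámi strings are illustrative only:

```python
# Sketch of backtranslation for low-resource MT (placeholder reverse model).
def translate_to_source(target_sentence: str) -> str:
    return f"<synthetic-fi> {target_sentence}"   # stand-in reverse MT

def backtranslate(monolingual_target: list[str]) -> list[tuple[str, str]]:
    """Pair each authentic target sentence with a synthetic source sentence."""
    return [(translate_to_source(t), t) for t in monolingual_target]

sami_monolingual = ["Buorre beaivi!", "Mun lean oahppi."]
for src, tgt in backtranslate(sami_monolingual):
    print(src, "->", tgt)
```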
@@ -16881,7 +16881,7 @@
MuraliAnnavaramUniversity of Southern California
12957-12968
Large Language Models (LLMs) are achieving state-of-the-art performance in many different downstream tasks. However, the increasing urgency of data privacy puts pressure on practitioners to train LLMs with Differential Privacy (DP) on private data. Concurrently, the exponential growth in parameter size of LLMs necessitates model compression before deployment of LLMs on resource-constrained devices or latency-sensitive applications. Differential privacy and model compression generally must trade off utility loss to achieve their objectives. Moreover, simultaneously applying both schemes can compound the utility degradation. To this end, we propose DistilDP: a novel differentially private knowledge distillation algorithm that exploits synthetic data generated by a differentially private teacher LLM. The knowledge of a teacher LLM is transferred onto the student in two ways: one from the synthetic data itself (the hard labels), and the other from the output distribution of the teacher evaluated on the synthetic data (the soft labels). Furthermore, if the teacher and student share a similar architectural structure, we can further distill knowledge by aligning the hidden representations between both. Our experimental results demonstrate that DistilDP can substantially improve the utility over existing baselines, by at least 9.0 PPL on the Big Patent dataset, with strong privacy parameters, ε=2. These promising results progress privacy-preserving compression of autoregressive LLMs. Our code can be accessed here: https://github.com/james-flemings/dp_compress.
- 2024.findings-acl.769
+ 2024.findings-acl.769
flemings-annavaram-2024-differentially
10.18653/v1/2024.findings-acl.769
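The two transfer signals named in the abstract, hard labels from the DP-generated synthetic text and soft labels from the teacher's output distribution, combine into a familiar distillation loss. A plain-numpy sketch; a real setup would use a DP-trained teacher and token-level logits:

```python
# Sketch of a hard-label + soft-label distillation loss (toy logits).
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def distillation_loss(student_logits, teacher_logits, hard_labels, alpha=0.5):
    p_student = softmax(student_logits)
    p_teacher = softmax(teacher_logits)
    # hard-label term: cross-entropy against tokens of the synthetic data
    hard = -np.log(p_student[np.arange(len(hard_labels)), hard_labels]).mean()
    # soft-label term: KL(teacher || student) on the synthetic data
    soft = (p_teacher * (np.log(p_teacher) - np.log(p_student))).sum(-1).mean()
    return alpha * hard + (1 - alpha) * soft

rng = np.random.default_rng(0)
s, t = rng.normal(size=(4, 10)), rng.normal(size=(4, 10))
print(distillation_loss(s, t, hard_labels=np.array([1, 3, 5, 7])))
```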
@@ -16896,21 +16896,21 @@
DavidWaddenAllen Institute for Artificial Intelligence
12969-12990
Large language models (LLMs) adapted to follow user instructions are now widely deployed as conversational agents. In this work, we examine one increasingly common instruction-following task: providing writing assistance to compose a long-form answer. To evaluate the capabilities of current LLMs on this task, we construct KIWI, a dataset of knowledge-intensive writing instructions in the scientific domain. Given a research question, an initial model-generated answer and a set of relevant papers, an expert annotator iteratively issues instructions for the model to revise and improve its answer. We collect 1,260 interaction turns from 234 interaction sessions with three state-of-the-art LLMs. Each turn includes a user instruction, a model response, and a human evaluation of the model response. Through a detailed analysis of the collected responses, we find that all models struggle to incorporate new information into an existing answer, and to perform precise and unambiguous edits. Further, we find that models struggle to judge whether their outputs successfully followed user instructions, with accuracy at least 10 points short of human agreement. Our findings indicate that KIWI will be a valuable resource to measure progress and improve LLMs’ instruction-following capabilities for knowledge-intensive writing tasks.
- 2024.findings-acl.770
+ 2024.findings-acl.770
xu-etal-2024-kiwi
10.18653/v1/2024.findings-acl.770
XL-HeadTags: Leveraging Multimodal Retrieval Augmentation for the Multilingual Generation of News Headlines and Tags
- Faisal TarequeShohan
+ FaisalShohan
Mir TafseerNayeemUniversity of Alberta
SamsulIslam
Abu UbaidaAkash
ShafiqJotySalesForce.com and Nanyang Technological University
12991-13024
Millions of news articles published online daily can overwhelm readers. Headlines and entity (topic) tags are essential for guiding readers to decide if the content is worth their time. While headline generation has been extensively studied, tag generation remains largely unexplored, yet it offers readers better access to topics of interest. The need for conciseness in capturing readers’ attention necessitates improved content selection strategies for identifying salient and relevant segments within lengthy articles, thereby guiding language models effectively. To address this, we propose to leverage auxiliary information such as images and captions embedded in the articles to retrieve relevant sentences and utilize instruction tuning with variations to generate both headlines and tags for news articles in a multilingual context. To make use of the auxiliary information, we have compiled a dataset named XL-HeadTags, which includes 20 languages across 6 diverse language families. Through extensive evaluation, we demonstrate the effectiveness of our plug-and-play multimodal-multilingual retrievers for both tasks. Additionally, we have developed a suite of tools for processing and evaluating multilingual texts, significantly contributing to the research community by enabling more accurate and efficient analysis across languages.
- 2024.findings-acl.771
+ 2024.findings-acl.771
shohan-etal-2024-xl
10.18653/v1/2024.findings-acl.771
@@ -16928,7 +16928,7 @@
DongYuTencent AI Lab
13025-13048
This paper introduces the Decomposed Requirements Following Ratio (DRFR), a new metric for evaluating Large Language Models’ (LLMs) ability to follow instructions. Addressing a gap in current methodologies, DRFR breaks down complex instructions into simpler criteria, facilitating a detailed analysis of LLMs’ compliance with various aspects of tasks. Alongside this metric, we present InFoBench, a benchmark comprising 500 diverse instructions and 2,250 decomposed questions across multiple constraint categories. Our experiments compare DRFR with traditional scoring methods and explore annotation sources, including human experts, crowd-sourced workers, and GPT-4. The findings demonstrate DRFR’s higher reliability and the effectiveness of using GPT-4 as a cost-efficient annotator. The evaluation of several advanced LLMs using this framework reveals their strengths and areas needing improvement, particularly in complex instruction-following. This study contributes a novel metric and benchmark, offering insights for future LLM development and evaluation.
- 2024.findings-acl.772
+ 2024.findings-acl.772
qin-etal-2024-infobench
10.18653/v1/2024.findings-acl.772
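DRFR itself is a simple ratio once an instruction has been decomposed into yes/no criteria. A sketch with a trivial keyword judge standing in for the human or GPT-4 annotators used in the paper; the criteria and response are invented:

```python
# Sketch of the Decomposed Requirements Following Ratio (DRFR).
def judge(response: str, criterion: str) -> bool:
    """Placeholder yes/no judge; the paper uses humans or GPT-4 here."""
    return criterion.lower() in response.lower()

def drfr(response: str, criteria: list[str]) -> float:
    """Fraction of decomposed requirements the response satisfies."""
    satisfied = sum(1 for c in criteria if judge(response, c))
    return satisfied / len(criteria)

criteria = ["haiku", "mentions autumn", "three lines"]
response = "Here is a haiku about autumn, written in three lines: ..."
print(f"DRFR = {drfr(response, criteria):.2f}")
```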
@@ -16941,7 +16941,7 @@
VagelisHristidisUniversity of California, Riverside
13049-13063
Large Language Models (LLMs) have achieved state-of-the-art performance in text re-ranking. This process includes queries and candidate passages in the prompts, utilizing pointwise, listwise, and pairwise prompting strategies. A limitation of these ranking strategies with LLMs is their cost: the process can become expensive due to API charges, which are based on the number of input and output tokens. We study how to maximize the re-ranking performance given a budget, by navigating the vast search spaces of prompt choices, LLM APIs, and budget splits. We propose a suite of budget-constrained methods to perform text re-ranking using a set of LLM APIs. Our most efficient method, called EcoRank, is a two-layered pipeline that jointly optimizes decisions regarding budget allocation across prompt strategies and LLM APIs. Our experimental results on four popular QA and passage reranking datasets show that EcoRank outperforms other budget-aware supervised and unsupervised baselines.
- 2024.findings-acl.773
+ 2024.findings-acl.773
rashid-etal-2024-ecorank
10.18653/v1/2024.findings-acl.773
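A budget-constrained two-layer pipeline in the spirit of EcoRank can be sketched as: spend part of the budget on a cheap pointwise pass over all candidates, then the remainder on a stronger re-ranker for the head of the list. Costs, the budget split, and both scoring functions below are invented placeholders:

```python
# Sketch of a two-layered, budget-aware re-ranking pipeline.
def cheap_pointwise_score(passage: str) -> float:
    return len(set(passage.split()))          # stand-in relevance proxy

def expensive_rerank(passages: list[str]) -> list[str]:
    return sorted(passages, key=len, reverse=True)   # stand-in strong model

def budget_rerank(passages, budget, cheap_cost=0.01, strong_cost=0.1,
                  split=0.5):
    # Layer 1: cheap pointwise scores for as many passages as the split allows.
    n_cheap = min(len(passages), int(budget * split / cheap_cost))
    scored = sorted(passages[:n_cheap], key=cheap_pointwise_score, reverse=True)
    unscored = passages[n_cheap:]
    # Layer 2: spend what's left on a stronger model for the top of the list.
    n_strong = int((budget - n_cheap * cheap_cost) / strong_cost)
    head, tail = scored[:n_strong], scored[n_strong:]
    return expensive_rerank(head) + tail + unscored

docs = ["a short passage", "a much longer and wordier candidate passage",
        "medium length passage here"]
print(budget_rerank(docs, budget=0.5))
```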
@@ -16953,7 +16953,7 @@
MuhammadAbdul-MageedUniversity of British Columbia
13064-13087
We introduce FinTral, a suite of state-of-the-art multimodal large language models (LLMs) built upon the Mistral-7b model and tailored for financial analysis. FinTral integrates textual, numerical, tabular, and image data. We enhance FinTral with domain-specific pretraining, instruction fine-tuning, and RLAIF training by exploiting a large collection of textual and visual datasets we curate for this work. We also introduce an extensive benchmark featuring nine tasks and 25 datasets for evaluation, including hallucinations in the financial domain. Our FinTral model trained with direct preference optimization employing advanced Tools and Retrieval methods, dubbed FinTral-DPO-T&R, demonstrates an exceptional zero-shot performance. It outperforms ChatGPT-3.5 in all tasks and surpasses GPT-4 in five out of nine tasks, marking a significant advancement in AI-driven financial technology. We also demonstrate that FinTral has the potential to excel in real-time analysis and decision-making in diverse financial contexts.
- 2024.findings-acl.774
+ 2024.findings-acl.774
bhatia-etal-2024-fintral
10.18653/v1/2024.findings-acl.774
@@ -16974,7 +16974,7 @@
TrevorDarrellElectrical Engineering & Computer Science Department
13088-13110
Large Multimodal Models (LMM) are built across modalities and the misalignment between two modalities can result in “hallucination”, generating textual outputs that are not grounded by the multimodal information in context. To address the multimodal misalignment issue, we adapt the Reinforcement Learning from Human Feedback (RLHF) from the text domain to the vision-language alignment, where human annotators are asked to compare two responses and pinpoint the more hallucinated one, and the vision-language model is trained to maximize the simulated human rewards. We propose a new alignment algorithm called Factually Augmented RLHF that augments the reward model with additional factual information such as image captions and ground-truth multi-choice options, which alleviates the reward hacking phenomenon in RLHF and further improves the performance. We also enhance the GPT-4-generated training data (for vision instruction tuning) with previously available human-written image-text pairs to improve the general capabilities of our model. To evaluate the proposed approach in real-world scenarios, we develop a new evaluation benchmark MMHAL-BENCH with a special focus on penalizing hallucinations. As the first LMM trained with RLHF, our approach achieves remarkable improvement on the LLaVA-Bench dataset with the 96% performance level of the text-only GPT-4 (while previous best methods can only achieve the 87% level), and an improvement of 60% on MMHAL-BENCH over other baselines.
- 2024.findings-acl.775
+ 2024.findings-acl.775
sun-etal-2024-aligning
10.18653/v1/2024.findings-acl.775
@@ -16986,7 +16986,7 @@
ChittaBaralArizona State University
13111-13128
As Large Language Models (LLMs) play an increasingly pivotal role in natural language processing applications, their safety concerns become critical areas of NLP research. This has resulted in the development of various LLM defense strategies. Unfortunately, despite the shared goal of improving the safety of LLMs, the evaluation suites across various research works are disjoint and lack diverse inputs to ensure accurate and precise evaluation estimates. Furthermore, the important factor of ‘over-defensiveness’ on the safe inputs has largely remained overlooked. Addressing these limitations, this paper presents a systematic evaluation, comparison, and analysis of various LLM defense strategies over both ‘safety’ and ‘over-defensiveness’. To this end, we compile a large and diverse collection of safe and unsafe prompts, design precise evaluation methodology, and study the efficacy of various LLM defense strategies on multiple state-of-the-art LLMs. Our work reveals a number of crucial findings that we believe will pave the way and also facilitate further research in the critical area of improving the safety of LLMs.
- 2024.findings-acl.776
+ 2024.findings-acl.776
varshney-etal-2024-art
10.18653/v1/2024.findings-acl.776
@@ -16998,7 +16998,7 @@
VagelisHristidisUniversity of California, Riverside
13129-13148
Existing work on Temporal Question Answering (TQA) has predominantly focused on questions anchored to specific timestamps or events (e.g. ‘Who was the US president in 1970?’). Little work has studied questions whose temporal context is relative to the present time (e.g. ‘Who was the previous US president?’). We refer to this problem as Present-Anchored Temporal QA (PATQA). PATQA poses unique challenges: (1) large language models (LLMs) may have outdated knowledge, (2) complex temporal relationships (e.g. ‘before’, ‘previous’) are hard to reason about, (3) multi-hop reasoning may be required, and (4) the gold answers of benchmarks must be continuously updated. To address these challenges, we introduce the PAT-Questions benchmark, which includes single and multi-hop temporal questions. The answers in PAT-Questions can be automatically refreshed by re-running SPARQL queries on a knowledge graph, if available. We evaluate several state-of-the-art LLMs and a SOTA temporal reasoning model (TEMPREASON-T5) on PAT-Questions through direct prompting and retrieval-augmented generation (RAG). The results highlight the limitations of existing solutions in PATQA and motivate the need for new methods to improve PATQA reasoning capabilities.
- 2024.findings-acl.777
+ 2024.findings-acl.777
meem-etal-2024-pat
10.18653/v1/2024.findings-acl.777
@@ -17015,7 +17015,7 @@
MinlieHuangTsinghua University, Tsinghua University
13149-13162
Large language model agents have demonstrated remarkable advancements across various complex tasks. Recent works focus on optimizing the agent team or employing self-reflection to iteratively solve complex tasks. Since these agents are all based on the same LLM, only conducting self-evaluation or removing underperforming agents does not substantively enhance the capability of the agents. We argue that a comprehensive evaluation and accumulating experience from evaluation feedback is an effective approach to improving system performance. In this paper, we propose Reusable Experience Accumulation with 360° Assessment (360°REA), a hierarchical multi-agent framework inspired by corporate organizational practices. The framework employs a novel 360° performance assessment method for multi-perspective performance evaluation with fine-grained assessment. To enhance the capability of agents in addressing complex tasks, we introduce a dual-level experience pool for agents to accumulate experience through fine-grained assessment. Extensive experiments on complex task datasets demonstrate the effectiveness of 360°REA.
- 2024.findings-acl.778
+ 2024.findings-acl.778
gao-etal-2024-360
10.18653/v1/2024.findings-acl.778
@@ -17028,7 +17028,7 @@
BhuwanDhingraDuke University
13163-13175
This paper investigates the use of large language models (LLMs) for extracting sample lists of polymer nanocomposites (PNCs) from full-length materials science research papers. The challenge lies in the complex nature of PNC samples, which have numerous attributes scattered throughout the text. The complexity of annotating detailed information on PNCs limits the availability of data, making conventional document-level relation extraction techniques impractical due to the challenge in creating comprehensive named entity span annotations. To address this, we introduce a new benchmark and an evaluation technique for this task and explore different prompting strategies in a zero-shot manner. We also incorporate self-consistency to improve the performance. Our findings show that even advanced LLMs struggle to extract all of the samples from an article. Finally, we analyze the errors encountered in this process, categorizing them into three main challenges, and discuss potential strategies for future research to overcome them.
- 2024.findings-acl.779
+ 2024.findings-acl.779
khalighinejad-etal-2024-extracting
10.18653/v1/2024.findings-acl.779
@@ -17043,11 +17043,11 @@
TaibaiXu
ZeminZhang
LichanHongGoogle
- EdChiGoogle
+ Ed H.ChiGoogle
XinyangYiGoogle
13176-13188
Recent advancements have showcased the potential of Large Language Models (LLMs) in executing reasoning tasks, particularly facilitated by Chain-of-Thought (CoT) prompting. While tasks like arithmetic reasoning involve clear, definitive answers and logical chains of thought, the application of LLM reasoning in recommendation systems (RecSys) presents a distinct challenge. RecSys tasks revolve around subjectivity and personalized preferences, an under-explored domain in utilizing LLMs’ reasoning capabilities. Our study explores several aspects to better understand reasoning for RecSys and demonstrate how task quality improves by utilizing LLM reasoning for both zero-shot and fine-tuning settings. Additionally, we propose Rec-SAVER (Recommender Systems Automatic Verification and Evaluation of Reasoning) to automatically assess the quality of LLM reasoning responses without the requirement of curated gold references or human raters. We show that our framework aligns with real human judgment on the coherence and faithfulness of reasoning responses. Overall, our work shows that incorporating reasoning into RecSys can improve personalized tasks, paving the way for further advancements in recommender system methodologies.
- 2024.findings-acl.780
+ 2024.findings-acl.780
tsai-etal-2024-leveraging
10.18653/v1/2024.findings-acl.780
@@ -17059,7 +17059,7 @@
MuhammadAbdul-MageedUniversity of British Columbia
13189-13206
We address a notable gap in Natural Language Processing (NLP) by introducing a collection of resources designed to improve Machine Translation (MT) for low-resource languages, with a specific focus on African languages. First, we introduce two language models (LMs), Cheetah-1.2B and Cheetah-3.7B, with 1.2 billion and 3.7 billion parameters respectively. Next, we finetune the aforementioned models to create Toucan, an Afrocentric machine translation model designed to support 156 African language pairs. To evaluate Toucan, we carefully develop an extensive machine translation benchmark, dubbed Afro-Lingu-MT, tailored for evaluating machine translation. Toucan significantly outperforms other models, showcasing its remarkable performance on MT for African languages. Finally, we train a new model, spBLEU-1K, to enhance translation evaluation metrics, covering 1K languages, including African languages. This work aims to advance the field of NLP, fostering cross-cultural understanding and knowledge exchange, particularly in regions with limited language resources such as Africa.
- 2024.findings-acl.781
+ 2024.findings-acl.781
elmadany-etal-2024-toucan
10.18653/v1/2024.findings-acl.781
@@ -17075,7 +17075,7 @@
JulianMcAuleyUniversity of California, San Diego, University of California, San Diego
13207-13219
We consider the task of building a dialogue system that can motivate users to adopt positive lifestyle changes, Motivational Interviewing (MI). Addressing such a task requires a system that could infer how to motivate the user effectively. We propose DIIR, a framework that is capable of learning and applying conversation strategies in the form of natural language inductive rules from expert demonstrations. Automatic and human evaluation on instruction-following large language models show that the natural language strategy descriptions discovered by DIIR can improve active listening skills, reduce unsolicited advice, and promote more collaborative and less authoritative conversations, outperforming in-context demonstrations that are over 50 times longer.
- 2024.findings-acl.782
+ 2024.findings-acl.782
xie-etal-2024-shot-dialogue
10.18653/v1/2024.findings-acl.782
@@ -17087,7 +17087,7 @@
HitomiYanakathe University of Tokyo
13220-13239
Compositional generalization refers to the ability to generalize to novel combinations of previously observed words and syntactic structures. Since it is regarded as a desired property of neural models, recent work has assessed compositional generalization in machine translation as well as semantic parsing. However, previous evaluations with machine translation have focused mostly on lexical generalization (i.e., generalization to unseen combinations of known words). Thus, it remains unclear to what extent models can translate sentences that require structural generalization (i.e., generalization to different sorts of syntactic structures). To address this question, we construct SGET, a machine translation dataset covering various types of compositional generalization with control of words and sentence structures. We evaluate neural machine translation models on SGET and show that they struggle more in structural generalization than in lexical generalization. We also find different performance trends in semantic parsing and machine translation, which indicates the importance of evaluations across various tasks.
- 2024.findings-acl.783
+ 2024.findings-acl.783
kumon-etal-2024-evaluating
10.18653/v1/2024.findings-acl.783
@@ -17099,7 +17099,7 @@
TomekStrzalkowskiRensselaer Polytechnic Institute
13240-13255
The identification of Figurative Language (FL) features in text is crucial for various Natural Language Processing (NLP) tasks, where understanding of the author’s intended meaning and its nuances is key for successful communication. At the same time, the use of a specific blend of various FL forms most accurately reflects a writer’s style, rather than the use of any single construct, such as just metaphors or irony. Thus, we postulate that FL features could play an important role in Authorship Attribution (AA) tasks. We believe that ours is the first computational study of AA based on FL use. Accordingly, we propose a Multi-task Figurative Language Model (MFLM) that learns to detect multiple FL features in text at once. We demonstrate, through detailed evaluation across multiple test sets, that our model tends to perform equally well or better than specialized binary models in FL detection. Subsequently, we evaluate the predictive capability of joint FL features towards the AA task on three datasets, observing improved AA performance through the integration of MFLM embeddings.
- 2024.findings-acl.784
+ 2024.findings-acl.784
katsios-etal-2024-figuratively
10.18653/v1/2024.findings-acl.784
@@ -17111,7 +17111,7 @@
YilunZhouMassachusetts Institute of Technology
13256-13274
Recent large language models (LLMs) have shown indications of mathematical reasoning ability on challenging competition-level problems, especially with self-generated verbalizations of intermediate reasoning steps (i.e., chain-of-thought prompting). However, current evaluations mainly focus on the end-to-end final answer correctness, and it is unclear whether LLMs can make use of helpful side information such as problem-specific hints. In this paper, we propose a challenging benchmark dataset for enabling such analyses. The Concept and Hint-Annotated Math Problems (CHAMP) consists of high school math competition problems, annotated with concepts, or general math facts, and hints, or problem-specific tricks. These annotations allow us to explore the effects of additional information, such as relevant hints, misleading concepts, or related problems. This benchmark is difficult, with the best model only scoring 58.1% in standard settings. With concepts and hints, performance sometimes improves, indicating that some models can make use of such side information. Furthermore, we annotate model-generated solutions for their correctness. Using this corpus, we find that models often arrive at the correct final answer through wrong reasoning steps. In addition, we test whether models are able to verify these solutions, and find that most models struggle.
- 2024.findings-acl.785
+ 2024.findings-acl.785
mao-etal-2024-champ
10.18653/v1/2024.findings-acl.785
@@ -17123,7 +17123,7 @@
JieZhou
13275-13288
Contemporary translation engines based on the encoder-decoder framework have made significant strides in development. However, the emergence of Large Language Models (LLMs) has disrupted their position by presenting the potential for achieving superior translation quality. To uncover the circumstances in which LLMs excel and explore how their strengths can be harnessed to enhance translation quality, we first conduct a comprehensive analysis to assess the strengths and limitations of various commercial NMT systems and MT-oriented LLMs. Our findings indicate that neither NMT nor MT-oriented LLMs alone can effectively address all the translation issues, but MT-oriented LLMs show promise as a complementary solution to NMT systems. Building upon these insights, we propose Cooperative Decoding (CoDec), which treats NMT systems as a pretranslation model and MT-oriented LLMs as a supplemental solution to handle complex scenarios beyond the capability of NMT alone. Experimental results on the WMT22 test sets and a newly collected test set WebCrawl demonstrate the effectiveness and efficiency of CoDec, highlighting its potential as a robust solution for combining NMT systems with MT-oriented LLMs in the field of machine translation.
- 2024.findings-acl.786
+ 2024.findings-acl.786
zeng-etal-2024-improving
10.18653/v1/2024.findings-acl.786
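The cooperative-decoding idea reduces to a confidence-gated hand-off from the NMT draft to the LLM. A sketch with placeholder backends and an invented threshold, not the paper's actual gating criterion:

```python
# Sketch of CoDec-style cooperation: NMT drafts, an LLM handles hard cases.
def nmt_translate(src: str) -> tuple[str, float]:
    # Stand-in NMT backend: long inputs are treated as "hard" here.
    return f"[nmt] {src}", 0.6 if len(src.split()) > 8 else 0.95

def llm_refine(src: str, draft: str) -> str:
    return f"[llm-refined] {draft}"   # stand-in MT-oriented LLM

def codec_translate(src: str, threshold=0.8) -> str:
    draft, conf = nmt_translate(src)
    return draft if conf >= threshold else llm_refine(src, draft)

print(codec_translate("Short sentence."))
print(codec_translate("A long and idiomatically tricky sentence that the "
                      "NMT system struggles with."))
```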
@@ -17137,7 +17137,7 @@
KeiSawadarinna Co., Ltd.
13289-13305
Advances in machine learning have made it possible to perform various text and speech processing tasks, such as automatic speech recognition (ASR), in an end-to-end (E2E) manner. E2E approaches utilizing pre-trained models are gaining attention for conserving training data and resources. However, most of their applications in ASR involve only one of either a pre-trained speech or a language model. This paper proposes integrating a pre-trained speech representation model and a large language model (LLM) for E2E ASR. The proposed model enables the optimization of the entire ASR process, including acoustic feature extraction and acoustic and language modeling, by combining pre-trained models with a bridge network and also enables the application of remarkable developments in LLM utilization, such as parameter-efficient domain adaptation and inference optimization. Experimental results demonstrate that the proposed model achieves a performance comparable to that of modern E2E ASR models by utilizing powerful pre-training models with the proposed integrated approach.
- 2024.findings-acl.787
+ 2024.findings-acl.787
hono-etal-2024-integrating
10.18653/v1/2024.findings-acl.787
@@ -17148,7 +17148,7 @@
RobinJiaUniversity of Southern California
13306-13320
Detecting whether copyright holders’ works were used in LLM pretraining is poised to be an important problem. This work proposes using data watermarks to enable principled detection with only black-box model access, provided that the rightholder contributed multiple training documents and watermarked them before public release. By applying a randomly sampled data watermark, detection can be framed as hypothesis testing, which provides guarantees on the false detection rate. We study two watermarks: one that inserts random sequences, and another that randomly substitutes characters with Unicode lookalikes. We first show how three aspects of watermark design - watermark length, number of duplications, and interference - affect the power of the hypothesis test. Next, we study how a watermark’s detection strength changes under model and dataset scaling: while increasing the dataset size decreases the strength of the watermark, watermarks remain strong if the model size also increases. Finally, we view SHA hashes as natural watermarks and show that we can robustly detect hashes from BLOOM-176B’s training data, as long as they occurred at least 90 times. Together, our results point towards a promising future for data watermarks in real world use.
- 2024.findings-acl.788
+ 2024.findings-acl.788
wei-etal-2024-proving
10.18653/v1/2024.findings-acl.788
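Framing detection as hypothesis testing, as the abstract describes, means comparing the model's score on the published watermark against a null distribution of freshly sampled watermarks that were never published. A sketch with a stand-in black-box scoring function; the effect it simulates is hard-coded:

```python
# Sketch of watermark detection as a hypothesis test (toy scoring function).
import random

random.seed(0)

def random_watermark(k=16):
    return "".join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(k))

def model_score(seq: str) -> float:
    # Placeholder black-box loss: a memorized sequence scores lower than chance.
    return 1.0 - 0.5 * seq.startswith("trained")

published = "trained-" + random_watermark()
null_scores = [model_score(random_watermark()) for _ in range(1000)]
observed = model_score(published)
# p-value: how often a never-published watermark scores at least as low
p_value = sum(s <= observed for s in null_scores) / len(null_scores)
print(f"observed={observed:.2f}  p={p_value:.3f}")
```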
@@ -17160,7 +17160,7 @@
YiYang
13321-13332
Detecting hallucinations in large language model (LLM) outputs is pivotal, yet traditional fine-tuning for this classification task is impeded by the expensive and quickly outdated annotation process, especially across numerous vertical domains and in the face of rapid LLM advancements. In this study, we introduce an approach that automatically generates both faithful and hallucinated outputs by rewriting system responses. Experimental findings demonstrate that a T5-base model, fine-tuned on our generated dataset, surpasses state-of-the-art zero-shot detectors and existing synthetic generation methods in both accuracy and latency, indicating the efficacy of our approach.
- 2024.findings-acl.789
+ 2024.findings-acl.789
zhang-etal-2024-enhancing-hallucination
10.18653/v1/2024.findings-acl.789
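The data-generation recipe in the abstract, rewriting a grounded response into faithful and hallucinated variants, can be sketched with a toy rewriting rule standing in for the LLM rewriter used by the authors:

```python
# Sketch: synthesize labeled detector training data by rewriting responses.
def make_training_pair(context: str, response: str):
    faithful = response                                   # unchanged, label 0
    hallucinated = response.replace("Paris", "Berlin")    # corrupted, label 1
    return [(context, faithful, 0), (context, hallucinated, 1)]

ctx = "France's capital is Paris."
for context, resp, label in make_training_pair(ctx, "The capital is Paris."):
    print(label, resp)
```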
@@ -17177,7 +17177,7 @@
ZenglinXuFudan University
13333-13348
With the growing use of Transformer models hosted on cloud platforms to offer inference services, privacy concerns are escalating, especially concerning sensitive data like investment plans and bank account details. Secure Multi-Party Computing (SMPC) emerges as a promising solution to protect the privacy of inference data and model parameters. However, the application of SMPC in Privacy-Preserving Inference (PPI) for Transformer models often leads to considerable slowdowns or declines in performance. This is largely due to the multitude of nonlinear operations in the Transformer architecture, which are not well-suited to SMPC and are difficult to circumvent or optimize effectively. To address this concern, we introduce a comprehensive PPI framework called SecFormer to achieve fast and accurate PPI for Transformer models. We successfully eliminate the high-cost exponential and maximum operations in PPI without sacrificing model performance and develop a suite of efficient SMPC protocols by employing suitable numerical computation methods to boost other complex nonlinear functions in PPI, including GeLU, LayerNorm, and a redesigned Softmax. Our extensive experiments reveal that SecFormer outperforms MPCFormer in performance, showing improvements of 3.4% and 24.7% for BERT_BASE and BERT_LARGE, respectively. In terms of efficiency, SecFormer is 3.57 and 3.58 times faster than PUMA for BERT_BASE and BERT_LARGE, demonstrating its effectiveness and speed.
- 2024.findings-acl.790
+ 2024.findings-acl.790
luo-etal-2024-secformer
10.18653/v1/2024.findings-acl.790
@@ -17189,7 +17189,7 @@
BhuwanDhingraDuke University
13349-13365
With the proliferation of LLM-integrated applications such as GPTs, millions are deployed, offering valuable services through proprietary instruction prompts. These systems, however, are prone to prompt extraction attacks through meticulously designed queries. To help mitigate this problem, we introduce the Raccoon benchmark which comprehensively evaluates a model’s susceptibility to prompt extraction attacks. Our novel evaluation method assesses models under both defenseless and defended scenarios, employing a dual approach to evaluate the effectiveness of existing defenses and the resilience of the models. The benchmark encompasses 14 categories of prompt extraction attacks, with additional compounded attacks that closely mimic the strategies of potential attackers, alongside a diverse collection of defense templates. This array is, to our knowledge, the most extensive compilation of prompt theft attacks and defense mechanisms to date. Our findings highlight universal susceptibility to prompt theft in the absence of defenses, with OpenAI models demonstrating notable resilience when protected. This paper aims to establish a more systematic benchmark for assessing LLM robustness against prompt extraction attacks, offering insights into their causes and potential countermeasures.
- 2024.findings-acl.791
+ 2024.findings-acl.791
wang-etal-2024-raccoon
10.18653/v1/2024.findings-acl.791
@@ -17204,7 +17204,7 @@
Jian-Yun Nie, University of Montreal
13366-13378
Conversational search facilitates complex information retrieval by enabling multi-turn interactions between users and the system. Supporting such interactions requires a comprehensive understanding of the conversational inputs to formulate a good search query based on historical information. In particular, the search query should include the relevant information from the previous conversation turns. However, current approaches for conversational dense retrieval primarily rely on fine-tuning a pre-trained ad-hoc retriever using the whole conversational search session, which can be lengthy and noisy. Moreover, existing approaches are limited by the amount of manual supervision signals in the existing datasets. To address the aforementioned issues, we propose a **H**istory-**A**ware **Conv**ersational **D**ense **R**etrieval (HAConvDR) system, which incorporates two ideas: context-denoised query reformulation and automatic mining of supervision signals based on the actual impact of historical turns. Experiments on two public conversational search datasets demonstrate the improved history modeling capability of HAConvDR, in particular for long conversations with topic shifts.
- 2024.findings-acl.792
+ 2024.findings-acl.792
mo-etal-2024-history
10.18653/v1/2024.findings-acl.792
@@ -17218,7 +17218,7 @@
Yanghua Xiao, Fudan University
13379-13389
Multi-Modal Knowledge Graphs (MMKGs) have proven valuable for various downstream tasks. However, scaling them up is challenging because building large-scale MMKGs often introduces mismatched images (i.e., noise). Most entities in KGs belong to the long tail, meaning there are few images of them available online. This scarcity makes it difficult to determine whether a found image matches the entity. To address this, we draw on the Triangle of Reference Theory and suggest enhancing vision-language models with concept guidance. Specifically, we introduce COG, a two-stage framework with COncept-Guided vision-language models. The framework comprises a Concept Integration module, which effectively identifies image-text pairs of long-tailed entities, and an Evidence Fusion module, which offers explainability and enables human verification. To demonstrate the effectiveness of COG, we create a dataset of 25k image-text pairs of long-tailed entities. Our comprehensive experiments show that COG not only improves the accuracy of recognizing long-tailed image-text pairs compared to baselines but also offers flexibility and explainability.
- 2024.findings-acl.793
+ 2024.findings-acl.793
zhang-etal-2024-light
10.18653/v1/2024.findings-acl.793
@@ -17231,7 +17231,7 @@
Yue Zhang, Westlake University
13390-13405
Zero-shot stance detection that aims to detect the stance (typically against, favor, or neutral) towards unseen targets has attracted considerable attention. However, most previous studies only focus on targets from a single or limited text domains (e.g., financial domain), and thus zero-shot models cannot generalize well to unseen targets of diverse domains (e.g., political domain). In this paper, we consider a more realistic task, i.e., open-domain stance detection, which aims at training a model that is able to generalize well to unseen targets across multiple domains of interest. Particularly, we propose a novel dataset generation method ZeroStance, which leverages ChatGPT to construct a synthetic open-domain dataset CHATStance that covers a wide range of domains. We then train an open-domain model on our synthetic dataset after proper data filtering. Extensive results indicate that our model, when trained on this synthetic dataset, shows superior generalization to unseen targets of diverse domains over baselines on most benchmarks. Our method requires only a task description in the form of a prompt and is much more cost-effective and data-efficient than previous methods. We will release our code and data to facilitate future research.
- 2024.findings-acl.794
+ 2024.findings-acl.794
zhao-etal-2024-zerostance
10.18653/v1/2024.findings-acl.794
@@ -17242,7 +17242,7 @@
Preethi Jyothi, Indian Institute of Technology Bombay
13406-13422
Large language models (LLMs) are very proficient text generators. We leverage this capability of LLMs to generate task-specific data via zero-shot prompting and promote cross-lingual transfer for low-resource target languages. Given task-specific data in a source language and a teacher model trained on this data, we propose using this teacher to label LLM generations and employ a set of simple data selection strategies that use the teacher’s label probabilities. Our data selection strategies help us identify a representative subset of diverse generations that help boost zero-shot accuracies while being efficient, in comparison to using all the LLM generations (without any subset selection). We also highlight other important design choices that affect cross-lingual performance such as the use of translations of source data and what labels are best to use for the LLM generations. We observe significant performance gains across sentiment analysis and natural language inference tasks (of up to a maximum of 7.13 absolute points and 1.5 absolute points on average) across a number of target languages (Hindi, Marathi, Urdu, Swahili) and domains.
- 2024.findings-acl.795
+ 2024.findings-acl.795
fazili-etal-2024-boosting
10.18653/v1/2024.findings-acl.795
@@ -17256,7 +17256,7 @@
Bo Wang, School of Artificial Intelligence, Jilin University
13423-13439
Learning multi-task models for jointly detecting stance and verifying rumors poses challenges due to the need for training data of stance at post level and rumor veracity at claim level, which are difficult to obtain. To address this issue, we leverage large language models (LLMs) as the foundation annotators for the joint stance detection (SD) and rumor verification (RV) tasks, dubbed as JSDRV. We introduce a novel reinforcement tuning framework to enhance the joint predictive capabilities of LLM-based SD and RV components. Specifically, we devise a policy for selecting LLM-annotated data at the two levels, employing a hybrid reward mechanism to choose high-quality labels for effective LLM fine-tuning on both tasks. Results demonstrate that JSDRV improves the capabilities of LLMs in the joint tasks, not only outperforming state-of-the-art methods but also generalizing to non-LLMs accommodated as task models.
- 2024.findings-acl.796
+ 2024.findings-acl.796
yang-etal-2024-reinforcement
10.18653/v1/2024.findings-acl.796
@@ -17268,7 +17268,7 @@
Benyou Wang, The Chinese University of Hong Kong, Shenzhen
13440-13451
Despite the success of data augmentation in improving the CLIP model, existing methods that utilize LLM or SAM to enrich the information in captions still suffer from several limitations, including insufficient detail and excessive hallucinations, ultimately resulting in compromised alignment and masking the true potential of dense information. This can lead to erroneous conclusions about CLIP’s ability to handle rich data, impeding the development of more effective models. To address the limitations of existing methods, we introduce a novel pipeline that generates highly detailed, factually accurate captions for images, which facilitates in-depth analysis of the potential for dense information in multimodal alignment. Contrary to previous findings, our investigation revealed that lengthening captions boosts performance across diverse benchmarks, even surpassing the effectiveness of meticulously crafted hard negative samples. Building on these insights, we introduce DELIP, which demonstrably enhances both foundational multimodal alignment and compositional reasoning abilities. Finally, we explore strategies to expand the context window of the text encoder, unlocking the potential of richer data for CLIP and paving the way for advancements in leveraging dense information for multimodal alignment.
- 2024.findings-acl.797
+ 2024.findings-acl.797
fan-etal-2024-exploring
10.18653/v1/2024.findings-acl.797
@@ -17280,7 +17280,7 @@
Karthik Narasimhan, Princeton University
13452-13461
We propose Referral-Augmented Retrieval (RAR), a simple technique that concatenates document indices with referrals: text from other documents that cite or link to the given document. We find that RAR provides significant performance gains for tasks across paper retrieval, entity retrieval, and open-domain question-answering in both zero-shot and in-domain (e.g., fine-tuned) settings. We examine how RAR provides especially strong improvements on more structured tasks, and can greatly outperform generative text expansion techniques such as DocT5Query and Query2Doc, with a 37% and 21% absolute improvement on ACL paper retrieval, respectively. We also compare three ways to aggregate referrals for RAR. Overall, we believe RAR can help revive and re-contextualize the classic information retrieval idea of using anchor texts to improve the representations of documents in a wide variety of corpuses in the age of neural retrieval.
- 2024.findings-acl.798
+ 2024.findings-acl.798
tang-etal-2024-referral
10.18653/v1/2024.findings-acl.798
@@ -17294,7 +17294,7 @@
Sujian Li, Peking University
13462-13474
This paper explores how to construct a general text evaluator based on open-source Large Language Models (LLMs), a domain predominantly occupied by commercial counterparts such as GPT-4. Recognizing the limitations of open-source models like Llama in evaluative tasks, we introduce InstructEval, a general multi-aspect text evaluator developed through instruction tuning of open-source LLMs. To overcome the shortage of annotated resources for multi-aspect evaluations, InstructEval combines extensive open Human Preference Modeling (HPM) datasets with a small set of multi-aspect annotated data. This approach not only enhances effectiveness in overall evaluation tasks but also exhibits improved performance in multi-aspect evaluation tasks. As demonstrated by our extensive experiments, InstructEval achieves comparable or superior performance to commercial LLMs like ChatGPT or GPT-4 in terms of both overall and multi-aspect evaluation.
- 2024.findings-acl.799
+ 2024.findings-acl.799
wu-etal-2024-instructeval
10.18653/v1/2024.findings-acl.799
@@ -17305,7 +17305,7 @@
Thai Le, Indiana University
13475-13491
Existing works have shown that fine-tuned textual transformer models achieve state-of-the-art prediction performances but are also vulnerable to adversarial text perturbations. Traditional adversarial evaluation is often done only after fine-tuning the models and ignoring the training data. In this paper, we want to prove that there is also a strong correlation between training data and model robustness. To this end, we extract 13 different features representing a wide range of input fine-tuning corpora properties and use them to predict the adversarial robustness of the fine-tuned models. Focusing mostly on encoder-only transformer models BERT and RoBERTa with additional results for BART, ELECTRA and GPT2, we provide diverse evidence to support our argument. First, empirical analyses show that (a) extracted features can be used with a lightweight classifier such as Random Forest to effectively predict the attack success rate and (b) features with the most influence on the model robustness have a clear correlation with the robustness. Second, our framework can be used as a fast and effective additional tool for robustness evaluation since it (a) saves 30x-193x runtime compared to the traditional technique, (b) is transferable across models, (c) can be used under adversarial training, and (d) is robust to statistical randomness. Our code is publicly available at https://github.com/CaptainCuong/RobustText_ACL2024.
- 2024.findings-acl.800
+ 2024.findings-acl.800
cuong-etal-2024-curious
10.18653/v1/2024.findings-acl.800
@@ -17320,7 +17320,7 @@
Julian McAuley, University of California, San Diego
13492-13510
Do current large language models (LLMs) better solve graph reasoning and generation tasks with parameter updates? In this paper, we propose InstructGraph, a framework that empowers LLMs with the abilities of graph reasoning and generation by instruction tuning and preference alignment. Specifically, we first propose a structured format verbalizer to unify all graph data into a universal code-like format, which can simply represent the graph without any external graph-specific encoders. Furthermore, a graph instruction tuning stage is introduced to guide LLMs in solving graph reasoning and generation tasks. Finally, we identify potential hallucination problems in graph tasks and sample negative instances for preference alignment, with the goal of enhancing the reliability of the model’s output. Extensive experiments across multiple graph-centric tasks exhibit that InstructGraph can achieve the best performance and outperform GPT-4 and LLaMA2 by more than 13% and 38%, respectively.
- 2024.findings-acl.801
+ 2024.findings-acl.801
wang-etal-2024-instructgraph
10.18653/v1/2024.findings-acl.801
@@ -17334,7 +17334,7 @@
Seung-won Hwang, Seoul National University
13511-13525
Agents powered by large language models (LLMs) inherit important limitations, such as the restricted context length, dependency on human-engineered exemplars (e.g., for task decomposition), and insufficient generalization. To address these challenges, we propose RaDA, a novel planning method for Web agents that does not require manual exemplars, efficiently leverages the LLMs’ context, and enhances generalization. RaDA disentangles planning into two stages: for a new given task, during Retrieval-augmented Task Decomposition (RaD), it decomposes tasks into high-level subtasks; next, during Retrieval-augmented Action Generation (RaA), it traverses the trajectory obtained with RaD to iteratively synthesize actions based on dynamically retrieved exemplars. We compare RaDA with strong baselines covering a broad space of design choices, using both GPT-3.5 and GPT-4 as backbones; and we find consistent improvements over previous SOTA in two challenging benchmarks, CompWoB and Mind2Web, covering settings with different complexities. We show the contributions of RaDA via ablation studies and qualitative analysis; and we discuss the structural benefits of our more compositional design.
- 2024.findings-acl.802
+ 2024.findings-acl.802
kim-etal-2024-rada
10.18653/v1/2024.findings-acl.802
@@ -17354,7 +17354,7 @@
Weizhu Chen, Microsoft GenAI
13526-13544
Large language models (LLMs) have demonstrated impressive reasoning capabilities, yet there has recently been ongoing debate about these abilities and the potential data contamination problem. This paper aims to evaluate the reasoning capacities of LLMs, specifically in solving recent competition-level programming problems in Codeforces, which are expert-crafted and unique, requiring deep understanding and robust reasoning skills. We first provide a comprehensive evaluation of GPT-4’s perceived zero-shot performance on this task, considering various aspects such as problems’ release time, difficulties, and types of errors encountered. Surprisingly, the perceived performance of GPT-4 has experienced a cliff-like decline in problems after September 2021, consistently across all the difficulties and types of problems, which shows the potential data contamination, as well as the challenges for any existing LLM to solve unseen complex reasoning problems. We further explore various approaches such as fine-tuning, Chain-of-Thought prompting and problem description simplification. Unfortunately, none of them is able to consistently mitigate the challenges. Through our work, we emphasize the importance of this excellent data source for assessing the genuine reasoning capabilities of LLMs, and foster the development of LLMs with stronger reasoning abilities and better generalization in the future.
- 2024.findings-acl.803
+ 2024.findings-acl.803
huang-etal-2024-competition
10.18653/v1/2024.findings-acl.803
@@ -17368,7 +17368,7 @@
Erik Cambria, Nanyang Technological University
13545-13565
Hypothetical induction is recognized as the main reasoning type when scientists make observations about the world and try to propose hypotheses to explain those observations. Past research on hypothetical induction is under a constrained setting: (1) the observation annotations in the dataset are carefully manually handpicked sentences (resulting in a closed-domain setting); and (2) the ground truth hypotheses are mostly commonsense knowledge, making the task less challenging. In this work, we tackle these problems by proposing the first dataset for social science academic hypotheses discovery, with the final goal of creating systems that automatically generate valid, novel, and helpful scientific hypotheses, given only a pile of raw web corpus. Unlike previous settings, the new dataset requires (1) using open-domain data (raw web corpus) as observations; and (2) proposing hypotheses even new to humanity. A multi-module framework is developed for the task, including three different feedback mechanisms to boost performance, which exhibits superior performance in terms of both GPT-4 based and expert-based evaluation. To the best of our knowledge, this is the first work showing that LLMs are able to generate novel (“not existing in literature”) and valid (“reflecting reality”) scientific hypotheses.
- 2024.findings-acl.804
+ 2024.findings-acl.804
yang-etal-2024-large-language
10.18653/v1/2024.findings-acl.804
@@ -17379,7 +17379,7 @@
Yuchen Lyu
13566-13577
Recent studies have shown that fusing text labels and context sentences is an effective method for learning prototype representations in few-shot relation extraction. However, the **inconsistency of prototype representations** across different few-shot tasks persists due to different context sentences for the same relation, even with the integration of text labels into prototype representations. Conversely, the text label for each relation is unique and consistent, which prompts us to propose (1) a **dual prototype learning method**. Unlike previous methods that only construct support-based prototypes, we additionally construct label-based prototypes. Furthermore, we introduce a graph-based prototype adjustment module to construct topological information between support-based and label-based prototypes, thereby generating a more effective similarity measure through a simple linear combination. In addition, relations of different granularities have different distribution widths in the same semantic space, and this **imbalanced distribution in the semantic space** leads to a lack of comparability among relations. To create a more discriminative semantic space, we propose (2) a **granularity-aware prototype learning method** that unifies the distribution width of relations, making relations of different granularities have similar distribution widths. Experimental results on two public benchmark datasets show that our proposed methods achieve state-of-the-art performance in few-shot relation classification.
- 2024.findings-acl.805
+ 2024.findings-acl.805
li-lyu-2024-gradual
10.18653/v1/2024.findings-acl.805
@@ -17392,7 +17392,7 @@
Rui Wang
13578-13589
Recent advancements in Chinese Spelling Correction (CSC) predominantly leverage pre-trained language models (PLMs). However, a notable challenge with fine-tuned PLM-based CSC models is their tendency to over-correct, leading to poor generalization for error patterns outside the standard distribution. To address this, we developed a teacher network guided by prior knowledge for distillation learning of CSC models. Unlike traditional teacher networks, which depend on task-related pre-training, our method infuses task-related prior information into the teacher network, offering guidance beyond mere labels to the student network. This strategy significantly enhances the CSC model’s language modeling capabilities, crucial for minimizing over-correction. Importantly, our approach is model-independent and the teacher network does not require task-related pre-training, making it broadly applicable for enhancing various PLM-based CSC models with minimal additional computational resources. Extensive experiments on widely used benchmarks demonstrate that our method achieves new state-of-the-art results. Additionally, we explored the potential of generalizing our method to other non-autoregressive text-generation tasks.
- 2024.findings-acl.806
+ 2024.findings-acl.806
wei-etal-2024-training
10.18653/v1/2024.findings-acl.806
@@ -17410,7 +17410,7 @@
Rita Cucchiara, Università di Modena e Reggio Emilia
13590-13618
Connecting text and visual modalities plays an essential role in generative intelligence. For this reason, inspired by the success of large language models, significant research efforts are being devoted to the development of Multimodal Large Language Models (MLLMs). These models can seamlessly integrate visual and textual modalities, while providing a dialogue-based interface and instruction-following capabilities. In this paper, we provide a comprehensive review of recent visual-based MLLMs, analyzing their architectural choices, multimodal alignment strategies, and training techniques. We also conduct a detailed analysis of these models across a wide range of tasks, including visual grounding, image generation and editing, visual understanding, and domain-specific applications. Additionally, we compile and describe training datasets and evaluation benchmarks, conducting comparisons among existing models in terms of performance and computational requirements. Overall, this survey offers a comprehensive overview of the current state of the art, laying the groundwork for future MLLMs.
- 2024.findings-acl.807
+ 2024.findings-acl.807
caffagni-etal-2024-revolution
10.18653/v1/2024.findings-acl.807
@@ -17424,8 +17424,8 @@
Bo Du, Wuhan University
Dacheng Tao, University of Sydney
13619-13639
-
- 2024.findings-acl.808
+ Advancing automated programming necessitates robust and comprehensive code generation benchmarks, yet current evaluation frameworks largely neglect object-oriented programming (OOP) in favour of functional programming (FP), e.g., HumanEval and MBPP. To address this, our study introduces a pioneering OOP-focused benchmark, featuring 431 Python programs that encompass essential OOP concepts and features like classes and encapsulation methods. We propose a novel evaluation metric, pass@o, tailored for OOP, enhancing the traditional pass@k metric. Our evaluation of 23 leading large language models (LLMs), including both general and code-specialized models, reveals three key insights: 1) pass@o offers a more relevant and comprehensive assessment for OOP code generation; 2) Despite excelling in FP, code-specialized LLMs like WizardCoder lag in OOP compared to models like ChatGPT; 3) The poor performance of all advanced LLMs on our OOP benchmark highlights a critical need for improvements in this field. Our benchmark and scripts will be publicly released at GitHub.
+ 2024.findings-acl.808
wang-etal-2024-oop
10.18653/v1/2024.findings-acl.808
@@ -17444,7 +17444,7 @@
Dahua Lin, The Chinese University of Hong Kong
13640-13656
The programming skill is one crucial ability for Large Language Models (LLMs), necessitating a deep understanding of programming languages (PLs) and their correlation with natural languages (NLs). We examine the impact of pre-training data on code-focused LLMs’ performance by assessing the comment density as a measure of PL-NL alignment. Given the scarcity of code-comment aligned data in pre-training corpora, we introduce a novel data augmentation method that generates comments for existing code, coupled with a data filtering strategy that filters out code data poorly correlated with natural language. We conducted experiments on three code-focused LLMs and observed consistent improvements in performance on two widely-used programming skill benchmarks. Notably, the model trained on the augmented data outperformed both the model used for generating comments and the model further trained on the data without augmentation.
- 2024.findings-acl.809
+ 2024.findings-acl.809
song-etal-2024-code
10.18653/v1/2024.findings-acl.809
@@ -17458,7 +17458,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
13657-13670
Domain adaptation remains a challenge in the realm of Neural Machine Translation (NMT), even in the era of large language models (LLMs). Existing non-parametric approaches like nearest neighbor machine translation have enabled small Autoregressive Translation (AT) models to achieve efficient domain generalization and adaptation without updating parameters, while leaving the Non-Autoregressive Translation (NAT) counterparts under-explored. To fill this gap, we introduce Bi-kNN, an innovative and efficient domain adaptation approach for NAT models that tailors a k-nearest-neighbor algorithm for NAT. Specifically, we introduce an effective datastore construction and correlated updating strategies to conform to the parallel nature of NAT. Additionally, we train a meta-network that seamlessly integrates the kNN distribution with the NMT distribution robustly during the iterative decoding process of NAT. Our experimental results across four benchmark datasets demonstrate that our Bi-kNN not only achieves significant improvements over the Base-NAT model (7.8 BLEU on average) but also exhibits enhanced efficiency.
- 2024.findings-acl.810
+ 2024.findings-acl.810
you-etal-2024-efficient
10.18653/v1/2024.findings-acl.810
@@ -17471,7 +17471,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
13671-13685
Large language models (LLMs) have presented remarkable capabilities in a wide range of natural language understanding and reasoning tasks. Despite their success, a few works indicate that LLMs suffer from the “reversal curse”, in which LLMs can’t employ the inverted structure “B is A” when they are trained based on “A is B”. To explore the effect of the “reversal curse” for LLMs on complex mathematical reasoning tasks, we present two reversal datasets upon GSM8K and MathQA and verify that LLMs also struggle to solve reversal mathematical problems. We analyze the potential reason and attribute it to the insufficient modeling of the relationship between reasoning steps caused by the left-to-right objective. Consequently, based on the characteristics of multi-step reasoning, we design a novel training method to improve the general and reversal reasoning abilities. Finally, we conduct experiments on four mathematical datasets, and the results demonstrate that our method significantly improves the general reasoning capacities and alleviates the reversal problem. Our datasets and codes are available at https://github.com/AllForward/ReversalMath.
- 2024.findings-acl.811
+ 2024.findings-acl.811
guo-etal-2024-exploring
10.18653/v1/2024.findings-acl.811
@@ -17484,7 +17484,7 @@
Zhendong Niu, Beijing Institute of Technology
13686-13696
Knowledge graph completion (KGC) is the task of inferring the missing knowledge in the knowledge graph based on known factual triples. However, present KGC approaches still face the following two challenges. Those methods perform a simple linear update on relation representations, and only local neighborhood information is aggregated, which makes it difficult to capture the logical semantics between relations and global topological context information. To tackle the above challenges, we propose a unified joint approach with Topological Context learning and Rule Augmentation (TCRA) for KGC. The TCRA framework consists of an entity topological context learning mechanism based on a dual-branch hierarchical graph attention network, and a relation rule context learning mechanism based on Rule-Transformer and a rule-to-relation aggregator. The former mechanism encodes the topological structure features of entities, aggregates the local neighborhood topological context information of entities on three levels (entity, relation and triple), and builds clusters of global head or tail entities related to the same relation. It can capture the local and global topological context information of entities related to the same relation. The latter mechanism introduces chain-like Horn rules as the context information of relations, and encodes the logical semantics of relations to enrich the relation representation. Experimental results on three benchmark datasets FB15k-237, WN18RR and Kinship indicate the effectiveness and superiority of our proposed approach. The codes are publicly available.
- 2024.findings-acl.812
+ 2024.findings-acl.812
guo-etal-2024-unified
10.18653/v1/2024.findings-acl.812
@@ -17504,7 +17504,7 @@
Thang Luong, Google
13697-13720
Since most large language models (LLMs) are trained once and never updated, they struggle to dynamically adapt to our ever-changing world. In this work, we present FreshQA, a dynamic QA benchmark that tests a model’s ability to answer questions that may require reasoning over up-to-date world knowledge. We develop a two-mode human evaluation procedure to measure both correctness and hallucination, which we use to benchmark both closed and open-source LLMs by collecting >50K human judgments. We observe that all LLMs struggle to answer questions that require fast-changing world knowledge as well as questions with false premises that need to be debunked. In response, we develop FreshPrompt, a few-shot prompting method that curates and organizes relevant information from a search engine into an LLM’s prompt. Our experiments show that FreshPrompt outperforms both competing search engine-augmented prompting methods such as Self-Ask (Press et al., 2022) as well as commercial systems such as Perplexity.AI. To facilitate future work, we additionally develop FreshEval, a reliable autorater for quick evaluation and comparison on FreshQA. Our latest results with FreshEval suggest that open-source LLMs such as Mixtral (Jiang et al., 2024), when combined with FreshPrompt, are competitive with closed-source and commercial systems on search-augmented QA.
- 2024.findings-acl.813
+ 2024.findings-acl.813
vu-etal-2024-freshllms
10.18653/v1/2024.findings-acl.813
@@ -17517,7 +17517,7 @@
Dacheng Tao, University of Sydney
13721-13736
With the development of instruction-tuned large language models (LLMs), improving the safety of LLMs has become more critical. However, the current approaches for aligning the LLMs’ output with expected safety usually require substantial training efforts, e.g., high-quality safety data and expensive computational resources, which are costly and inefficient. To this end, we present reverse prompt contrastive decoding (ROSE), a simple-yet-effective method to directly boost the safety of existing instruction-tuned LLMs without any additional training. The principle of ROSE is to improve the probability of desired safe output via suppressing the undesired output induced by the carefully-designed reverse prompts. Experiments on 6 safety and 2 general-purpose tasks show that our ROSE not only brings consistent and significant safety improvements (up to +13.8% safety score) upon 5 types of instruction-tuned LLMs, but also benefits the general-purpose ability of LLMs. In-depth analyses explore the underlying mechanism of ROSE, and reveal when and where to use it.
- 2024.findings-acl.814
+ 2024.findings-acl.814
zhong-etal-2024-rose
10.18653/v1/2024.findings-acl.814
@@ -17536,7 +17536,7 @@
Bing Han, mybank, antgroup
13737-13747
Concept reasoning is an important capability for models to understand the world. However, the existing datasets, such as concept extraction and concept generation, suffer from modeledge leakage and context leakage. To address these limitations, we construct a dataset of concept reasoning for large language models (CR-LLM) with modeledge leakage prevention and context leakage prevention, which consists of 2,167 samples and covers different concept types. In addition, we propose a hybrid reasoning method, consisting of inductive reasoning, deductive reasoning and a controller. This method allows large language models to adaptively select the optimal reasoning method for each input sample. Finally, we conduct extensive experiments on CR-LLM using different models and methods. The results show that existing large language models and reasoning methods perform sub-optimally in the concept reasoning task. In contrast, our proposed method significantly improves the capabilities, achieving a 7% increase in accuracy compared to CoT and demonstrating better granularity. We release CR-LLM and code at https://github.com/Nianqi-Li/Concept-Reasoning-for-LLMs.
- 2024.findings-acl.815
+ 2024.findings-acl.815
li-etal-2024-cr
10.18653/v1/2024.findings-acl.815
@@ -17550,7 +17550,7 @@
Yaliang Li, Alibaba Group
13748-13761
Recently, multi-task instruction tuning has been utilized to improve sentence representation learning (SRL). It enables SRL models to generate task-specific representations with the guidance of task instruction, thus exhibiting strong generalization ability on unseen tasks. However, these methods mostly neglect the potential interference problems across different tasks and instances, which may affect the training of the model. To address this issue, we propose a data curriculum method, namely **Data-CUBE**, that arranges the order of all the multi-task data for training, to minimize the interference risks from two aspects. At the task level, we aim to find the optimal task order to minimize the total cross-task interference risk and formulate this problem as the traveling salesman problem, which is further solved by a specially designed simulated annealing algorithm. At the instance level, we propose a measurement method to quantify the difficulty of all instances per task, and then arrange instances in an easy-to-difficult order for training. Experimental results show that our approach can boost the performance of state-of-the-art methods. Our code and data will be publicly released.
- 2024.findings-acl.816
+ 2024.findings-acl.816
min-etal-2024-data
10.18653/v1/2024.findings-acl.816
@@ -17565,7 +17565,7 @@
Xu Chu
13762-13774
Extracting semantic topics from short texts presents a significant challenge in the field of data mining. While efforts have been made to mitigate the data sparsity issue, the limited length of short documents also results in the absence of semantically relevant words, causing a biased evidence lower bound and incomplete labels for likelihood maximization. We refer to this issue as the label sparsity problem. To combat this problem, we propose kNNTM, a neural short text topic model that incorporates a k-Nearest-Neighbor-based label completion algorithm by augmenting the reconstruction label with k-nearest documents to complement these relevant but unobserved words. Furthermore, seeking a precise reflection of distances between documents, we propose a fused multi-view distance metric that takes both local word similarities and global topic semantics into consideration. Extensive experiments on multiple public short-text datasets show that the kNNTM model outperforms state-of-the-art baseline models and can derive both high-quality topics and document representations.
- 2024.findings-acl.817
+ 2024.findings-acl.817
lin-etal-2024-combating
10.18653/v1/2024.findings-acl.817
@@ -17577,7 +17577,7 @@
Yue Zhang, Westlake University
13775-13791
The application scope of large language models (LLMs) is increasingly expanding. In practical use, users might provide feedback based on the model’s output, hoping for a responsive model that can complete responses according to their feedback. Whether the model can appropriately respond to users’ refuting feedback and consistently follow through with execution has not been thoroughly analyzed. In light of this, this paper proposes a comprehensive benchmark, RefuteBench, covering tasks such as question answering, machine translation, and email writing. The evaluation aims to assess whether models can positively accept feedback in the form of refuting instructions and whether they can consistently adhere to user demands throughout the conversation. We conduct evaluations on numerous LLMs and find that LLMs are stubborn, i.e., they exhibit an inclination toward their internal knowledge, often failing to comply with user feedback. Additionally, as the length of the conversation increases, models gradually forget the user’s stated feedback and roll back to their own responses. We further propose recall-and-repeat prompting as a simple and effective way to enhance the model’s responsiveness to feedback.
- 2024.findings-acl.818
+ 2024.findings-acl.818
yan-etal-2024-refutebench
10.18653/v1/2024.findings-acl.818
@@ -17588,7 +17588,7 @@
Yixin Cao, Fudan University
13792-13803
Complex logical query answering (CLQA) is a challenging task that involves finding answer entities for complex logical queries over incomplete knowledge graphs (KGs). Previous research has explored the use of pre-trained knowledge graph completion (KGC) models, which can predict the missing facts in KGs, to answer complex logical queries. However, KGC models are typically evaluated using ranking evaluation metrics, which may result in values of predictions of KGC models that are not well-calibrated. In this paper, we propose a method for calibrating KGC models, namely CKGC, which enables KGC models to adapt to answering complex logical queries. Notably, CKGC is lightweight and effective. The adaptation function is simple, allowing the model to quickly converge during the adaptation process. The core concept of CKGC is to map the values of predictions of KGC models to the range [0, 1], ensuring that values associated with true facts are close to 1, while values linked to false facts are close to 0. Through experiments on three benchmark datasets, we demonstrate that our proposed calibration method can significantly boost model performance in the CLQA task. Moreover, our approach can enhance the performance of CLQA while preserving the ranking evaluation metrics of KGC models. The code is available at https://github.com/changyi7231/CKGC.
- 2024.findings-acl.819
+ 2024.findings-acl.819
xiao-cao-2024-complex
10.18653/v1/2024.findings-acl.819
@@ -17601,7 +17601,7 @@
Hsin-Hsi Chen, National Taiwan University
13804-13815
This paper introduces a novel approach to analyzing the forward-looking statements in equity research reports by integrating argument mining with sentiment analysis. Recognizing the limitations of traditional models in capturing the nuances of future-oriented analysis, we propose a refined categorization of argument units into claims, premises, and scenarios, coupled with a unique sentiment analysis framework. Furthermore, we incorporate a temporal dimension to categorize the anticipated impact duration of market events. To facilitate this study, we present the Equity Argument Mining and Sentiment Analysis (Equity-AMSA) dataset. Our research investigates the extent to which detailed domain-specific annotations can be provided, the necessity of fine-grained human annotations in the era of large language models, and whether our proposed framework can improve performance in downstream tasks over traditional methods. Experimental results reveal the significance of manual annotations, especially for scenario identification and sentiment analysis. The study concludes that our annotation scheme and dataset contribute to a deeper understanding of forward-looking statements in equity research reports.
- 2024.findings-acl.820
+ 2024.findings-acl.820
lin-etal-2024-argument
10.18653/v1/2024.findings-acl.820
@@ -17615,7 +17615,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
13816-13836
Large language models (LLMs) have showcased their remarkable capabilities to handle various downstream tasks, including multilingual machine translation ability. Despite their impressive performance, decoder-only LLMs lack an explicit alignment between source and target contexts, leading to translation that may not faithfully represent the original content. To address this, we propose three learning strategies to encourage LLMs to pay more attention to the source context during translation: 1) adjusting attention weights on the source context by adaptive attention re-weighting; 2) suppressing the irrelevant target prefix using contrastive decoding; 3) avoiding excessive reliance on the target prefix through target-constrained tuning. To verify the effectiveness of our model, we curate a new dataset specifically focusing on unfaithful translations generated by LLMs. Experimental results on both human-collected and general test sets verify the effectiveness of our model across multiple language pairs. Further human evaluation demonstrates the efficacy of our method in reducing hallucinatory translation and improving the fidelity of translations.
- 2024.findings-acl.821
+ 2024.findings-acl.821
zhang-etal-2024-paying
10.18653/v1/2024.findings-acl.821
@@ -17626,7 +17626,7 @@
Minho Lee, Kyungpook National University
13837-13856
Block Diagrams play an essential role in visualizing the relationships between components or systems. Generating summaries of block diagrams is important for document understanding or question answering (QA) tasks by providing concise overviews of complex systems. However, it’s a challenging task as it requires compressing complex relationships into informative descriptions. In this paper, we present “BlockNet”, a fusion framework that summarizes block diagrams by integrating local and global information, catering to both English and Korean languages. Additionally, we introduce a new multilingual method to produce block diagram data, resulting in a high-quality dataset called “BD-EnKo”. In BlockNet, we develop “BlockSplit”, an Optical Character Recognition (OCR) based algorithm employing the divide-and-conquer principle for local information extraction. We train an OCR-free transformer architecture for global information extraction using BD-EnKo and public data. To assess the effectiveness of our model, we conduct thorough experiments on different datasets. The assessment shows that BlockNet surpasses all previous methods and models, including GPT-4V, for block diagram summarization.
- 2024.findings-acl.822
+ 2024.findings-acl.822
bhushan-etal-2024-unveiling
10.18653/v1/2024.findings-acl.822
@@ -17642,7 +17642,7 @@
Xinyu Dai, Nanjing University
13857-13867
Text2SQL is a task that translates natural language into SQL statements. Context-dependent Text2SQL offers a more natural database interaction by simulating dialogues between users and databases, with CoSQL and SparC as representative datasets. Yet, these datasets struggle to accurately replicate real-world situations. To address this, we introduce MultiSQL, which extends them in three key aspects: (1) Diverse SQL Operations. We incorporate diverse SQL types such as Create, Update, and Insert to broaden the scope of SQL operations. (2) Schema-Integrated Context. We integrated query context with database schema dependencies to better depict database complexity. (3) Extended Dialogues. We expand dialogue length to better simulate long conversations and complex interactions. This multi-type, schema-integrated, context-dependent Text2SQL dataset comprises nearly 800 dialogue groups and over 9,000 interaction turns across 166 complex databases, offering a better benchmark for interactive user-database dialogue. Addressing MultiSQL’s challenges, we refined evaluation metrics to better capture diverse SQL types and schema dependencies. We designed a prompt framework that leverages historical data and self-refinement to accurately capture the dependency between text queries and database structures. Experiments with GPT-3.5, GPT-4, and LLaMA2-7B show both the effectiveness of our strategies and the challenges of MultiSQL. The dataset is available at https://github.com/grandchicken/MultiSQL.
- 2024.findings-acl.823
+ 2024.findings-acl.823
li-etal-2024-multisql
10.18653/v1/2024.findings-acl.823
@@ -17657,7 +17657,7 @@
Min Zhang, Harbin Institute of Technology, Shenzhen
13868-13881
Tuning-based large language models for machine translation (aka large translation model, LTM) have demonstrated significant performance in the field of machine translation. Despite their success, these models often face difficulties in leveraging demonstrations to further improve their performance. To tackle this challenge, we introduce a novel approach that integrates demonstration-aware training and inference strategies within the framework of tuning-based LTMs, hereafter referred to as demonstration-aware LTMs. During training, we enrich the model’s learning process by incorporating both sentence- and document-level demonstrations derived from its original training dataset. During inference, the model synergizes its own contextual translations with retrieved high-quality demonstrations, leading to more precise and contextually appropriate outputs. Empirical results reveal that our demonstration-aware LTM not only mitigates the negative impacts traditionally associated with demonstrations but also secures substantial improvements in translation accuracy, particularly in domain-specific and document-level translation tasks. Source code and scripts are freely available at https://github.com/ChenLi0620/Demo-Aware-LLM-MT.
- 2024.findings-acl.824
+ 2024.findings-acl.824
li-etal-2024-towards-demonstration
10.18653/v1/2024.findings-acl.824
@@ -17669,7 +17669,7 @@
Joonsuk Park, University of Richmond
13882-13893
Pre-trained language models (PLMs) exhibit promise in retrieval tasks but struggle with out-of-domain data due to distribution shifts. Addressing this, generative domain adaptation (DA), known as GPL, tackles distribution shifts by generating pseudo queries and labels to train models for predicting query-document relationships in new domains. However, it overlooks the domain distribution, causing the model to struggle with aligning the distribution in the target domain. We, therefore, propose a Distribution-Aware Domain Adaptation (DADA) to guide the model to consider the domain distribution knowledge at the level of both a single document and the corpus, which is referred to as observation-level feedback and domain-level feedback, respectively. Our method effectively adapts the model to the target domain and expands document representation to unseen gold query terms using domain and observation feedback, as demonstrated by empirical results on the BEIR benchmark.
- 2024.findings-acl.825
+ 2024.findings-acl.825
lee-etal-2024-dada
10.18653/v1/2024.findings-acl.825
@@ -17683,7 +17683,7 @@
Tony Mak, Google
13894-13908
While self-correction has shown promise in improving LLM outputs in terms of style and quality (e.g. Chen et al., 2023b; Madaan et al., 2023), recent attempts to self-correct logical or reasoning errors often cause correct answers to become incorrect, resulting in worse performances overall (Huang et al., 2023). In this paper, we show that poor self-correction performance stems from LLMs’ inability to find logical mistakes, rather than their ability to correct a known mistake. Firstly, we benchmark several state-of-the-art LLMs on their mistake-finding ability and demonstrate that they generally struggle with the task, even in highly objective, unambiguous cases. Secondly, we test the correction abilities of LLMs – separately from mistake finding – using a backtracking setup that feeds ground truth mistake location information to the model. We show that this boosts downstream task performance across our 5 reasoning tasks, indicating that LLMs’ correction abilities are robust. Finally, we show that it is possible to obtain mistake location information without ground truth labels or in-domain training data. We train a small classifier with out-of-domain data, which exhibits stronger mistake-finding performance than prompting a large model. We release our dataset of LLM-generated logical mistakes, BIG-Bench Mistake, to enable further research into locating LLM reasoning mistakes.
- 2024.findings-acl.826
+ 2024.findings-acl.826
tyen-etal-2024-llms
10.18653/v1/2024.findings-acl.826
@@ -17700,7 +17700,7 @@
Fabio Massimo Zanzotto, University of Rome Tor Vergata
13909-13920
Understanding textual description to generate code seems to be an achieved capability of instruction-following Large Language Models (LLMs) in a zero-shot scenario. However, there is a severe possibility that this translation ability may be influenced by having seen target textual descriptions and the related code. This effect is known as Data Contamination. In this study, we investigate the impact of Data Contamination on the performance of GPT-3.5 in the Text-to-SQL code-generating tasks. Hence, we introduce a novel method to detect Data Contamination in GPTs and examine GPT-3.5’s Text-to-SQL performances using the known Spider Dataset and our new unfamiliar dataset Termite. Furthermore, we analyze GPT-3.5’s efficacy on databases with modified information via an adversarial table disconnection (ATD) approach, complicating Text-to-SQL tasks by removing structural pieces of information from the database. Our results indicate a significant performance drop in GPT-3.5 on the unfamiliar Termite dataset, even with ATD modifications, highlighting the effect of Data Contamination on LLMs in Text-to-SQL translation tasks.
- 2024.findings-acl.827
+ 2024.findings-acl.827
ranaldi-etal-2024-investigating
10.18653/v1/2024.findings-acl.827
@@ -17714,7 +17714,7 @@
Elena Simperl, King’s College London
13921-13937
Whilst fact verification has attracted substantial interest in the natural language processing community, verifying misinforming statements against data visualizations such as charts has so far been overlooked. Charts are commonly used in the real world to summarize and communicate key information, but they can also be easily misused to spread misinformation and promote certain agendas. In this paper, we introduce ChartCheck, a novel, large-scale dataset for explainable fact-checking against real-world charts, consisting of 1.7k charts and 10.5k human-written claims and explanations. We systematically evaluate ChartCheck using vision-language and chart-to-table models, and propose a baseline to the community. Finally, we study chart reasoning types and visual attributes that pose a challenge to these models.
- 2024.findings-acl.828
+ 2024.findings-acl.828
akhtar-etal-2024-chartcheck
10.18653/v1/2024.findings-acl.828
@@ -17727,7 +17727,7 @@
Maarten Rijke, University of Amsterdam
13938-13946
Entity linking (EL) in conversations faces notable challenges in practical applications, primarily due to the scarcity of entity-annotated conversational datasets and sparse knowledge bases (KB) containing domain-specific, long-tail entities. We designed targeted evaluation scenarios to measure the efficacy of EL models under resource constraints. Our evaluation employs two KBs: Fandom, exemplifying real-world EL complexities, and the widely used Wikipedia. First, we assess EL models’ ability to generalize to a new unfamiliar KB using Fandom and a novel zero-shot conversational entity linking dataset that we curated based on Reddit discussions on Fandom entities. We then evaluate the adaptability of EL models to conversational settings without prior training. Our results indicate that current zero-shot EL models falter when introduced to new, domain-specific KBs without prior training, significantly dropping in performance. Our findings reveal that previous evaluation approaches fall short of capturing real-world complexities for zero-shot EL, highlighting the necessity for new approaches to design and assess conversational EL models to adapt to limited resources. The evaluation framework and dataset proposed are tailored to facilitate this research.
- 2024.findings-acl.829
+ 2024.findings-acl.829
hoveyda-etal-2024-real
10.18653/v1/2024.findings-acl.829
@@ -17746,7 +17746,7 @@
Xiping Hu, Beijing Institute of Technology
13947-13966
Using large language models (LLMs) to assist psychological counseling is a significant but challenging task at present. Attempts have been made on improving empathetic conversations or acting as effective assistants in the treatment with LLMs. However, the existing datasets lack consulting knowledge, resulting in LLMs lacking professional consulting competence. Moreover, how to automatically evaluate multi-turn dialogues within the counseling process remains an understudied area. To bridge the gap, we propose CPsyCoun, a report-based multi-turn dialogue reconstruction and evaluation framework for Chinese psychological counseling. To fully exploit psychological counseling reports, a two-phase approach is devised to construct high-quality dialogues while a comprehensive evaluation benchmark is developed for the effective automatic evaluation of multi-turn psychological consultations. Competitive experimental results demonstrate the effectiveness of our proposed framework in psychological counseling. We open-source the datasets and model for future research.
- 2024.findings-acl.830
+ 2024.findings-acl.830
zhang-etal-2024-cpsycoun
10.18653/v1/2024.findings-acl.830
@@ -17759,7 +17759,7 @@
Tanmoy Chakraborty, Indian Institute of Technology, Delhi
13967-13983
Employing language models to generate explanations for an incoming implicit hate post is an active area of research. The explanation is intended to make explicit the underlying stereotype and aid content moderators. The training often combines top-k relevant knowledge graph (KG) tuples to provide world knowledge and improve performance on standard metrics. Interestingly, our study presents conflicting evidence for the role of the quality of KG tuples in generating implicit explanations. Consequently, simpler models incorporating external toxicity signals outperform KG-infused models. Compared to the KG-based setup, we observe a comparable performance for SBIC (LatentHatred) datasets with a performance variation of +0.44 (+0.49), +1.83 (-1.56), and -4.59 (+0.77) in BLEU, ROUGE-L, and BERTScore. Further human evaluation and error analysis reveal that our proposed setup produces more precise explanations than zero-shot GPT-3.5, highlighting the intricate nature of the task.
- 2024.findings-acl.831
+ 2024.findings-acl.831
yadav-etal-2024-tox
10.18653/v1/2024.findings-acl.831
@@ -17774,7 +17774,7 @@
Tomas Pfister, Google
13984-14011
Large language models (LLMs) have attracted great interest in many real-world applications; however, their “black-box” nature necessitates scalable and faithful explanations. Shapley values have matured as an explainability method for deep learning, but extending them to LLMs is difficult due to long input contexts and autoregressive output generation. We introduce TextGenSHAP, an efficient post-hoc explanation method incorporating LLM-specific techniques, which leads to significant runtime improvements: token-level explanations in minutes not hours, and document-level explanations within seconds. We demonstrate how such explanations can improve end-to-end performance of retrieval augmented generation by localizing important words within long documents and reranking passages collected by retrieval systems. On various open-domain question answering benchmarks, we show that TextGenSHAP improves the retrieval recall and prediction accuracy significantly.
- 2024.findings-acl.832
+ 2024.findings-acl.832
enouen-etal-2024-textgenshap
10.18653/v1/2024.findings-acl.832
@@ -17789,7 +17789,7 @@
Xipeng Qiu, Fudan University
14012-14023
Data plays a fundamental role in the training of Large Language Models (LLMs). While attention has been paid to the collection and composition of datasets, determining the data sampling strategy in training remains an open question. Most LLMs are trained with a simple strategy, random sampling. However, this sampling strategy ignores the unbalanced nature of training data distribution, which can be sub-optimal. In this paper, we propose ClusterClip Sampling to balance the text distribution of training data for better model training. Specifically, ClusterClip Sampling utilizes data clustering to reflect the data distribution of the training set and balances the common samples and rare samples during training based on the cluster results. A repetition clip operation is introduced to mitigate the overfitting issue led by samples from certain clusters. Extensive experiments validate the effectiveness of ClusterClip Sampling, which outperforms random sampling and other cluster-based sampling variants under various training datasets and large language models.
- 2024.findings-acl.833
+ 2024.findings-acl.833
shao-etal-2024-balanced
10.18653/v1/2024.findings-acl.833
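The sampling scheme in the abstract above is simple enough to sketch. Below is a minimal, hypothetical Python illustration (not the authors' implementation): the embedding function, cluster count, sample budget, and repetition cap are all assumptions.

```python
# Hedged sketch of a ClusterClip-style sampler (illustrative only).
import random
from collections import Counter

import numpy as np
from sklearn.cluster import KMeans

def cluster_clip_sample(texts, embed, n_clusters=10, n_samples=1000, max_repeats=4):
    """Draw a training sample balanced across clusters, capping how often
    any single example may repeat (the "repetition clip")."""
    assert n_samples <= len(texts) * max_repeats, "not enough data to sample"
    embeddings = np.stack([embed(t) for t in texts])
    labels = KMeans(n_clusters=n_clusters, n_init="auto").fit_predict(embeddings)
    by_cluster = {c: [i for i, lab in enumerate(labels) if lab == c]
                  for c in range(n_clusters)}
    counts, sampled = Counter(), []
    while len(sampled) < n_samples:
        cluster = random.randrange(n_clusters)    # uniform over clusters, so
        idx = random.choice(by_cluster[cluster])  # rare clusters are upweighted
        if counts[idx] >= max_repeats:            # clip repetitions to curb
            continue                              # overfitting on small clusters
        counts[idx] += 1
        sampled.append(texts[idx])
    return sampled
```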
@@ -17805,7 +17805,7 @@
Xiaoling Wang (East China Normal University)
14024-14040
Generalizing to longer sentences is important for recent Transformer-based language models. Besides algorithms manipulating explicit position features, the success of Transformers without position encodings (NoPE) provides a new way to overcome the challenge. In this paper, we study the length generalization property of NoPE. We find that although NoPE can extend to longer sequences than the commonly used explicit position encodings, it still has a limited context length. We identify a connection between the failure of NoPE’s generalization and the distraction of attention distributions. We propose a parameter-efficient tuning method for searching attention heads’ best temperature hyper-parameters, which substantially expands NoPE’s context size. Experiments on long sequence language modeling, the synthetic passkey retrieval task, and real-world long context tasks show that NoPE can achieve competitive performance with state-of-the-art length generalization algorithms. The source code is publicly accessible.
- 2024.findings-acl.834
+ 2024.findings-acl.834
wang-etal-2024-length
10.18653/v1/2024.findings-acl.834
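The temperature-tuning idea above can be made concrete with a small sketch. This is an illustrative guess at the mechanism, not the paper's code; the per-head parameterization and the decision to train only the temperatures are assumptions.

```python
# Hypothetical per-head attention temperature, the kind of knob tuned to
# extend NoPE's context window (illustrative; not the authors' code).
import math
import torch

def attention_with_temperature(q, k, v, log_temp):
    """q, k, v: [batch, heads, seq, dim]; log_temp: [heads], learnable
    per-head temperatures stored in log space so they stay positive."""
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    temp = log_temp.exp().view(1, -1, 1, 1)         # broadcast over batch/seq
    weights = torch.softmax(scores / temp, dim=-1)  # temp < 1 sharpens attention,
    return weights @ v                              # countering long-range
                                                    # distraction (causal mask
                                                    # omitted for brevity)

# Parameter-efficient: only the temperatures would be trained, e.g.
# log_temp = torch.nn.Parameter(torch.zeros(num_heads))
```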
@@ -17822,7 +17822,7 @@
Min Zhang (Harbin Institute of Technology, Shenzhen)
14041-14055
Motivated by the success of unsupervised neural machine translation (UNMT), we introduce an unsupervised sign language translation and generation network (USLNet), which learns from abundant single-modality (text and video) data without parallel sign language data. USLNet comprises two main components: single-modality reconstruction modules (text and video) that rebuild the input from its noisy version in the same modality, and cross-modality back-translation modules (text-video-text and video-text-video) that reconstruct the input from its noisy version in the other modality using a back-translation procedure. Unlike the single-modality back-translation procedure in text-based UNMT, USLNet faces a cross-modality discrepancy in feature representation, in which the length and feature dimension of text and video sequences do not match. We propose a sliding window method to address the issues of aligning variable-length text with video sequences. To our knowledge, USLNet is the first unsupervised sign language translation and generation model capable of generating both natural language text and sign language video in a unified manner. Experimental results on the BBC-Oxford Sign Language dataset and Open-Domain American Sign Language dataset reveal that USLNet achieves competitive results compared to supervised baseline models, indicating its effectiveness in sign language translation and generation.
- 2024.findings-acl.835
+ 2024.findings-acl.835
guo-etal-2024-unsupervised
10.18653/v1/2024.findings-acl.835
@@ -17837,7 +17837,7 @@
Roberto Navigli (Sapienza University of Rome)
14056-14080
Data scarcity is a prevalent challenge in the era of Large Language Models (LLMs). The insatiable hunger of LLMs for large corpora becomes even more pronounced when dealing with non-English and low-resource languages. The issue is particularly exacerbated in Semantic Parsing (SP), i.e. the task of converting text into a formal representation. The complexity of semantic formalisms makes training human annotators and subsequent data annotation unfeasible on a large scale, especially across languages. To mitigate this, we first introduce the Multilingual Semantic Layer (MSL), a conceptual evolution of previous formalisms, which decouples from disambiguation and external inventories and simplifies the task. MSL provides the necessary tools to encode the meaning across languages, paving the way for developing a high-quality semantic parsing dataset across different languages in a semi-automatic strategy. Subsequently, we manually refine a portion of this dataset and fine-tune GPT-3.5 to propagate these refinements across the dataset. Then, we manually annotate 1,100 sentences in eleven languages, including low-resource ones. Finally, we assess our dataset’s quality, showcasing the performance gap reduction across languages in Semantic Parsing.
- 2024.findings-acl.836
+ 2024.findings-acl.836
martinez-lorenzo-etal-2024-mitigating
10.18653/v1/2024.findings-acl.836
@@ -17852,7 +17852,7 @@
Chenliang Li
14081-14094
In recent years, Large Language Models (LLMs) have demonstrated remarkable capabilities across a wide array of text-centric tasks. However, their ‘large’ scale introduces significant computational and storage challenges, particularly in managing the key-value states of the transformer, which limits their wider applicability. Therefore, we propose to adaptively release resources from caches and rebuild the necessary key-value states. Particularly, we accomplish this with a lightweight controller module that approximates an ideal top-K sparse attention. This module retains the tokens with the highest top-K attention weights and simultaneously rebuilds the discarded but necessary tokens, which may become essential for future decoding. Comprehensive experiments in natural language generation and modeling reveal that our method is not only competitive with full attention in terms of performance but also achieves a significant throughput improvement of up to 221.8%. The code for replication is available at https://github.com/WHUIR/ADORE.
- 2024.findings-acl.837
+ 2024.findings-acl.837
zhang-etal-2024-efficient
10.18653/v1/2024.findings-acl.837
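A toy version of the top-K retention step reads as follows; the paper's lightweight controller and its token-rebuilding mechanism are more involved, so treat this purely as an illustration of the eviction policy.

```python
# Illustrative top-K key-value cache pruning (assumptions throughout; not
# the ADORE implementation).
import torch

def prune_kv_cache(keys, values, attn_mass, k=256):
    """keys/values: [seq, dim]; attn_mass: [seq], cumulative attention each
    cached token has received. Keep the k most-attended tokens, in order."""
    k = min(k, attn_mass.size(0))
    keep = torch.topk(attn_mass, k).indices.sort().values  # preserve positions
    return keys[keep], values[keep], keep  # returned indices would let a
                                           # controller rebuild evicted tokens
                                           # if they become necessary later
```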
@@ -17872,7 +17872,7 @@
Bing Qin (Harbin Institute of Technology)
14095-14113
Despite the impressive performance on information-seeking tasks, large language models (LLMs) still struggle with hallucinations. Attributed LLMs, which augment generated text with in-line citations, demonstrate potential in mitigating hallucinations and improving verifiability. However, current approaches suffer from suboptimal citation quality due to their reliance on in-context learning. Furthermore, the practice of merely citing document identifiers complicates the process for users to pinpoint specific supporting evidence. In this work, we introduce FRONT, a training framework that teaches LLMs to generate Fine-grained grounded citations. FRONT first grounds fine-grained supporting quotes, which then guide the generation process; these quotes not only provide supervision signals to improve citation quality but also serve as fine-grained attributions. Experiments on the ALCE benchmark demonstrate the efficacy of FRONT in generating superior grounded responses and highly supportive citations. With LLaMA-2-7B, the framework significantly outperforms all the baselines, achieving an average of 14.21% improvement in citation quality across all datasets, even surpassing ChatGPT.
- 2024.findings-acl.838
+ 2024.findings-acl.838
huang-etal-2024-learning
10.18653/v1/2024.findings-acl.838
@@ -17884,7 +17884,7 @@
Roberto Navigli (Sapienza University of Rome)
14114-14132
Entity Linking (EL) and Relation Extraction (RE) are fundamental tasks in Natural Language Processing, serving as critical components in a wide range of applications. In this paper, we propose ReLiK, a Retriever-Reader architecture for both EL and RE, where, given an input text, the Retriever module undertakes the identification of candidate entities or relations that could potentially appear within the text. Subsequently, the Reader module is tasked to discern the pertinent retrieved entities or relations and establish their alignment with the corresponding textual spans. Notably, we put forward an innovative input representation that incorporates the candidate entities or relations alongside the text, making it possible to link entities or extract relations in a single forward pass and to fully leverage pre-trained language models’ contextualization capabilities, in contrast with previous Retriever-Reader-based methods, which require a forward pass for each candidate. Our formulation of EL and RE achieves state-of-the-art performance in both in-domain and out-of-domain benchmarks while using an academic training budget and up to 40x faster inference compared to competitors. Finally, we show how our architecture can be used seamlessly for Information Extraction (cIE), i.e. EL + RE, and sets a new state of the art by employing a shared Reader that simultaneously extracts entities and relations.
- 2024.findings-acl.839
+ 2024.findings-acl.839
orlando-etal-2024-relik
10.18653/v1/2024.findings-acl.839
@@ -17897,7 +17897,7 @@
Jing Jiang (Singapore Management University)
14133-14147
In Conversational Intent Discovery (CID), Small Language Models (SLMs) struggle with overfitting to familiar intents and fail to label newly discovered ones. This issue stems from their limited grasp of semantic nuances and their intrinsically discriminative framework. Therefore, we propose Synergizing Large Language Models (LLMs) with pre-trained SLMs for CID (SynCID). It harnesses the profound semantic comprehension of LLMs alongside the operational agility of SLMs. By utilizing LLMs to refine both utterances and existing intent labels, SynCID significantly enhances the semantic depth, subsequently realigning these enriched descriptors within the SLMs’ feature space to correct cluster distortion and promote robust learning of representations. A key advantage is its capacity for the early identification of new intents, a critical aspect for deploying conversational agents successfully. Additionally, SynCID leverages the in-context learning strengths of LLMs to generate labels for new intents. Thorough evaluations across a wide array of datasets have demonstrated its superior performance over traditional CID methods.
- 2024.findings-acl.840
+ 2024.findings-acl.840
liang-etal-2024-synergizing
10.18653/v1/2024.findings-acl.840
@@ -17908,8 +17908,8 @@
Karim Ghonim (University of Roma “La Sapienza”)
Roberto Navigli (Sapienza University of Rome)
14148-14161
- Recent advancements in text summarization, particularly with the advent of Large Language Models (LLMs), have shown remarkable performance. However, a notable challenge persists as a substantial number of automatically-generated summaries exhibit factual inconsistencies, such as hallucinations. In response to this issue, various approaches for the evaluation of consistency for summarization have emerged. Yet, these newly-introduced metrics face several limitations, including lack of interpretability, focus on short document summaries (e.g., news articles), and computational impracticality, especially for LLM-based metrics. To address these shortcomings, we propose Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction (FENICE), a more interpretable and efficient factuality-oriented metric. FENICE leverages an NLI-based alignment between information in the source document and a set of atomic facts, referred to as claims, extracted from the summary. Our metric sets a new state of the art on AGGREFACT, the de-facto benchmark for factuality evaluation. Moreover, we extend our evaluation to a more challenging setting by conducting a human annotation process of long-form summarization. In the hope of fostering research in summarization factuality evaluation, we release the code of our metric and our factuality annotations of long-form summarization at https://github.com/Babelscape/FENICE.
- 2024.findings-acl.841
+ Recent advancements in text summarization, particularly with the advent of Large Language Models (LLMs), have shown remarkable performance. However, a notable challenge persists as a substantial number of automatically-generated summaries exhibit factual inconsistencies, such as hallucinations. In response to this issue, various approaches for the evaluation of consistency for summarization have emerged. Yet, these newly-introduced metrics face several limitations, including lack of interpretability, focus on short document summaries (e.g., news articles), and computational impracticality, especially for LLM-based metrics. To address these shortcomings, we propose Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction (FENICE), a more interpretable and efficient factuality-oriented metric. FENICE leverages an NLI-based alignment between information in the source document and a set of atomic facts, referred to as claims, extracted from the summary. Our metric sets a new state of the art on AGGREFACT, the de-facto benchmark for factuality evaluation. Moreover, we extend our evaluation to a more challenging setting by conducting a human annotation process of long-form summarization. In the hope of fostering research in summarization factuality evaluation, we release the code of our metric and our factuality annotations of long-form summarization at anonymizedurl.
+ 2024.findings-acl.841
scire-etal-2024-fenice
10.18653/v1/2024.findings-acl.841
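The claim-level NLI alignment that FENICE builds on can be approximated with off-the-shelf components. The sketch below is a hedged approximation, not the released metric: the NLI model choice, the max-over-sentences alignment, and the averaging are assumptions.

```python
# Hedged sketch of NLI-based claim verification in the spirit of FENICE.
from transformers import pipeline

# An off-the-shelf NLI model; the paper's models and claim extractor differ.
nli = pipeline("text-classification", model="roberta-large-mnli")

def summary_factuality(source_sentences, claims):
    """Score each summary claim by its best entailment probability over
    source sentences, then average; higher means more faithful."""
    scores = []
    for claim in claims:
        best = 0.0
        for sent in source_sentences:
            out = nli({"text": sent, "text_pair": claim}, top_k=None)
            ent = next(o["score"] for o in out if o["label"] == "ENTAILMENT")
            best = max(best, ent)
        scores.append(best)
    return sum(scores) / len(scores)
```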
@@ -17924,7 +17924,7 @@
Zhichao Lu
14162-14167
Recently, the self-consistency decoding strategy has shown the ability to improve performance for complex reasoning tasks with large language models (LLMs). However, the costs may be high because the sampling process of the strategy generates some low-probability text, resulting in low-quality reasoning paths. As a consequence, it requires a relatively large sampling number to obtain good aggregation performance. In this paper, we propose an alternative strategy, self-para-consistency. It first generates multiple paraphrases for each test question, then generates reasoning paths for the original and all the paraphrased questions based on greedy decoding, and finally selects the most consistent answer. Since all the candidate paths have relatively high probabilities, the sampling number could be much smaller than that of the self-consistency strategy. Extensive experiments on complex reasoning datasets demonstrate the effectiveness of our method in reducing the sampling number.
- 2024.findings-acl.842
+ 2024.findings-acl.842
chen-etal-2024-self-para
10.18653/v1/2024.findings-acl.842
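The strategy described above maps onto a few lines of orchestration code. In this hypothetical sketch, `llm` stands for any greedy (temperature-0) completion function, and the prompts and answer parsing are placeholders.

```python
# Hypothetical orchestration of self-para-consistency (not the authors' code).
from collections import Counter

def extract_final_answer(text):
    return text.strip().splitlines()[-1]  # naive placeholder parser

def self_para_consistency(llm, question, n_paraphrases=3):
    # A real system would request n distinct paraphrases in one prompt;
    # one greedy call per paraphrase is shown here for brevity.
    paraphrases = [
        llm(f"Give paraphrase #{i + 1} of this question: {question}")
        for i in range(n_paraphrases)
    ]
    answers = [
        llm(f"Q: {q}\nThink step by step, then give the final answer.\nA:")
        for q in [question] + paraphrases
    ]
    finals = [extract_final_answer(a) for a in answers]
    return Counter(finals).most_common(1)[0][0]  # most consistent answer wins
```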
@@ -17935,7 +17935,7 @@
Jan Snajder (UniZg-FER, University of Zagreb)
14168-14181
Pre-trained language models based on masked language modeling (MLM) excel in natural language understanding (NLU) tasks. While fine-tuned MLM-based encoders consistently outperform causal language modeling decoders of comparable size, recent decoder-only large language models (LLMs) perform on par with smaller MLM-based encoders. Although their performance improves with scale, LLMs fall short of achieving state-of-the-art results in information extraction (IE) tasks, many of which are formulated as sequence labeling (SL). We hypothesize that LLMs’ poor SL performance stems from causal masking, which prevents the model from attending to tokens on the right of the current token. Yet, how exactly and to what extent LLMs’ performance on SL can be improved remains unclear. We explore techniques for improving the SL performance of open LLMs on IE tasks by applying layer-wise removal of the causal mask (CM) during LLM fine-tuning. This approach yields performance gains competitive with state-of-the-art SL models, matching or outperforming the results of CM removal from all blocks. Our findings hold for diverse SL tasks, demonstrating that open LLMs with layer-dependent CM removal outperform strong MLM-based encoders and even instruction-tuned LLMs.
- 2024.findings-acl.843
+ 2024.findings-acl.843
dukic-snajder-2024-looking
10.18653/v1/2024.findings-acl.843
@@ -17947,7 +17947,7 @@
Taro Watanabe (Nara Institute of Science and Technology, Japan)
14182-14214
It is very challenging to curate a dataset for language-specific knowledge and common sense in order to evaluate natural language understanding capabilities of language models. Due to the limited availability of annotators, most current multilingual datasets are created through translation, which cannot evaluate such language-specific aspects. Therefore, we propose Multilingual CommonsenseQA (mCSQA), based on the construction process of CSQA but leveraging language models for a more efficient construction, e.g., by asking the LM to generate questions/answers, refine answers, and verify QAs, followed by reduced human effort for verification. The constructed dataset is a benchmark for the cross-lingual language-transfer capabilities of multilingual LMs, and experimental results showed high language-transfer capabilities for questions that LMs could easily solve, but lower transfer capabilities for questions requiring deep knowledge or commonsense. This highlights the necessity of language-specific datasets for evaluation and training. Finally, our method demonstrated that multilingual LMs could create QA including language-specific knowledge, significantly reducing the dataset creation cost compared to manual creation. The datasets are available at https://huggingface.co/datasets/yusuke1997/mCSQA.
- 2024.findings-acl.844
+ 2024.findings-acl.844
sakai-etal-2024-mcsqa
10.18653/v1/2024.findings-acl.844
@@ -17961,7 +17961,7 @@
Wan Guanglu (Meituan)
14215-14231
Syntactically Controlled Paraphrase Generation (SCPG), which aims at generating sentences having syntactic structures resembling given exemplars, has attracted more research effort in recent years. We conducted an empirical survey of previous SCPG datasets and methods and found three tacitly accepted but seldom mentioned intrinsic shortcomings/trade-offs in terms of data obtaining, task formulation, and pre-training strategies. As a mitigation of these shortcomings, we propose a novel Dual-Stage Multi-Task (DSMT) pre-training scheme, involving a series of structure-oriented and syntax-oriented tasks, which, in our opinion, gives sequential text models the ability to comprehend intrinsically non-sequential structures like Linearized Constituency Trees (LCTs), to understand the underlying syntax, and even to generate LCTs by parsing sentences. We performed further pre-training of the popular T5 model on these novel tasks and fine-tuned the trained model on every variant of the SCPG task in the literature, finding that our models significantly outperformed (up to 10+ BLEU-4) previous state-of-the-art methods. Finally, we carried out ablation studies which demonstrated the effectiveness of our DSMT methods and emphasized the SCPG performance gains compared to vanilla T5 models, especially on hard samples or under few-shot settings.
- 2024.findings-acl.845
+ 2024.findings-acl.845
liu-etal-2024-dual
10.18653/v1/2024.findings-acl.845
@@ -17975,19 +17975,19 @@
Min Zhang (Harbin Institute of Technology, Shenzhen)
14232-14244
Large Language Models (LLMs) have demonstrated an impressive capability known as In-context Learning (ICL), which enables them to acquire knowledge from textual demonstrations without the need for parameter updates. However, many studies have highlighted that the model’s performance is sensitive to the choice of demonstrations, presenting a significant challenge for practical applications where we lack prior knowledge of user queries. Consequently, we need to construct an extensive demonstration pool and incorporate external databases to assist the model, leading to considerable time and financial costs. In light of this, some recent research has shifted focus towards zero-shot ICL, aiming to reduce the model’s reliance on external information by leveraging their inherent generative capabilities. Despite the effectiveness of these approaches, the content generated by the model may be unreliable, and the generation process is time-consuming. To address these issues, we propose Demonstration Augmentation for In-context Learning (DAIL), which employs the model’s previously predicted historical samples as demonstrations for subsequent ones. DAIL brings no additional inference cost and does not rely on the model’s generative capabilities. Our experiments reveal that DAIL can significantly improve the model’s performance over direct zero-shot inference and can even outperform few-shot ICL without any external information.
- 2024.findings-acl.846
+ 2024.findings-acl.846
su-etal-2024-demonstration
10.18653/v1/2024.findings-acl.846
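Since DAIL's core loop is procedural, a short sketch may help; the prompt format, demonstration cap, and `llm` callable below are placeholders rather than the paper's setup.

```python
# Sketch of DAIL-style demonstration augmentation (illustrative assumptions).
def dail_predict(llm, inputs, max_demos=8):
    """Zero-shot ICL where each prediction is recycled as a demonstration
    for later inputs."""
    history, predictions = [], []
    for x in inputs:
        demos = "\n".join(f"Input: {d}\nLabel: {y}" for d, y in history[-max_demos:])
        prompt = (demos + "\n" if demos else "") + f"Input: {x}\nLabel:"
        y = llm(prompt).strip()
        history.append((x, y))   # no generation step and no external pool:
        predictions.append(y)    # the prediction itself becomes a demo
    return predictions
```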
Pushing the Limits of Zero-shot End-to-End Speech Translation
Ioannis Tsiamas (Apple and Universidad Politécnica de Cataluna)
- Gerard I. Gállego (Universidad Politécnica de Cataluna)
+ Gerard Gállego (Universidad Politécnica de Cataluna)
José Fonollosa (Universitat Politècnica de Catalunya)
Marta Costa-jussà (Meta)
14245-14267
Data scarcity and the modality gap between the speech and text modalities are two major obstacles of end-to-end Speech Translation (ST) systems, thus hindering their performance. Prior work has attempted to mitigate these challenges by leveraging external MT data and optimizing distance metrics that bring closer the speech-text representations. However, achieving competitive results typically requires some ST data. For this reason, we introduce ZeroSwot, a method for zero-shot ST that bridges the modality gap without any paired ST data. Leveraging a novel CTC compression and Optimal Transport, we train a speech encoder using only ASR data, to align with the representation space of a massively multilingual MT model. The speech encoder seamlessly integrates with the MT model at inference, enabling direct translation from speech to text, across all languages supported by the MT model. Our experiments show that we can effectively close the modality gap without ST data, while our results on MuST-C and CoVoST demonstrate our method’s superiority over not only previous zero-shot models, but also supervised ones, achieving state-of-the-art results.
- 2024.findings-acl.847
+ 2024.findings-acl.847
tsiamas-etal-2024-pushing
10.18653/v1/2024.findings-acl.847
@@ -18001,7 +18001,7 @@
Ruifeng Xu (Harbin Institute of Technology)
14268-14290
Numeral systems and units of measurement are two conjoined topics in activities of human beings and have mutual effects with the languages expressing them. Currently, the evaluation of Large Language Models (LLMs) often involves mathematical reasoning, yet little attention is given to how minor changes in numbers or units can drastically alter the complexity of problems and the performance of LLMs. In this paper, we scrutinize existing LLMs on the processing of numerals and units of measurement by constructing datasets with perturbations. We first anatomize the reasoning of math word problems into different sub-procedures, such as numeral conversions from language to numbers and measurement conversions based on units. Then we further annotate math word problems from ancient Chinese arithmetic works, which are challenging in numerals and units of measurement. Experiments on perturbed datasets demonstrate that LLMs still encounter difficulties in handling numeral and measurement conversions.
- 2024.findings-acl.848
+ 2024.findings-acl.848
xu-etal-2024-numcot
10.18653/v1/2024.findings-acl.848
@@ -18017,7 +18017,7 @@
Ravi Kokku
14291-14307
In an ever-expanding world of domain-specific knowledge, the increasing complexity of consuming and storing information necessitates the generation of summaries from large information repositories. However, every persona in a domain has different information requirements, and hence different summarization needs. For example, in the healthcare domain, a persona-based (such as Doctor, Nurse, Patient etc.) approach is imperative to deliver targeted medical information efficiently. Persona-based summarization of domain-specific information by humans is a high cognitive load task and is generally not preferred. The summaries generated by two different humans have high variability and do not scale in cost and subject matter expertise as domains and personas grow. Further, AI-generated summaries using generic Large Language Models (LLMs) may not necessarily offer satisfactory accuracy for different domains unless they have been specifically trained on domain-specific data and can also be very expensive to use in day-to-day operations. Our contribution in this paper is two-fold: 1) We present an approach to efficiently fine-tune a domain-specific small foundation LLM using a healthcare corpus and also show that we can effectively evaluate the summarization quality using AI-based critiquing. 2) We further show that AI-based critiquing has good concordance with human-based critiquing of the summaries. Hence, such AI-based pipelines to generate domain-specific persona-based summaries can be easily scaled to other domains such as legal, enterprise documents, education etc. in a very efficient and cost-effective manner.
- 2024.findings-acl.849
+ 2024.findings-acl.849
mullick-etal-2024-persona
10.18653/v1/2024.findings-acl.849
@@ -18028,7 +18028,7 @@
Danish Pruthi (Indian Institute of Science, Bangalore)
14308-14331
As corporations rush to integrate large language models (LLMs), it is critical that they provide factually accurate information that is robust to any presuppositions a user may express. In this work, we introduce UPHILL, a dataset consisting of health-related queries with varying degrees of presuppositions. Using UPHILL, we evaluate the factual accuracy and consistency of InstructGPT, ChatGPT, GPT-4 and Bing Copilot models. We find that while model responses rarely contradict true health claims (posed as questions), all investigated models fail to challenge false claims. Alarmingly, responses from these models agree with 23-32% of existing false claims and 49-55% of novel fabricated claims. As we increase the extent of presupposition in input queries, responses from all models except Bing Copilot agree with the claim considerably more often, regardless of its veracity. Given the moderate factual accuracy, and the inability of models to challenge false assumptions, our work calls for a careful assessment of current LLMs for use in high-stakes scenarios.
- 2024.findings-acl.850
+ 2024.findings-acl.850
kaur-etal-2024-evaluating
10.18653/v1/2024.findings-acl.850
@@ -18042,7 +18042,7 @@
Roberto Navigli (Sapienza University of Rome)
14332-14347
Word Sense Disambiguation (WSD) is the task of associating a word in a given context with its most suitable meaning among a set of possible candidates. While the task has recently witnessed renewed interest, with systems achieving performances above the estimated inter-annotator agreement, at the time of writing it still struggles to find downstream applications. We argue that one of the reasons behind this is the difficulty of applying WSD to plain text. Indeed, in the standard formulation, models work under the assumptions that a) all the spans to disambiguate have already been identified, and b) all the possible candidate senses of each span are provided, both of which are requirements that are far from trivial. In this work, we present a new task called Word Sense Linking (WSL) where, given an input text and a reference sense inventory, systems have to both identify which spans to disambiguate and then link them to their most suitable meaning. We put forward a transformer-based architecture for the task and thoroughly evaluate both its performance and those of state-of-the-art WSD systems scaled to WSL, iteratively relaxing the assumptions of WSD. We hope that our work will foster easier integration of lexical semantics into downstream applications.
- 2024.findings-acl.851
+ 2024.findings-acl.851
bejgu-etal-2024-word
10.18653/v1/2024.findings-acl.851
@@ -18053,7 +18053,7 @@
Ivan Titov (University of Edinburgh and University of Amsterdam)
14348-14366
Memorisation is a natural part of learning from real-world data: neural models pick up on atypical input-output combinations and store those training examples in their parameter space. That this happens is well-known, but how and where are questions that remain largely unanswered. Given a multi-layered neural model, where does memorisation occur in the millions of parameters? Related work reports conflicting findings: a dominant hypothesis based on image classification is that lower layers learn generalisable features and that deeper layers specialise and memorise. Work from NLP suggests this does not apply to language models, but has been mainly focused on memorisation of facts. We expand the scope of the localisation question to 12 natural language classification tasks and apply 4 memorisation localisation techniques. Our results indicate that memorisation is a gradual process rather than a localised one, establish that memorisation is task-dependent, and give nuance to the “generalisation first, memorisation second” hypothesis.
- 2024.findings-acl.852
+ 2024.findings-acl.852
dankers-titov-2024-generalisation
10.18653/v1/2024.findings-acl.852
@@ -18067,7 +18067,7 @@
Jinan Xu (Beijing Jiaotong University)
14367-14378
Temporal knowledge graph reasoning has emerged as a crucial task for answering time-dependent questions within a knowledge graph (KG). Despite tremendous progress, the present research is impeded by the sparsity of a temporal KG and an over-reliance on simple single-relational reasoning patterns. To overcome these challenges, we introduce MulQuestions, a new temporal KG reasoning benchmark featuring over 200k entities and 960k questions designed to facilitate complex, multi-relational and multi-hop reasoning. Additionally, we propose a new model adept at conducting pattern-aware and time-sensitive reasoning across temporal KGs. The model’s efficacy is confirmed through rigorous evaluations, showcasing its effectiveness in sparse data conditions and adeptness at handling questions with long reasoning chains. We have made our benchmark and model publicly accessible at [https://anonymous].
- 2024.findings-acl.853
+ 2024.findings-acl.853
liu-etal-2024-towards-multi
10.18653/v1/2024.findings-acl.853
@@ -18082,7 +18082,7 @@
Yiqun Liu (Tsinghua University)
14379-14391
Hallucinations in large language models (LLMs) refer to the phenomenon of LLMs producing responses that are coherent yet factually inaccurate. This issue undermines the effectiveness of LLMs in practical applications, necessitating research into detecting and mitigating hallucinations of LLMs. Previous studies have mainly concentrated on post-processing techniques for hallucination detection, which tend to be computationally intensive and limited in effectiveness due to their separation from the LLM’s inference process. To overcome these limitations, we introduce MIND, an unsupervised training framework that leverages the internal states of LLMs for real-time hallucination detection without requiring manual annotations. Additionally, we present HELM, a new benchmark for evaluating hallucination detection across multiple LLMs, featuring diverse LLM outputs and the internal states of LLMs during their inference process. Our experiments demonstrate that MIND outperforms existing state-of-the-art methods in hallucination detection.
- 2024.findings-acl.854
+ 2024.findings-acl.854
su-etal-2024-unsupervised
10.18653/v1/2024.findings-acl.854
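Training a lightweight probe on internal states, as MIND does, can be illustrated in a few lines. The feature pooling, pseudo-labels, and logistic-regression probe below are assumptions for illustration, not the paper's architecture.

```python
# Minimal sketch of a hidden-state hallucination probe (assumptions only).
import numpy as np
from sklearn.linear_model import LogisticRegression

def train_hallucination_probe(hidden_states, labels):
    """hidden_states: [n, d] pooled internal states collected while the LLM
    generates; labels: 1 = hallucinated, 0 = faithful (pseudo-labels in an
    unsupervised setup). Returns a lightweight real-time detector."""
    probe = LogisticRegression(max_iter=1000)
    probe.fit(np.asarray(hidden_states), np.asarray(labels))
    return probe  # probe.predict_proba(states)[:, 1] scores new generations
```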
@@ -18093,7 +18093,7 @@
Weiming Lu (Zhejiang University)
14392-14402
Understanding sentiment is arguably an advanced and important capability of AI agents in the physical world. In previous works, many efforts have been devoted to individual sentiment subtasks, without considering interrelated sentiment knowledge among these subtasks. Although some recent works model multiple sentiment subtasks in a unified manner, they simply combine these subtasks without deeply exploring the hierarchical relationships among them. In this paper, we introduce GSA-7B, an open-source large language model specific to the sentiment domain. Specifically, we deeply explore the hierarchical relationships between sentiment subtasks, proposing a progressive sentiment reasoning benchmark and progressive task instructions. Subsequently, we use Llama2-7B as the backbone model and propose a parameter-efficient progressive tuning paradigm, implemented by constructing a chain of LoRA modules, resulting in the creation of GSA-7B. Experimental results show that GSA-7B as a unified model performs well across all datasets in the progressive sentiment reasoning benchmark. Additionally, under the few-shot setting, GSA-7B also exhibits good generalization ability for sentiment subtasks and datasets that were not encountered during its training phase.
- 2024.findings-acl.855
+ 2024.findings-acl.855
hou-etal-2024-progressive
10.18653/v1/2024.findings-acl.855
@@ -18107,7 +18107,7 @@
Khoa Doan (VinUniversity)
14403-14421
Despite outstanding performance in a variety of Natural Language Processing (NLP) tasks, recent studies have revealed that NLP models are vulnerable to adversarial attacks that slightly perturb the input to cause the models to misbehave. Several attacks can even compromise the model without requiring access to the model architecture or model parameters (i.e., a black-box setting), and thus are detrimental to existing NLP applications. To perform these attacks, the adversary queries the victim model many times to determine the most important parts of an input text and transform them. In this work, we propose a lightweight and attack-agnostic defense whose main goal is to perplex the process of generating an adversarial example in these query-based black-box attacks; that is, to fool the textual fooler. This defense, named AdvFooler, works by randomizing the latent representation of the input at inference time. Different from existing defenses, AdvFooler does not necessitate additional computational overhead during training, nor does it rely on assumptions about the potential adversarial perturbation set, while having a negligible impact on the model’s accuracy. Our theoretical and empirical analyses highlight the significance of robustness resulting from confusing the adversary via randomizing the latent space, as well as the impact of randomization on clean accuracy. Finally, we empirically demonstrate near state-of-the-art robustness of AdvFooler against representative adversarial attacks on two benchmark datasets.
- 2024.findings-acl.856
+ 2024.findings-acl.856
hoang-etal-2024-fooling
10.18653/v1/2024.findings-acl.856
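Latent randomization at inference time is easy to prototype in PyTorch with a forward hook. The layer choice and noise scale below are assumptions; AdvFooler's exact randomization scheme may differ.

```python
# Hedged sketch of inference-time latent randomization.
import torch

def add_latent_noise(module, noise_scale=0.01):
    """Register a hook that perturbs a layer's output at inference time, so
    every query sees a freshly randomized latent space."""
    def hook(mod, inputs, output):
        hidden = output[0] if isinstance(output, tuple) else output
        noisy = hidden + noise_scale * torch.randn_like(hidden)
        return (noisy,) + output[1:] if isinstance(output, tuple) else noisy
    return module.register_forward_hook(hook)

# handle = add_latent_noise(model.encoder.layer[6])  # hypothetical layer path
# ... serve queries; an attacker's repeated probes now see noisy scores ...
# handle.remove()
```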
@@ -18119,7 +18119,7 @@
Pushpak Bhattacharyya (Indian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology)
14422-14431
Modern natural language processing (NLP) systems thrive when given access to large datasets. However, a large fraction of the world’s languages are not privy to such benefits due to sparse documentation and inadequate digital representation. This is especially true for Indian regional languages. As a first step towards expanding the reach of NLP technologies to extremely low-resource Indian languages, we present a new parallel part-of-speech (POS) evaluation dataset for Angika, Magahi, Bhojpuri and Hindi. Angika, Magahi, Bhojpuri, along with the more well-known Hindi, are all languages spoken in the Indian states of Bihar, Jharkhand and West Bengal. Ours is notably the first NLP resource, even for a shallow NLP task like POS-tagging, for Angika. We establish POS-tagging baselines using state-of-the-art multilingual pretrained language models (PLMs) finetuned on Hindi data, and show zero-shot evaluations on the other three languages. While all four languages use the same Devanagari script, pretrained tokenizers underperform in zero-shot on the three languages. We propose a simple look-back fix to address the tokenization challenge yielding F1-score improvements of up to 8% on Angika and show how it comes very close to an oracle setting when the underlying Hindi word is known (and can be accurately tokenized).
- 2024.findings-acl.857
+ 2024.findings-acl.857
kumar-etal-2024-part
10.18653/v1/2024.findings-acl.857
@@ -18134,7 +18134,7 @@
Cecilia Zhao (University of Macau, New York University and Ohio State University, Columbus)
14432-14447
Pre-trained Language Models (PLMs) have shown impressive results in various Natural Language Generation (NLG) tasks, such as powering chatbots and generating stories. However, an ethical concern arises due to their potential to produce verbatim copies of paragraphs from their training data. This is problematic as PLMs are trained on corpora constructed by human authors. As such, there is a pressing need for research to promote the generation of original content by these models. In this study, we introduce a unique “self-plagiarism” contrastive decoding strategy, aimed at boosting the originality of text produced by PLMs. Our method entails modifying prompts in LLMs to develop an amateur model and a professional model. Specifically, the amateur model is urged to plagiarize using three plagiarism templates we have designed, while the professional model maintains its standard language model status. This strategy employs prompts to stimulate the model’s capacity to identify non-original candidate token combinations and subsequently impose penalties. The application of this strategy is integrated prior to the model’s final layer, ensuring smooth integration with most existing PLMs (T5, GPT, LLaMA) without necessitating further adjustments. Implementing our strategy, we noted a significant decline in non-original sequences comprised of more than three words in the academic AASC dataset and the story-based ROCStories dataset. Source code and scripts will be released after the paper’s acceptance and publication.
- 2024.findings-acl.858
+ 2024.findings-acl.858
lan-etal-2024-focus
10.18653/v1/2024.findings-acl.858
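The contrastive-decoding step described above combines two logit streams from the same model under different prompts. A toy version follows; the penalty weight and the plagiarism template are illustrative assumptions.

```python
# Toy "self-plagiarism" contrastive decoding step (illustrative only).
import torch

def self_plagiarism_logits(pro_logits, amateur_logits, alpha=0.5):
    """pro_logits: next-token logits from the normally prompted model;
    amateur_logits: logits from the same model under a plagiarism-inducing
    prompt. Penalizing the amateur's preferences steers decoding away from
    verbatim copies; alpha is an assumed penalty weight."""
    return pro_logits - alpha * amateur_logits

# amateur_logits would come from the same LM prefixed with a plagiarism
# template, e.g. "Copy a sentence you have seen verbatim: ..."
# next_token = torch.argmax(self_plagiarism_logits(p, a))
```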
@@ -18146,7 +18146,7 @@
Taihao Li (Zhejiang Lab)
14448-14458
This paper investigates unsupervised multimodal domain adaptation for multimodal emotion recognition, which is a solution for data scarcity yet remains understudied. Due to the varying distribution discrepancies of different modalities between source and target domains, the primary challenge lies in how to balance the domain alignment across modalities to guarantee they are all well aligned. To achieve this, we first develop our model based on the information bottleneck theory to learn optimal representation for each modality independently. Then, we align the domains via matching the label distributions and the representations. In order to balance the representation alignment, we propose to minimize a surrogate of the alignment losses, which is equivalent to adaptively adjusting the weights of the modalities throughout training, thus achieving balanced domain alignment across modalities. Overall, the proposed approach features Adaptively modality-balanced domain adaptation, dubbed Amanda, for multimodal emotion recognition. Extensive empirical results on commonly used benchmark datasets demonstrate that Amanda significantly outperforms competing approaches. The code is available at https://github.com/sunjunaimer/Amanda.
- 2024.findings-acl.859
+ 2024.findings-acl.859
zhang-etal-2024-amanda
10.18653/v1/2024.findings-acl.859
@@ -18158,7 +18158,7 @@
Florian Matthes (Technische Universität München)
14459-14469
In recent years, Large Language Models (LLMs) have demonstrated an impressive ability to encode knowledge during pre-training on large text corpora. They can leverage this knowledge for downstream tasks like question answering (QA), even in complex areas involving health topics. Considering their high potential for facilitating clinical work in the future, understanding the quality of encoded medical knowledge and its recall in LLMs is an important step forward. In this study, we examine the capability of LLMs to exhibit medical knowledge recall by constructing a novel dataset derived from systematic reviews – studies synthesizing evidence-based answers for specific medical questions. Through experiments on the new MedREQAL dataset, comprising question-answer pairs extracted from rigorous systematic reviews, we assess six LLMs, such as GPT and Mixtral, analyzing their classification and generation performance. Our experimental insights into LLM performance on the novel biomedical QA dataset reveal the still challenging nature of this task.
- 2024.findings-acl.860
+ 2024.findings-acl.860
vladika-etal-2024-medreqal
10.18653/v1/2024.findings-acl.860
@@ -18174,7 +18174,7 @@
Agha Ali Raza (Lahore University of Management Sciences)
14470-14480
Deepfakes, particularly in the auditory domain, have become a significant threat, necessitating the development of robust countermeasures. This paper addresses the escalating challenges posed by deepfake attacks on Automatic Speaker Verification (ASV) systems. We present a novel Urdu deepfake audio dataset for deepfake detection, focusing on two spoofing attacks – Tacotron and VITS TTS. The dataset construction involves careful consideration of phonemic cover and balance and comparison with existing corpora like PRUS and PronouncUR. Evaluation with the AASIST-L model shows EERs of 0.495 and 0.524 for VITS TTS and Tacotron-generated audios, respectively, with variability across speakers. Further, this research implements a detailed human evaluation, incorporating a user study to gauge whether people are able to discern deepfake audios from real (bonafide) audios. The ROC curve analysis shows an area under the curve (AUC) of 0.63, indicating that individuals demonstrate a limited ability to detect deepfakes (approximately 1 in 3 fake audio samples are regarded as real). Our work contributes a valuable resource for training deepfake detection models in low-resource languages like Urdu, addressing a critical gap in existing datasets. The dataset is publicly available at: https://github.com/CSALT-LUMS/urdu-deepfake-dataset.
- 2024.findings-acl.861
+ 2024.findings-acl.861
munir-etal-2024-deepfake
10.18653/v1/2024.findings-acl.861
@@ -18185,7 +18185,7 @@
Laura Perez-Beltrachini (University of Edinburgh)
14481-14497
Synthetically created Cross-Lingual Summarisation (CLS) datasets are prone to include document-summary pairs where the reference summary is unfaithful to the corresponding document as it contains content not supported by the document (i.e., hallucinated content). This low data quality misleads model learning and obscures evaluation results. Automatic ways to assess hallucinations and improve training have been proposed for monolingual summarisation, predominantly in English. For CLS, we propose to use off-the-shelf cross-lingual Natural Language Inference (X-NLI) to evaluate faithfulness of reference and model generated summaries. Then, we study training approaches that are aware of faithfulness issues in the training data and propose an approach that uses unlikelihood loss to teach a model about unfaithful summary sequences. Our results show that it is possible to train CLS models that yield more faithful and at the same time informative summaries.
- 2024.findings-acl.862
+ 2024.findings-acl.862
zhang-perez-beltrachini-2024-leveraging
10.18653/v1/2024.findings-acl.862
@@ -18201,7 +18201,7 @@
Min Zhang (Harbin Institute of Technology, Shenzhen)
14498-14511
In the field of information extraction (IE), tasks across a wide range of modalities and their combinations have been traditionally studied in isolation, leaving a gap in deeply recognizing and analyzing cross-modal information. To address this, this work for the first time introduces the concept of grounded Multimodal Universal Information Extraction (MUIE), providing a unified task framework to analyze any IE tasks over various modalities, along with their fine-grained groundings. To tackle MUIE, we tailor a multimodal large language model (MLLM), Reamo, capable of extracting and grounding information from all modalities, i.e., recognizing everything from all modalities at once. Reamo is updated via varied tuning strategies, equipping it with powerful capabilities for information recognition and fine-grained multimodal grounding. To address the absence of a suitable benchmark for grounded MUIE, we curate a high-quality, diverse, and challenging test set, which encompasses IE tasks across 9 common modality combinations with the corresponding multimodal groundings. The extensive comparison of Reamo with existing MLLMs integrated into pipeline approaches demonstrates its advantages across all evaluation dimensions, establishing a strong benchmark for the follow-up research. Our resources are publicly released at https://haofei.vip/MUIE.
- 2024.findings-acl.863
+ 2024.findings-acl.863
zhang-etal-2024-recognizing
10.18653/v1/2024.findings-acl.863
@@ -18220,7 +18220,7 @@
Yunchao Wei (Beijing Jiaotong University)
14512-14531
The remarkable multimodal capabilities demonstrated by OpenAI’s GPT-4 have sparked significant interest in the development of multimodal Large Language Models (LLMs). A primary research objective of such models is to align visual and textual modalities effectively while comprehending human instructions. Current methodologies often rely on annotations derived from benchmark datasets to construct image-dialogue datasets for training purposes, akin to instruction tuning in LLMs. However, these datasets often exhibit domain bias, potentially constraining the generative capabilities of the models. In an effort to mitigate these limitations, we propose a novel data collection methodology that synchronously synthesizes images and dialogues for visual instruction tuning. This approach harnesses the power of generative models, marrying the abilities of ChatGPT and text-to-image generative models to yield a diverse and controllable dataset with varied image content. This not only provides greater flexibility compared to existing methodologies but also significantly enhances several model capabilities. Our research includes comprehensive experiments conducted on various datasets using the open-source LLAVA model as a testbed for our proposed pipeline. Our results underscore marked enhancements across more than ten commonly assessed capabilities.
- 2024.findings-acl.864
+ 2024.findings-acl.864
li-etal-2024-enhanced
10.18653/v1/2024.findings-acl.864
@@ -18236,7 +18236,7 @@
Yohei Oseki (University of Tokyo)
14532-14550
The imitation of the children’s language acquisition process has been explored to make language models (LMs) more efficient. In particular, errors caused by children’s regularization (so-called overregularization, e.g., using wroted for the past tense of write) have been widely studied to reveal the mechanisms of language acquisition. Existing research has analyzed regularization in language acquisition only by modeling word inflection directly, which is unnatural in light of human language acquisition. In this paper, we hypothesize that language models that imitate the errors children make during language acquisition have a learning process more similar to humans. To verify this hypothesis, we analyzed the learning curve and error preferences of verb inflections in small-scale LMs using acceptability judgments. We analyze the differences in results by model architecture, data, and tokenization. Our model shows child-like U-shaped learning curves clearly for certain verbs, but the preferences for types of overgeneralization did not fully match the observations in children.
- 2024.findings-acl.865
+ 2024.findings-acl.865
haga-etal-2024-modeling
10.18653/v1/2024.findings-acl.865
@@ -18249,7 +18249,7 @@
Maosong Sun
14551-14558
Large language models have achieved remarkable success in general language understanding tasks. However, as a family of generative methods with the objective of next token prediction, the semantic evolution with the depth of these models is not fully explored, unlike in their predecessors, such as BERT-like architectures. In this paper, we specifically investigate the bottom-up evolution of lexical semantics for a popular LLM, namely Llama2, by probing its hidden states at the end of each layer using a contextualized word identification task. Our experiments show that the representations in lower layers encode lexical semantics, while the higher layers, with weaker semantic induction, are responsible for prediction. This is in contrast to models with discriminative objectives, such as masked language modeling, where the higher layers obtain better lexical semantics. The conclusion is further supported by the monotonic increase in performance via the hidden states for the last meaningless symbols, such as punctuation, in the prompting strategy. Our codes are available at https://github.com/RyanLiut/LLM_LexSem.
- 2024.findings-acl.866
+ 2024.findings-acl.866
liu-etal-2024-fantastic
10.18653/v1/2024.findings-acl.866
@@ -18261,7 +18261,7 @@
Davide Mottin (Aarhus University)
14559-14574
As Machine Learning (ML) models grow in size and demand higher-quality training data, the expenses associated with re-training and fine-tuning these models are escalating rapidly. Inspired by recent impressive achievements of Large Language Models (LLMs) in different fields, this paper delves into the question: can LLMs efficiently improve an ML model’s performance at minimal cost? We show that, through our proposed training-free framework LLMCorr, an LLM can work as a post-hoc corrector to propose corrections for the predictions of an arbitrary ML model. In particular, we form a contextual knowledge database by incorporating the dataset’s label information and the ML model’s predictions on the validation dataset. Leveraging the in-context learning capability of LLMs, we ask the LLM to summarise the instances in which the ML model makes mistakes and the correlation between primary predictions and true labels. Following this, the LLM can transfer its acquired knowledge to suggest corrections for the ML model’s predictions. Our experimental results on text analysis and challenging molecular prediction tasks show that LLMCorr improves the performance of a number of models by up to 39%.
- 2024.findings-acl.867
+ 2024.findings-acl.867
zhong-etal-2024-harnessing
10.18653/v1/2024.findings-acl.867
@@ -18277,7 +18277,7 @@
Zhongyu Wei (Fudan University)
14575-14595
How can we construct an automated debate judge to evaluate an extensive, vibrant, multi-turn debate? This task is challenging, as judging a debate involves grappling with lengthy texts, intricate argument relationships, and multi-dimensional assessments. At the same time, current research mainly focuses on short dialogues, rarely touching upon the evaluation of an entire debate. In this paper, by leveraging Large Language Models (LLMs), we propose Debatrix, which makes the analysis and assessment of multi-turn debates more aligned with majority preferences. Specifically, Debatrix features a vertical, iterative chronological analysis and a horizontal, multi-dimensional evaluation collaboration. To align with real-world debate scenarios, we introduce the PanelBench benchmark, comparing our system’s performance to actual debate outcomes. The findings indicate a notable enhancement over directly using LLMs for debate evaluation. Source code and benchmark data are available at https://github.com/ljcleo/debatrix.
- 2024.findings-acl.868
+ 2024.findings-acl.868
liang-etal-2024-debatrix
10.18653/v1/2024.findings-acl.868
@@ -18292,7 +18292,7 @@
Rui Yan (Renmin University of China)
14596-14609
Language models trained on large-scale corpora often generate responses that are harmful and contrary to human values. A prevalent approach for human alignment is reinforcement learning from human feedback (RLHF), utilizing algorithms such as proximal policy optimization (PPO). However, these methods are often characterized by complexity, instability, and substantial resource consumption. Considering that existing large language models (LLMs) like ChatGPT are already relatively well-aligned and cost-friendly, researchers propose to align the language model with human preferences from AI feedback. Nevertheless, the common practices that unidirectionally distill the responses are constrained by the inherent capability of LLMs. To address it, we introduce CycleAlign, a framework that distills alignment capabilities from the parameter-invisible LLMs (black-box) to the parameter-visible models (white-box) in an iterative manner. CycleAlign iteratively improves both the white-box and black-box models by integrating static and dynamic in-context learning and a belief alignment method. Empirical results illustrate that the model fine-tuned by CycleAlign remarkably exceeds existing methods, and achieves state-of-the-art performance in alignment with human values.
- 2024.findings-acl.869
+ 2024.findings-acl.869
hong-etal-2024-cyclealign
10.18653/v1/2024.findings-acl.869
@@ -18303,7 +18303,7 @@
Carolyn Rose (School of Computer Science, Carnegie Mellon University)
14610-14622
The field of multimodal document understanding has produced a suite of models that have achieved stellar performance across several tasks, even coming close to human performance on certain benchmarks. Nevertheless, the application of these models to real-world enterprise datasets remains constrained by a number of limitations. In this position paper, we discuss these limitations in the context of three key aspects of research: dataset curation, model development, and evaluation on downstream tasks. By analyzing 14 datasets and 7 SotA models, we identify major gaps in their utility in the context of a real-world scenario. We demonstrate how each limitation impedes the widespread use of SotA models in enterprise settings, and present a set of research challenges that are motivated by these limitations. Lastly, we propose a research agenda that is aimed at driving the field towards higher impact in enterprise applications.
- 2024.findings-acl.870
+ 2024.findings-acl.870
nourbakhsh-etal-2024-towards
10.18653/v1/2024.findings-acl.870
@@ -18318,7 +18318,7 @@
Suzan Verberne (Universiteit Leiden)
14623-14635
An important unexplored aspect in previous work on user satisfaction estimation for Task-Oriented Dialogue (TOD) systems is their evaluation in terms of robustness for the identification of user dissatisfaction: current benchmarks for user satisfaction estimation in TOD systems are highly skewed towards dialogues for which the user is satisfied. The effect of having a more balanced set of satisfaction labels on performance is unknown. However, balancing the data with more dissatisfactory dialogue samples requires further data collection and human annotation, which is costly and time-consuming. In this work, we leverage large language models (LLMs) and unlock their ability to generate satisfaction-aware counterfactual dialogues to augment the set of original dialogues of a test collection. We gather human annotations to ensure the reliability of the generated samples. We evaluate two open-source LLMs as user satisfaction estimators on our augmented collection against state-of-the-art fine-tuned models. Our experiments show that when used as few-shot user satisfaction estimators, open-source LLMs show higher robustness to the increase in the number of dissatisfaction labels in the test collection than the fine-tuned state-of-the-art models. Our results shed light on the need for data augmentation approaches for user satisfaction estimation in TOD systems. We release our aligned counterfactual dialogues, which are curated by human annotation, to facilitate further research on this topic.
- 2024.findings-acl.871
+ 2024.findings-acl.871
abolghasemi-etal-2024-cause
10.18653/v1/2024.findings-acl.871
@@ -18327,11 +18327,11 @@
Matteo Gabburo (University of Trento)
Nicolaas Jedema (Amazon)
Siddhant Garg (Meta)
- Leonardo F. R. Ribeiro (Amazon)
+ Leonardo Ribeiro (Amazon)
Alessandro Moschitti (Amazon AGI)
14636-14650
In this paper, we investigate which questions are challenging for retrieval-based Question Answering (QA). We (i) propose retrieval complexity (RC), a novel metric conditioned on the completeness of retrieved documents, which measures the difficulty of answering questions, and (ii) propose an unsupervised pipeline to measure RC given an arbitrary retrieval system. Our proposed pipeline measures RC more accurately than alternative estimators, including LLMs, on six challenging QA benchmarks. Further investigation reveals that RC scores strongly correlate with both QA performance and expert judgment across five of the six studied benchmarks, indicating that RC is an effective measure of question difficulty. Subsequent categorization of high-RC questions shows that they span a broad set of question shapes, including multi-hop, compositional, and temporal QA, indicating that RC scores can categorize a new subset of complex questions. Our system can also have a major impact on retrieval-based systems by helping to identify more challenging questions on existing datasets.
- 2024.findings-acl.872
+ 2024.findings-acl.872
gabburo-etal-2024-measuring
10.18653/v1/2024.findings-acl.872
@@ -18346,7 +18346,7 @@
Maria Liakata (Queen Mary University London)
14651-14672
We introduce a hybrid abstractive summarisation approach combining hierarchical VAEs with LLMs to produce clinically meaningful summaries from social media user timelines, appropriate for mental health monitoring. The summaries combine two different narrative points of view: (a) clinical insights in third person, generated by feeding into an LLM clinical expert-guided prompts, and importantly, (b) a temporally sensitive abstractive summary of the user’s timeline in first person, generated by a novel hierarchical variational autoencoder, TH-VAE. We assess the generated summaries via automatic evaluation against expert summaries and via human evaluation with clinical experts, showing that timeline summarisation by TH-VAE results in more factual and logically coherent summaries rich in clinical utility and superior to LLM-only approaches in capturing changes over time.
- 2024.findings-acl.873
+ 2024.findings-acl.873
song-etal-2024-combining
10.18653/v1/2024.findings-acl.873
@@ -18359,7 +18359,7 @@
Antonio Vergari, University of Edinburgh
14673-14695
Recent work showed the possibility of building open-vocabulary large language models (LLMs) that directly operate on pixel representations. These models are implemented as autoencoders that reconstruct masked patches of rendered text. However, these pixel-based LLMs are limited to discriminative tasks (e.g., classification) and, similar to BERT, cannot be used to generate text. Therefore, they cannot be used for generative tasks such as free-form question answering. In this work, we introduce PIXAR, the first pixel-based autoregressive LLM that performs text generation. Consisting of only a decoder, PIXAR can perform free-form generative tasks while keeping the number of parameters on par with previous encoder-decoder models. Furthermore, we highlight the challenges of generating text as non-noisy images and show this is due to using a maximum likelihood objective. To overcome this problem, we propose an adversarial pretraining stage that improves the readability and accuracy of PIXAR by 8.1 on LAMBADA and 8.5 on bAbI, making it comparable to GPT-2 on text generation tasks. This paves the way to build open-vocabulary LLMs that operate on perceptual input only and calls into question the necessity of the usual symbolic input representation, i.e., text as (sub)tokens.
- 2024.findings-acl.874
+ 2024.findings-acl.874
tai-etal-2024-pixar
10.18653/v1/2024.findings-acl.874
@@ -18377,7 +18377,7 @@
Kai Yu, Shanghai Jiao Tong University
14696-14707
Large language models (LLMs) have demonstrated proficiency across various natural language processing (NLP) tasks but often require additional training, such as continual pre-training and supervised fine-tuning. However, the costs associated with this, primarily due to their large parameter count, remain high. This paper proposes leveraging sparsity in pre-trained LLMs to expedite this training process. By observing sparsity in activated neurons during forward iterations, we identify the potential for computational speed-ups by excluding inactive neurons. We address associated challenges by extending existing neuron importance evaluation metrics and introducing a ladder omission rate scheduler. Our experiments on Llama-2 demonstrate that Sparsity-Accelerated Training (SAT) achieves comparable or superior performance to standard training while significantly accelerating the process. Specifically, SAT achieves a 45% throughput improvement in continual pre-training and saves 38% training time in supervised fine-tuning. It offers a simple, hardware-agnostic, and easily deployable framework for additional LLM training.
- 2024.findings-acl.875
+ 2024.findings-acl.875
ma-etal-2024-sparsity
10.18653/v1/2024.findings-acl.875
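The SAT entry above rests on ranking neurons by an importance metric and omitting the least active ones during forward passes. The Python sketch below illustrates one plausible instantiation of that idea; the mean-absolute-activation importance metric and the `omission_rate` parameter are assumptions for illustration, not the paper's exact formulation.

```python
import torch

def active_neuron_mask(activations: torch.Tensor, omission_rate: float) -> torch.Tensor:
    """Keep the most frequently activated neurons, omitting the rest.

    activations: (num_tokens, hidden) post-activation values collected from
    forward passes on a calibration batch. omission_rate in [0, 1) is the
    fraction of neurons to exclude at this step; the abstract's "ladder
    omission rate scheduler" would grow this value over training.
    """
    importance = activations.abs().mean(dim=0)           # per-neuron importance
    k = int(importance.numel() * (1.0 - omission_rate))  # neurons to keep
    keep = torch.topk(importance, k).indices
    mask = torch.zeros_like(importance, dtype=torch.bool)
    mask[keep] = True
    return mask
```

Applying such a mask to an FFN's intermediate activations skips computation for the omitted neurons, which is where the claimed throughput gains would come from.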
@@ -18389,7 +18389,7 @@
Wei Xu, Tsinghua University
14708-14726
Large language models (LLMs) showcase impressive reasoning capabilities when coupled with Chain-of-Thought (CoT) prompting. However, the robustness of this approach warrants further investigation. In this paper, we introduce a novel scenario termed preemptive answers, where the LLM obtains an answer before engaging in reasoning. This situation can arise inadvertently or be induced by malicious users through prompt injection attacks. Experiments reveal that preemptive answers significantly impair the model’s reasoning capability across various CoT methods and a broad spectrum of datasets. To bolster the robustness of reasoning, we propose two measures aimed at mitigating this issue to some extent.
- 2024.findings-acl.876
+ 2024.findings-acl.876
xu-etal-2024-preemptive
10.18653/v1/2024.findings-acl.876
@@ -18401,7 +18401,7 @@
Arabella Sinclair, University of Aberdeen
14727-14742
We explore which linguistic factors—at the sentence and token level—play an important role in influencing language model predictions, and investigate whether these are reflective of results found in humans and human corpora (Gries and Kootstra, 2017). We make use of the structural priming paradigm—where recent exposure to a structure facilitates processing of the same structure—to investigate where priming effects manifest, and what factors predict them. We find these effects can be explained via the inverse frequency effect found in human priming, where rarer elements within a prime increase priming effects, as well as lexical dependence between prime and target. Our results provide an important piece in the puzzle of understanding how properties within their context affect structural prediction in language models.
- 2024.findings-acl.877
+ 2024.findings-acl.877
jumelet-etal-2024-language
10.18653/v1/2024.findings-acl.877
@@ -18426,7 +18426,7 @@
Junran Peng
14743-14777
The advent of Large Language Models (LLMs) has paved the way for complex tasks such as role-playing, which enhances user interactions by enabling models to imitate various characters. However, the closed-source nature of state-of-the-art LLMs and their general-purpose training limit role-playing optimization. In this paper, we introduce RoleLLM, a framework to benchmark, elicit, and enhance role-playing abilities in LLMs. RoleLLM comprises four stages: (1) Role Profile Construction for 100 roles; (2) Context-Based Instruction Generation (Context-Instruct) for role-specific knowledge extraction; (3) Role Prompting using GPT (RoleGPT) for speaking style imitation; and (4) Role-Conditioned Instruction Tuning (RoCIT) for fine-tuning open-source models along with role customization. Using Context-Instruct and RoleGPT, we create RoleBench, the first systematic and fine-grained character-level benchmark dataset for role-playing with 168,093 samples. Moreover, RoCIT on RoleBench yields RoleLLaMA (English) and RoleGLM (Chinese), significantly enhancing role-playing abilities and even achieving comparable results with RoleGPT (using GPT-4).
- 2024.findings-acl.878
+ 2024.findings-acl.878
wang-etal-2024-rolellm
10.18653/v1/2024.findings-acl.878
@@ -18440,7 +18440,7 @@
Zilong Zheng, Beijing Institute for General Artificial Intelligence
14778-14814
Recent advances in Large Language Models (LLMs) have shown inspiring achievements in constructing autonomous agents that rely on language descriptions as inputs. However, it remains unclear how well LLMs can function as few-shot or zero-shot embodied agents in dynamic interactive environments. To address this gap, we introduce LangSuit·E, a versatile and simulation-free testbed featuring 6 representative embodied tasks in textual embodied worlds. Compared with previous LLM-based testbeds, LangSuit·E (i) offers adaptability to diverse environments without multiple simulation engines, (ii) evaluates agents’ capacity to develop “internalized world knowledge” with embodied observations, and (iii) allows easy customization of communication and action strategies. To address the embodiment challenge, we devise a novel chain-of-thought (CoT) schema, EmMem, which summarizes embodied states w.r.t. history information. Comprehensive benchmark results illustrate challenges and insights of embodied planning. LangSuit·E represents a significant step toward building embodied generalists in the context of language models.
- 2024.findings-acl.879
+ 2024.findings-acl.879
jia-etal-2024-langsuit
10.18653/v1/2024.findings-acl.879
@@ -18456,7 +18456,7 @@
Owen Rambow, Stony Brook University
14815-14823
Evaluating the theory of mind (ToM) capabilities of language models (LMs) has recently received a great deal of attention. However, many existing benchmarks rely on synthetic data, which risks misaligning the resulting experiments with human behavior. We introduce the first ToM dataset based on naturally occurring spoken dialogs, Common-ToM, and show that LMs struggle to demonstrate ToM. We then show that integrating a simple, explicit representation of beliefs improves LM performance on Common-ToM.
- 2024.findings-acl.880
+ 2024.findings-acl.880
soubki-etal-2024-views
10.18653/v1/2024.findings-acl.880
@@ -18469,7 +18469,7 @@
Sunayana Sitaram, Microsoft
14824-14867
Parameter efficient finetuning has emerged as a viable solution for improving the performance of Large Language Models without requiring massive resources and compute. Prior work on multilingual evaluation has shown that there is a large gap between the performance of LLMs on English and other languages. Further, there is also a large gap between the performance of smaller open-source models and larger LLMs. Finetuning can be an effective way to bridge this gap and make language models more equitable. In this work, we finetune the Llama-2 and Mistral models on two synthetic multilingual instruction tuning datasets to determine their effect on model performance on six downstream tasks covering forty-one languages in all. Additionally, we experiment with various parameters, such as the rank for low-rank adaptation and quantisation values, to determine their effects on downstream performance, and find that higher rank and higher quantisation values benefit low-resource languages. We find that parameter efficient finetuning of smaller open-source models sometimes bridges the gap between the performance of these models and the larger ones; however, English performance can take a hit. We also find that finetuning sometimes improves performance on low-resource languages, while degrading performance on high-resource languages.
- 2024.findings-acl.881
+ 2024.findings-acl.881
aggarwal-etal-2024-maple
10.18653/v1/2024.findings-acl.881
@@ -18484,7 +18484,7 @@
Yuexian Zou, Peking University
14868-14879
As a crucial task in the task-oriented dialogue systems, spoken language understanding (SLU) has garnered increasing attention. However, errors from automatic speech recognition (ASR) often hinder the performance of understanding. To tackle this problem, we propose MoE-SLU, an ASR-Robust SLU framework based on the mixture-of-experts technique. Specifically, we first introduce three strategies to generate additional transcripts from clean transcripts. Then, we employ the mixture-of-experts technique to weigh the representations of the generated transcripts, ASR transcripts, and the corresponding clean manual transcripts. Additionally, we also regularize the weighted average of predictions and the predictions of ASR transcripts by minimizing the Jensen-Shannon Divergence (JSD) between these two output distributions. Experiment results on three benchmark SLU datasets demonstrate that our MoE-SLU achieves state-of-the-art performance. Further model analysis also verifies the superiority of our method.
- 2024.findings-acl.882
+ 2024.findings-acl.882
cheng-etal-2024-moe
10.18653/v1/2024.findings-acl.882
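The MoE-SLU entry above regularizes by minimizing the Jensen-Shannon Divergence (JSD) between the weighted-average predictions and the predictions on ASR transcripts. A minimal PyTorch sketch of such a JSD term follows; treating both inputs as per-example class logits is an assumption about the interface, not the authors' code.

```python
import torch
import torch.nn.functional as F

def jsd_loss(p_logits: torch.Tensor, q_logits: torch.Tensor) -> torch.Tensor:
    """Jensen-Shannon divergence between two categorical output distributions.

    p_logits, q_logits: (batch, num_classes) unnormalized scores, e.g. the
    weighted-average predictions and the ASR-transcript predictions.
    """
    p = F.softmax(p_logits, dim=-1)
    q = F.softmax(q_logits, dim=-1)
    m = 0.5 * (p + q)
    # JSD(P, Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M);
    # F.kl_div expects log-probabilities as its first argument.
    kl_pm = F.kl_div(m.log(), p, reduction="batchmean")
    kl_qm = F.kl_div(m.log(), q, reduction="batchmean")
    return 0.5 * (kl_pm + kl_qm)
```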
@@ -18495,7 +18495,7 @@
Nicholas Andrews, Johns Hopkins University
14880-14891
Instruction-tuning trains a language model on hundreds of tasks jointly to improve a model’s ability to learn in-context; however, the mechanisms that drive in-context learning are poorly understood and, as a result, the role of instruction-tuning on in-context generalization is poorly understood as well. In this work, we study the impact of instruction-tuning on multi-task transfer: how well a model’s parameters adapt to an unseen task via fine-tuning. We find that instruction-tuning negatively impacts a model’s transfer to unseen tasks, and that model transfer and in-context generalization are highly correlated, suggesting that this catastrophic forgetting may impact in-context learning. We study methods to improve model transfer, finding that multi-task training—how well the training tasks are optimized—can significantly impact ICL generalization; additionally, we find that continual training on unsupervised pre-training data can mitigate forgetting and improve ICL generalization as well. Finally, we demonstrate that, early into training, the impact of instruction-tuning on model transfer to tasks impacts in-context generalization on that task. Overall, we provide significant evidence that multi-task transfer is deeply connected to a model’s ability to learn a task in-context.
- 2024.findings-acl.883
+ 2024.findings-acl.883
mueller-etal-2024-multi
10.18653/v1/2024.findings-acl.883
@@ -18508,7 +18508,7 @@
Shikun Zhang, Peking University
14892-14904
Although large language models (LLMs) have demonstrated impressive few-shot learning capabilities via in-context learning (ICL), ICL performance is known to be highly sensitive to the order of examples provided. To identify appropriate orders, recent studies propose heuristic methods to evaluate order performance using a set of unlabeled data. However, the requirement of in-domain data limits their utility in real-world scenarios where additional annotated data is challenging to acquire. Additionally, these dataset-based approaches are prone to being sub-optimal due to a lack of consideration of individual differences. To address these problems, we first analyze the properties of performant example orders at both the corpus level and the instance level. Based on this analysis, we propose **DEmO** to adaptively identify a performant example order for each instance without extra data. DEmO works by filtering out a subset of orders featuring label fairness, then selecting the most influential order for each test instance. The employment of a content-free metric makes DEmO independent of in-domain data. Extensive experiments indicate the superiority of DEmO over a wide range of strong baselines. Further analysis validates the generalizability across various settings.
- 2024.findings-acl.884
+ 2024.findings-acl.884
guo-etal-2024-makes
10.18653/v1/2024.findings-acl.884
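The DEmO entry above filters demonstration orders by label fairness using a content-free metric. The sketch below shows one way such a filter could look: `predict_label_probs` is a hypothetical callback wrapping the LLM, and the entropy-of-labels criterion and the `"N/A"` content-free probe are assumptions inspired by the description (the paper's second, influence-based selection stage is omitted here).

```python
import itertools
import math

def label_entropy(probs):
    """Entropy of the label distribution predicted for a content-free input."""
    return -sum(p * math.log(p) for p in probs if p > 0)

def fair_orders(demos, predict_label_probs, content_free_input="N/A", top_k=5):
    """Rank candidate demonstration orders by label fairness.

    demos is a small list of (input, label) pairs; enumerating all
    permutations is only feasible for a handful of demonstrations.
    predict_label_probs(prompt) -> list of label probabilities is assumed
    to wrap the LLM.
    """
    scored = []
    for order in itertools.permutations(demos):
        prompt = "\n".join(f"Input: {x}\nLabel: {y}" for x, y in order)
        prompt += f"\nInput: {content_free_input}\nLabel:"
        scored.append((label_entropy(predict_label_probs(prompt)), order))
    # Orders whose content-free predictions are closest to uniform are fairest.
    scored.sort(key=lambda t: t[0], reverse=True)
    return [order for _, order in scored[:top_k]]
```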
@@ -18523,7 +18523,7 @@
Ajay Divakaran, SRI International
14905-14918
We propose a novel VQA dataset, BloomVQA, to facilitate comprehensive evaluation of large vision-language models on comprehension tasks. Unlike current benchmarks that often focus on fact-based memorization and simple reasoning tasks without theoretical grounding, we collect multiple-choice samples based on picture stories that reflect different levels of comprehension, as laid out in Bloom’s Taxonomy, a classic framework for learning assessment widely adopted in education research. Our data maps to a novel hierarchical graph representation which enables automatic data augmentation and novel measures characterizing model consistency. We perform graded evaluation and reliability analysis on recent multi-modal models. In comparison to low-level tasks, we observe decreased performance on tasks requiring advanced comprehension and cognitive skills with up to 38.0% drop in VQA accuracy. In comparison to earlier models, GPT-4V demonstrates improved accuracy over all comprehension levels and also shows a tendency of bypassing visual inputs especially for higher-level tasks. Current models also show consistency patterns misaligned with human comprehension in various scenarios, demonstrating the need for improvement based on theoretically-grounded criteria. The dataset can be accessed at https://huggingface.co/datasets/ygong/BloomVQA.
- 2024.findings-acl.885
+ 2024.findings-acl.885
gong-etal-2024-bloomvqa
10.18653/v1/2024.findings-acl.885
@@ -18536,7 +18536,7 @@
Huan Sun, The Ohio State University, Columbus
14919-14935
Modern generative search engines enhance the reliability of large language model (LLM) responses by providing cited evidence. However, evaluating the answer’s attribution, i.e., whether every claim within the generated responses is fully supported by its cited evidence, remains an open problem. This verification, traditionally dependent on costly human evaluation, underscores the urgent need for automatic attribution evaluation methods. To bridge the gap in the absence of standardized benchmarks for these methods, we present AttributionBench, a comprehensive benchmark compiled from various existing attribution datasets. Our extensive experiments on AttributionBench reveal the challenges of automatic attribution evaluation, even for state-of-the-art LLMs. Specifically, our findings show that even a fine-tuned GPT-3.5 only achieves around 80% macro-F1 under a binary classification formulation. A detailed analysis of more than 300 error cases indicates that a majority of failures stem from the model’s inability to process nuanced information, and from the discrepancy between the information available to the model and that available to human annotators.
- 2024.findings-acl.886
+ 2024.findings-acl.886
li-etal-2024-attributionbench
10.18653/v1/2024.findings-acl.886
@@ -18549,7 +18549,7 @@
Kilian Weinberger, Cornell University
14936-14952
Current language models demonstrate remarkable proficiency in text generation. However, for many applications it is desirable to control attributes, such as sentiment or toxicity, of the generated language—ideally tailored towards each specific use case and target audience. For auto-regressive language models, existing guidance methods are prone to decoding errors that cascade during generation and degrade performance. In contrast, text diffusion models can easily be guided with, for example, a simple linear sentiment classifier—however they do suffer from significantly higher perplexity than auto-regressive alternatives. In this paper we use a guided diffusion model to produce a latent proposal that steers an auto-regressive language model to generate text with desired properties. Our model inherits the unmatched fluency of the auto-regressive approach and the plug-and-play flexibility of diffusion. We show that it outperforms previous plug-and-play guidance methods across a wide range of benchmark data sets. Further, controlling a new attribute in our framework is reduced to training a single logistic regression classifier.
- 2024.findings-acl.887
+ 2024.findings-acl.887
lovelace-etal-2024-diffusion
10.18653/v1/2024.findings-acl.887
@@ -18564,7 +18564,7 @@
Jeff Pan, University of Edinburgh
14953-14968
The task of model editing has become popular for correcting inaccurate or outdated parametric knowledge in Large Language Models (LLMs). However, state-of-the-art (SOTA) model editing methods have major limitations, including the excessive memorization issue caused by direct editing methods, as well as the error propagation and knowledge conflict issues of memory-enhancement methods, all of which hinder models’ *portability*, e.g., the ability to transfer new knowledge to related one-hop or multi-hop content. To address these issues, we propose the InstructEd method, the idea of which is to insert soft instructions into the attention module so as to facilitate interactions between instructions and questions and to understand and utilize new facts. Our main findings are: (i) InstructEd achieves SOTA performance on three datasets for one-hop/multi-hop evaluation with LLaMAs and GPT2, with 10% (5%) improvement in one-hop (multi-hop) model editing. (ii) Different from earlier methods that edit parameters in the FFN, we show that editing attention can also help. (iii) Model editing is highly related to retrieval-augmented methods, which can help improve the locality of model editing while slightly decreasing the editing performance with hops.
- 2024.findings-acl.888
+ 2024.findings-acl.888
han-etal-2024-instructed
10.18653/v1/2024.findings-acl.888
@@ -18583,7 +18583,7 @@
Chang Yoo, Korea Advanced Institute of Science and Technology
14969-14981
Reinforcement Learning from Human Feedback (RLHF) leverages human preference data to train language models to align more closely with human preferences. These human preference data, however, are labeled at the sequence level, creating a mismatch between sequence-level preference labels and tokens, which are autoregressively generated from the language model. Although several recent approaches have tried to provide token-level (i.e., dense) rewards for each individual token, these typically rely on predefined discrete reward values (e.g., positive: +1, negative: -1, neutral: 0), failing to account for the varying degrees of preference inherent to each token. To address this limitation, we introduce TLCR (Token-Level Continuous Reward) for RLHF, which incorporates a discriminator trained to distinguish positive and negative tokens, and the confidence of the discriminator is used to assign continuous rewards to each token considering the context. Extensive experiments show that our proposed TLCR leads to consistent performance improvements over previous sequence-level or token-level discrete rewards on open-ended generation benchmarks.
- 2024.findings-acl.889
+ 2024.findings-acl.889
yoon-etal-2024-tlcr
10.18653/v1/2024.findings-acl.889
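The TLCR entry above turns a discriminator's confidence into continuous per-token rewards. A minimal sketch of the confidence-to-reward mapping follows, assuming the discriminator exposes a per-token P(positive); the linear 2p - 1 mapping is an assumption for illustration, not necessarily the paper's exact transform.

```python
import torch

def token_level_continuous_rewards(pos_probs: torch.Tensor) -> torch.Tensor:
    """Map a discriminator's per-token P(positive) to rewards in [-1, 1].

    pos_probs: (seq_len,) probabilities from a token-level discriminator
    trained to distinguish preferred from dispreferred tokens in context.
    A confident positive token gets a reward near +1, a confident negative
    token near -1, and an uncertain token near 0.
    """
    return 2.0 * pos_probs - 1.0

# usage sketch:
# rewards = token_level_continuous_rewards(torch.tensor([0.95, 0.50, 0.10]))
# -> tensor([ 0.9000,  0.0000, -0.8000])
```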
@@ -18603,7 +18603,7 @@
Tomas Pfister, Google
14982-14995
Large language models (LLMs), even when specifically trained to process long input contexts, struggle to capture relevant information located in the middle of their input. This phenomenon is known as the lost-in-the-middle problem. In this work, we make three contributions. First, we set out to understand the factors that cause this phenomenon. In doing so, we establish a connection between lost-in-the-middle and LLMs’ intrinsic attention bias: LLMs exhibit a U-shaped attention bias where the tokens at the beginning and at the end of the input receive higher attention, regardless of their relevance. Second, we mitigate this positional bias through a calibration mechanism, found-in-the-middle, that allows the model to attend to contexts faithfully according to their relevance, even when they are in the middle. Third, we show found-in-the-middle not only achieves better performance in locating relevant information within a long context, but also eventually leads to improved retrieval-augmented generation (RAG) performance across various tasks, outperforming existing methods by up to 10 percentage points. These findings open up future directions in understanding LLM attention bias and its potential consequences.
- 2024.findings-acl.890
+ 2024.findings-acl.890
hsieh-etal-2024-found
10.18653/v1/2024.findings-acl.890
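The found-in-the-middle entry above calibrates attention to undo the U-shaped positional bias. The sketch below shows the general shape of such a calibration under the assumption that the bias can be probed by placing the same dummy document at each context position; the divide-and-renormalize step is illustrative, and the paper's exact estimator may differ.

```python
import numpy as np

def calibrate_attention(attn: np.ndarray, positional_bias: np.ndarray) -> np.ndarray:
    """Remove an estimated U-shaped positional bias from document attention.

    attn: raw attention mass the model assigns to each of N context documents.
    positional_bias: attention measured for the *same* dummy document placed
    at each of the N positions (one probe run per position), so it reflects
    position alone rather than content.
    """
    calibrated = attn / np.clip(positional_bias, 1e-8, None)
    return calibrated / calibrated.sum()  # renormalize to a distribution
```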
@@ -18620,7 +18620,7 @@
Tara Safavi, Microsoft Research
14996-15014
Traditional Dialogue State Tracking (DST) has focused on tracking preferences and intents in conversations centered around specific tasks (e.g. booking services). These conventional systems assume a relatively restricted conversation flow in which each turn gradually offers new information. However, advancements in Large Language Models (LLMs) have ushered in more versatile open-domain chat systems in which extended dialogue sessions encompassing numerous tasks and topics are common—in turn requiring new conversational tracking tools in order to successfully orchestrate such systems. Addressing these challenges, we introduce a novel approach combining dialogue segmentation and state tracking within open-domain dialogues, tailored for zero-shot applications appropriate to a true open-domain dialogue system. Our proposed method S3-DST employs a unique structured prompting technique and *Pre-Analytical Recollection*, a novel grounding mechanism we designed for improving long context tracking. Tested on proprietary anonymized open-domain dialogue datasets as well as publicly available DST and segmentation datasets, S3-DST consistently outperforms the state-of-the-art, showcasing its effectiveness and adaptability for state tracking in the next wave of LLM-based chat systems. We also release S3-DST annotations with GPT-4 on a curated subset of LMSYS-Chat-1M to be used as a testbed to fuel research in this direction.
- 2024.findings-acl.891
+ 2024.findings-acl.891
das-etal-2024-s3
10.18653/v1/2024.findings-acl.891
@@ -18634,7 +18634,7 @@
Noah Smith, University of Washington and Allen Institute for Artificial Intelligence
15015-15040
Language models (LMs) are trained on web text originating from many points in time and, in general, without any explicit temporal grounding. This work investigates the temporal chaos of pretrained LMs and explores various methods to align their internal knowledge to a target time, which we call “temporal alignment.” To do this, we first automatically construct a dataset containing 20K time-sensitive questions and their answers for each year from 2000 to 2023. Based on this dataset, we empirically show that pretrained LMs (e.g., LLaMa2), despite having a recent pretraining cutoff (e.g., 2022), mostly answer questions using earlier knowledge (e.g., in 2019). We then develop several methods, from prompting to finetuning, to align LMs to use their most recent knowledge when answering questions, and investigate various factors in this alignment. Our experiments demonstrate that aligning LLaMa2 to the year 2022 can enhance its performance by up to 62% according to that year’s answers. This improvement occurs even without explicitly mentioning time information, indicating the possibility of aligning models’ internal sense of time after pretraining. Finally, we find that alignment to a historical time is also possible, with up to 2.8× the performance of the unaligned LM in 2010 when finetuning models to that year. These findings hint at the sophistication of LMs’ internal knowledge organization and the necessity of tuning them properly.
- 2024.findings-acl.892
+ 2024.findings-acl.892
zhao-etal-2024-set
10.18653/v1/2024.findings-acl.892
@@ -18647,7 +18647,7 @@
Patrick Lewis
15041-15058
To date, toxicity mitigation in language models has almost entirely been focused on single-language settings. As language models embrace multilingual capabilities, it is crucial that our safety measures keep pace. Recognizing this research gap, our approach expands the scope of conventional toxicity mitigation to address the complexities presented by multiple languages. In the absence of sufficient annotated datasets across languages, we employ translated data to evaluate and enhance our mitigation techniques. We also compare finetuning mitigation approaches against retrieval-augmented techniques under both static and continual toxicity mitigation scenarios. This allows us to examine the effects of translation quality and cross-lingual transfer on toxicity mitigation. We also explore how model size and data quantity affect the success of these mitigation efforts. Covering nine languages, our study represents a broad array of linguistic families and levels of resource availability, ranging from high- to mid-resource languages. Through comprehensive experiments, we provide insights into the complexities of multilingual toxicity mitigation, paving the way for future research in this increasingly important field.
- 2024.findings-acl.893
+ 2024.findings-acl.893
ermis-etal-2024-one
10.18653/v1/2024.findings-acl.893
@@ -18662,7 +18662,7 @@
Qiongkai Xu, Macquarie University
15059-15075
The democratization of pre-trained language models through open-source initiatives has rapidly advanced innovation and expanded access to cutting-edge technologies. However, this openness also brings significant security risks, including backdoor attacks, where hidden malicious behaviors are triggered by specific inputs, compromising natural language processing (NLP) system integrity and reliability. This paper suggests that merging a backdoored model with other homogeneous models can significantly remediate backdoor vulnerabilities even if such models are not entirely secure. In our experiments, we verify our hypothesis on various models (BERT-Base, RoBERTa-Large, Llama2-7B, and Mistral-7B) and datasets (SST-2, OLID, AG News, and QNLI). Compared to multiple advanced defensive approaches, our method offers an effective and efficient inference-stage defense against backdoor attacks on classification and instruction-tuned tasks without additional resources or specific knowledge. Our approach consistently outperforms recent advanced baselines, leading to an average of about 75% reduction in the attack success rate. Since model merging has been an established approach for improving model performance, the extra advantage it provides regarding defense can be seen as a cost-free bonus.
- 2024.findings-acl.894
+ 2024.findings-acl.894
arora-etal-2024-heres
10.18653/v1/2024.findings-acl.894
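The model-merging defense described above relies on nothing more exotic than averaging parameters across architecturally identical models. A minimal PyTorch sketch of uniform weight averaging follows; uniform weights and full state-dict compatibility are simplifying assumptions.

```python
import torch

def merge_state_dicts(state_dicts):
    """Uniformly average parameters of architecturally identical models.

    The paper's observation is that averaging a possibly backdoored model
    with homogeneous models dilutes the backdoor behaviour, even when the
    other models are not guaranteed to be clean.
    """
    merged = {}
    for name, ref in state_dicts[0].items():
        if ref.is_floating_point():
            merged[name] = torch.stack([sd[name] for sd in state_dicts]).mean(dim=0)
        else:
            # integer buffers (e.g. step counters) are copied from the first model
            merged[name] = ref.clone()
    return merged

# usage sketch:
# merged = merge_state_dicts([m.state_dict() for m in (suspect, other_a, other_b)])
# model.load_state_dict(merged)
```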
@@ -18674,7 +18674,7 @@
Aline Paes, Universidade Federal Fluminense
15076-15091
Automatic text simplification focuses on transforming texts into a more comprehensible version without sacrificing their precision. However, automatic methods usually require (paired) datasets that can be rather scarce in languages other than English. This paper presents a new approach to automatic sentence simplification that leverages paraphrases, context, and linguistic attributes to overcome the absence of paired texts in Portuguese. We frame the simplification problem as a textual style transfer task and learn a style representation using the sentences around the target sentence in the document and its linguistic attributes. Moreover, unlike most unsupervised approaches that require style-labeled training data, we fine-tune strong pre-trained models using sentence-level paraphrases instead of annotated data. Our experiments show that our model achieves remarkable results, surpassing the current state-of-the-art (BART+ACCESS) while competitively matching a Large Language Model.
- 2024.findings-acl.895
+ 2024.findings-acl.895
scalercio-etal-2024-enhancing
10.18653/v1/2024.findings-acl.895
@@ -18688,7 +18688,7 @@
Christof Monz, University of Amsterdam
15092-15108
Zero-shot translation aims to translate between language pairs not seen during training in Multilingual Machine Translation (MMT) and is widely considered an open problem. A common, albeit resource-consuming, solution is to add as many related translation directions as possible to the training corpus. In this paper, we show that for an English-centric model, surprisingly large zero-shot improvements can be achieved by simply fine-tuning with a very small amount of multi-parallel data. For example, on the EC30 dataset, we obtain up to +21.7 ChrF++ non-English overall improvements (870 directions) by using only 100 multi-parallel samples while preserving English-centric translation quality. This performance exceeds M2M100 by an average of 5.9 ChrF++ in the involved non-English directions. When investigating the size effect of fine-tuning data on translation quality, we find that even a small, randomly sampled set of fine-tuning directions is sufficient to achieve comparable improvements. The resulting non-English performance is close to the complete translation upper bound. Even in a minimal setting—fine-tuning with only one single sample—the well-known off-target issue is almost completely resolved, explaining parts—but not all—of the observed improvements in translation quality.
- 2024.findings-acl.896
+ 2024.findings-acl.896
wu-etal-2024-far
10.18653/v1/2024.findings-acl.896
@@ -18704,7 +18704,7 @@
Samuel Carton, University of New Hampshire, Durham
15109-15123
We explore the ability of GPT-4 to perform ad-hoc schema-based information extraction from scientific literature. We specifically assess whether it can, with a basic one-shot prompting approach over the full text of the included manuscripts, replicate two existing materials science datasets, one pertaining to multi-principal element alloys (MPEAs) and one to silicate diffusion. We collaborate with materials scientists to perform a detailed manual error analysis to assess where and why the model struggles to faithfully extract the desired information, and draw on their insights to suggest research directions to address this broadly important task.
- 2024.findings-acl.897
+ 2024.findings-acl.897
ghosh-etal-2024-toward
10.18653/v1/2024.findings-acl.897
@@ -18715,7 +18715,7 @@
Kangil Kim, Gwangju Institute of Science and Technology
15124-15139
Neural parameterization has significantly advanced unsupervised grammar induction. However, training these models with a traditional likelihood loss for all possible parses exacerbates two issues: 1) *structural optimization ambiguity* that arbitrarily selects one among structurally ambiguous optimal grammars despite the specific preference of gold parses, and 2) *structural simplicity bias* that leads a model to underutilize rules to compose parse trees. These challenges subject unsupervised neural grammar induction (UNGI) to inevitable prediction errors, high variance, and the necessity for extensive grammars to achieve accurate predictions. This paper tackles these issues, offering a comprehensive analysis of their origins. As a solution, we introduce *sentence-wise parse-focusing* to reduce the parse pool per sentence for loss evaluation, using the structural bias from pre-trained parsers on the same dataset. In unsupervised parsing benchmark tests, our method significantly improves performance while effectively reducing variance and bias toward overly simplistic parses. Our research promotes learning more compact, accurate, and consistent explicit grammars, facilitating better interpretability.
- 2024.findings-acl.898
+ 2024.findings-acl.898
park-kim-2024-structural
10.18653/v1/2024.findings-acl.898
@@ -18736,7 +18736,7 @@
Nan Hua, Google
15140-15168
Large Language Models (LLMs) have revolutionized Natural Language Processing (NLP), improving the state of the art and exhibiting emergent capabilities across various tasks. However, their application to extracting information from visually rich documents, which is at the core of many document processing workflows and involves the extraction of key entities from semi-structured documents, has not yet been successful. The main obstacles to adopting LLMs for this task include the absence of layout encoding within LLMs, which is critical for high-quality extraction, and the lack of a grounding mechanism to localize predicted entities within the document. In this paper, we introduce Language Model-based Document Information EXtraction and Localization (LMDX), a methodology to reframe the document information extraction task for an LLM. LMDX enables extraction of singular, repeated, and hierarchical entities, both with and without training data, while providing grounding guarantees and localizing the entities within the document. Finally, we apply LMDX to the PaLM 2-S and Gemini Pro LLMs and evaluate it on the VRDU and CORD benchmarks, setting a new state of the art and showing how LMDX enables the creation of high-quality, data-efficient parsers.
- 2024.findings-acl.899
+ 2024.findings-acl.899
perot-etal-2024-lmdx
10.18653/v1/2024.findings-acl.899
@@ -18750,7 +18750,7 @@
Ryutaro Ichise, National Institute of Informatics and Tokyo Institute of Technology
15169-15182
This paper introduces the Database Querying and Reasoning Dataset for Question Answering (DBQR-QA), aimed at addressing the gap in current question-answering (QA) research by emphasizing the essential processes of database querying and reasoning to answer questions. Specifically designed to accommodate sequential questions and multi-hop queries, DBQR-QA more accurately mirrors the dynamics of real-world information retrieval and analysis, with a particular focus on the financial reports of US companies. The dataset’s construction, the challenges encountered during its development, the performance of large language models on this dataset, and a human evaluation are thoroughly discussed to illustrate the dataset’s complexity and highlight future research directions in querying and reasoning tasks.
- 2024.findings-acl.900
+ 2024.findings-acl.900
nararatwong-etal-2024-dbqr
10.18653/v1/2024.findings-acl.900
@@ -18767,7 +18767,7 @@
Hong Yu, Columbia University
15183-15201
We introduce NoteChat, a novel cooperative multi-agent framework leveraging Large Language Models (LLMs) to generate patient-physician dialogues. NoteChat embodies the principle that an ensemble of role-specific LLMs, through structured role-play and strategic prompting, can perform their assigned roles more effectively. The synergy among these role-playing LLMs results in cohesive and efficient dialogue generation. Evaluation on MTS-dialogue, a benchmark dataset of patient-physician dialogue-note pairs, shows that models trained with the synthetic patient-physician dialogues generated by NoteChat outperform other state-of-the-art models for generating clinical notes. Our comprehensive automatic and human evaluation demonstrates that NoteChat substantially surpasses state-of-the-art models like ChatGPT and GPT-4 by up to 22.78%, as judged by domain experts, in generating superior synthetic patient-physician dialogues based on clinical notes. NoteChat has the potential to engage patients directly and help with clinical documentation, a leading cause of physician burnout.
- 2024.findings-acl.901
+ 2024.findings-acl.901
wang-etal-2024-notechat
10.18653/v1/2024.findings-acl.901
@@ -18778,7 +18778,7 @@
Gopala Anumanchipalli, University of California, Berkeley
15202-15232
Editing knowledge in large language models is an attractive capability that allows us to correct incorrectly learned facts during pre-training, as well as update the model with an ever-growing list of new facts. While existing model editing techniques have shown promise, they are usually evaluated using metrics for reliability, specificity and generalization over one or few edits. We argue that for model editing to have practical utility, we must be able to make multiple edits to the same model. With this in mind, we evaluate current model editing methods at scale, focusing on two state of the art methods - ROME and MEMIT. With the lens of scalability, we evaluate model editing methods for three crucial properties - editing proficiency, fact forgetting and downstream performance. We find that as a model is edited sequentially with multiple facts, it continually becomes less editable, forgets previously edited facts and loses the ability to perform downstream tasks. For ROME and MEMIT, this “forgetting” happens in two phases - an initial gradual but progressive forgetting phase followed by an abrupt or catastrophic forgetting. Both gradual and catastrophic forgetting limit the usefulness of model editing methods at scale - the former makes model editing less effective as multiple edits are made to the model while the latter caps the scalability of such model editing methods. Our analysis also highlights other key limitations of ROME and MEMIT at scale. With our work, we push for better evaluation of model editing and development of model editing methods keeping scalability in mind.
- 2024.findings-acl.902
+ 2024.findings-acl.902
gupta-etal-2024-model
10.18653/v1/2024.findings-acl.902
@@ -18794,7 +18794,7 @@
Luca Cagliero, Polytechnic Institute of Turin
15233-15244
This paper presents a groundbreaking multimodal, multi-task, multi-teacher joint-grained knowledge distillation model for visually-rich form document understanding. The model is designed to leverage insights from both fine-grained and coarse-grained levels by facilitating a nuanced correlation between token and entity representations, addressing the complexities inherent in form documents. Additionally, we introduce new inter-grained and cross-grained loss functions to further refine the diverse multi-teacher knowledge distillation transfer process, addressing distribution gaps and yielding a harmonised understanding of form documents. Through a comprehensive evaluation across publicly available form document understanding datasets, our proposed model consistently outperforms existing baselines, showcasing its efficacy in handling the intricate structures and content of visually complex form documents.
- 2024.findings-acl.903
+ 2024.findings-acl.903
ding-etal-2024-3mvrd
10.18653/v1/2024.findings-acl.903
@@ -18808,7 +18808,7 @@
Hakim Sidahmed
15245-15270
High-quality conversational datasets are essential for developing AI models that can communicate with users. One way to foster deeper interactions between a chatbot and its user is through *personas*, aspects of the user’s character that provide insights into their personality, motivations, and behaviors. Training Natural Language Processing (NLP) models on a diverse and comprehensive persona-based dataset can lead to conversational models that create a deeper connection with the user, and maintain their engagement. In this paper, we leverage the power of Large Language Models (LLMs) to create a large, high-quality conversational dataset from a seed dataset. We propose a Generator-Critic architecture framework to expand the initial dataset, while improving the quality of its conversations. The Generator is an LLM prompted to output conversations. The Critic consists of a mixture of expert LLMs that control the quality of the generated conversations. These experts select the best generated conversations, which we then use to improve the Generator. We release Synthetic-Persona-Chat, consisting of 20k conversations seeded from Persona-Chat. We evaluate the quality of Synthetic-Persona-Chat and our generation framework on different dimensions through extensive experiments, and observe that the losing rate of Synthetic-Persona-Chat against Persona-Chat during an AI detection test decreases from 17.2% to 8.8% over three iterations.
- 2024.findings-acl.904
+ 2024.findings-acl.904
jandaghi-etal-2024-faithful-persona
10.18653/v1/2024.findings-acl.904
@@ -18825,7 +18825,7 @@
Lifu Huang, Virginia Tech
15271-15342
Despite vision-language models’ (VLMs) remarkable capabilities as versatile visual assistants, two substantial challenges persist within the existing VLM frameworks: (1) lacking task diversity in pretraining and visual instruction tuning, and (2) annotation error and bias in GPT-4 synthesized instruction tuning data. Both challenges lead to issues such as poor generalizability, hallucination, and catastrophic forgetting. To address these challenges, we construct Vision-Flan, the most diverse publicly available visual instruction tuning dataset to date, comprising 187 diverse tasks and 1,664,261 instances sourced from academic datasets, and each task is accompanied by an expert-written instruction. In addition, we propose a two-stage instruction tuning framework, in which VLMs are firstly finetuned on Vision-Flan and further tuned on GPT-4 synthesized data. We find this two-stage tuning framework significantly outperforms the traditional single-stage visual instruction tuning framework and achieves the state-of-the-art performance across a wide range of multi-modal evaluation benchmarks. Finally, we conduct in-depth analyses to understand visual instruction tuning and our findings reveal that: (1) GPT-4 synthesized data does not substantially enhance VLMs’ capabilities but rather modulates the model’s responses to human-preferred formats; (2) A minimal quantity (e.g., 1,000) of GPT-4 synthesized data can effectively align VLM responses with human-preference; (3) Visual instruction tuning mainly helps large-language models (LLMs) to understand visual features.
- 2024.findings-acl.905
+ 2024.findings-acl.905
xu-etal-2024-vision
10.18653/v1/2024.findings-acl.905
@@ -18837,7 +18837,7 @@
Thomas Hartvigsen, University of Virginia, Charlottesville
15343-15352
Humans rarely learn one fact in isolation. Instead, learning a new fact induces knowledge of other facts about the world. For example, in learning that a korat is a type of cat, you also infer that it is a mammal and has claws, ensuring your model of the world is consistent. Knowledge editing aims to inject new facts into language models to improve their factuality, but current benchmarks fail to evaluate consistency, which is critical to ensure efficient, accurate, and generalizable edits. We manually create TAXI, a new benchmark dataset designed to evaluate consistency in categorical knowledge edits. TAXI contains 11,120 multiple-choice queries for 976 edits spanning 41 categories (e.g., Dogs), 164 subjects (e.g., Labrador), and 183 properties (e.g., is a mammal). We then use TAXI to evaluate popular editors’ categorical consistency, measuring how often editing a subject’s category appropriately edits its properties. We find that 1) the editors achieve marginal, yet non-random, consistency, 2) their consistency far underperforms human baselines, and 3) consistency is more achievable when editing atypical subjects.
- 2024.findings-acl.906
+ 2024.findings-acl.906
powell-etal-2024-taxi
10.18653/v1/2024.findings-acl.906
@@ -18852,7 +18852,7 @@
Bill Dolan
15353-15368
Advancements in large language models (LLMs) are revolutionizing interactive game design, enabling dynamic plotlines and interactions between players and non-player characters (NPCs). However, LLMs may exhibit flaws such as hallucinations, forgetfulness, or misinterpretations of prompts, causing logical inconsistencies and unexpected deviations from intended designs. Automated techniques for detecting such game bugs are still lacking. To address this, we propose a systematic LLM-based method for automatically identifying such bugs from player game logs, eliminating the need for collecting additional data such as post-play surveys. Applied to a text-based game DejaBoom!, our approach effectively identifies bugs inherent in LLM-powered interactive games, surpassing unstructured LLM-powered bug-catching methods and filling the gap in automated detection of logical and design flaws.
- 2024.findings-acl.907
+ 2024.findings-acl.907
jin-etal-2024-automatic
10.18653/v1/2024.findings-acl.907
@@ -18863,7 +18863,7 @@
Julia Rayz, Purdue University
15369-15379
While large language and vision-language models showcase impressive capabilities, they face a notable limitation: the inability to connect language with the physical world. To bridge this gap, research has focused on embodied language learning, where the language learner is situated in the world, perceives it, and interacts with it. This article explores the current standing of research in embodied language learning, highlighting opportunities and discussing common challenges. Lastly, it identifies existing gaps from the perspective of language understanding research within the embodied world and suggests potential future directions.
- 2024.findings-acl.908
+ 2024.findings-acl.908
amin-rayz-2024-embodied
10.18653/v1/2024.findings-acl.908
@@ -18877,7 +18877,7 @@
Jackie Cheung, McGill University, Mila Research Institute and Microsoft
15380-15395
It is increasingly common to evaluate the same coreference resolution (CR) model on multiple datasets. Do these multi-dataset evaluations allow us to draw meaningful conclusions about model generalization? Or, do they rather reflect the idiosyncrasies of a particular experimental setup (e.g., the specific datasets used)? To study this, we view evaluation through the lens of measurement modeling, a framework commonly used in the social sciences for analyzing the validity of measurements. By taking this perspective, we show how multi-dataset evaluations risk conflating different factors concerning what, precisely, is being measured. This in turn makes it difficult to draw more generalizable conclusions from these evaluations. For instance, we show that across seven datasets, measurements intended to reflect CR model generalization are often correlated with differences in both how coreference is defined and how it is operationalized; this limits our ability to draw conclusions regarding the ability of CR models to generalize across any singular dimension. We believe the measurement modeling framework provides the needed vocabulary for discussing challenges surrounding what is actually being measured by CR evaluations.
- 2024.findings-acl.909
+ 2024.findings-acl.909
porada-etal-2024-challenges
10.18653/v1/2024.findings-acl.909
@@ -18889,7 +18889,7 @@
Francis Ferraro, University of Maryland, Baltimore County
15396-15420
Interpreting and assessing goal driven actions is vital to understanding and reasoning over complex events. It is important to be able to acquire the knowledge needed for this understanding, though doing so is challenging. We argue that such knowledge can be elicited through a participant achievement lens. We analyze a complex event in a narrative according to the intended achievements of the participants in that narrative, the likely future actions of the participants, and the likelihood of goal success. We collect 6.3K high quality goal and action annotations reflecting our proposed participant achievement lens, with an average weighted Fleiss-Kappa IAA of 80%. Our collection contains annotated alternate versions of each narrative. These alternate versions vary minimally from the “original” story, but can license drastically different inferences. Our findings suggest that while modern large language models can reflect some of the goal-based knowledge we study, they find it challenging to fully capture the design and intent behind concerted actions, even when the model pretraining included the data from which we extracted the goal knowledge. We show that smaller models fine-tuned on our dataset can achieve performance surpassing larger models.
- 2024.findings-acl.910
+ 2024.findings-acl.910
vallurupalli-etal-2024-saga
10.18653/v1/2024.findings-acl.910
@@ -18903,7 +18903,7 @@
Liang Zhan, University of Pittsburgh
15421-15435
The long-standing one-to-many problem of gold standard responses in open-domain dialogue systems presents challenges for automatic evaluation metrics. Though prior works have demonstrated some success by applying powerful Large Language Models (LLMs), existing approaches still struggle with the one-to-many problem, and exhibit subpar performance in domain-specific scenarios. We assume the commonsense reasoning biases within LLMs may hinder their performance in domain-specific evaluations. To address both issues, we propose a novel framework SLIDE (Small and Large Integrated for Dialogue Evaluation), that leverages both a small, specialised model (SLM) and LLMs for the evaluation of open-domain dialogues. Our approach introduces several techniques: (1) Contrastive learning to differentiate between robust and non-robust response embeddings; (2) A novel metric for semantic sensitivity that combines embedding cosine distances with similarity learned through neural networks; and (3) A strategy for incorporating the evaluation results from both the SLM and LLMs. Our empirical results demonstrate that our approach achieves state-of-the-art performance in both the classification and evaluation tasks, and additionally the SLIDE evaluator exhibits better correlation with human judgements. Our code is available at https://github.com/hegehongcha/SLIDE-ACL2024.
- 2024.findings-acl.911
+ 2024.findings-acl.911
zhao-etal-2024-slide
10.18653/v1/2024.findings-acl.911
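The SLIDE entry above combines embedding cosine distances with a similarity learned by a neural network. The sketch below shows one plausible shape for such a combined metric; the two-layer head, sigmoid squashing, and the fixed mixing weight `alpha` are all assumptions for illustration, not the authors' architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SemanticSensitivity(nn.Module):
    """Blend embedding cosine similarity with a small learned similarity head."""

    def __init__(self, dim: int, alpha: float = 0.5):
        super().__init__()
        self.alpha = alpha
        self.head = nn.Sequential(
            nn.Linear(2 * dim, dim), nn.ReLU(), nn.Linear(dim, 1)
        )

    def forward(self, ref: torch.Tensor, resp: torch.Tensor) -> torch.Tensor:
        # Geometric component: cosine similarity of the two embeddings.
        cos = F.cosine_similarity(ref, resp, dim=-1)
        # Learned component: a scalar similarity from a trainable head.
        learned = torch.sigmoid(self.head(torch.cat([ref, resp], dim=-1))).squeeze(-1)
        return self.alpha * cos + (1 - self.alpha) * learned
```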
@@ -18918,7 +18918,7 @@
Kyunghoon Bae
15436-15452
Instruction tuning has emerged as a powerful technique, significantly boosting zero-shot performance on unseen tasks. While recent work has explored cross-lingual generalization by applying instruction tuning to multilingual models, previous studies have primarily focused on English, with limited exploration of non-English tasks. For an in-depth exploration of cross-lingual generalization in instruction tuning, we perform instruction tuning individually for two distinct language meta-datasets. Subsequently, we assess performance on unseen tasks in a language different from the one used for training. To facilitate this investigation, we introduce a novel non-English meta-dataset named “KORANI” (Korean Natural Instruction), comprising 51 Korean benchmarks. Moreover, we design cross-lingual templates to mitigate discrepancies in the language and instruction format of the template between training and inference within the cross-lingual setting. Our experiments reveal consistent improvements through cross-lingual generalization in both English and Korean, outperforming the baseline by average scores of 20.7% and 13.6%, respectively. Remarkably, these enhancements are comparable to those achieved by monolingual instruction tuning and even surpass them in some tasks. The results underscore the significance of relevant data acquisition across languages over linguistic congruence with unseen tasks during instruction tuning.
- 2024.findings-acl.912
+ 2024.findings-acl.912
han-etal-2024-deep
10.18653/v1/2024.findings-acl.912
@@ -18929,7 +18929,7 @@
Saku Sugawara, National Institute of Informatics
15453-15467
Psycholinguistic research suggests that humans may build a representation of linguistic input that is ‘good-enough’ for the task at hand. This study examines what architectural features make language models learn human-like good-enough language processing. We focus on the number of layers and self-attention heads in Transformers. We create a good-enough language processing (GELP) evaluation dataset (7,680 examples), which is designed to test the effects of two plausibility types, eight construction types, and three degrees of memory cost on language processing. To annotate GELP, we first conduct a crowdsourcing experiment whose design follows prior psycholinguistic studies. Our model evaluation against the annotated GELP then reveals that the full model as well as models with fewer layers and/or self-attention heads exhibit a good-enough performance. This result suggests that models with shallower depth and fewer heads can learn good-enough language processing.
- 2024.findings-acl.913
+ 2024.findings-acl.913
asami-sugawara-2024-makes
10.18653/v1/2024.findings-acl.913
@@ -18944,7 +18944,7 @@
Shikun Zhang, Peking University
15468-15480
Chinese Spelling Correction (CSC) commonly lacks large-scale high-quality corpora, due to the labor-intensive labeling of spelling errors in real-life human writing or typing scenarios. Two data augmentation methods are widely adopted: (1) *Random Replacement* with the guidance of confusion sets and (2) *OCR/ASR-based Generation* that simulates character misusing. However, both methods inevitably introduce noisy data (e.g., false spelling errors), potentially leading to over-correction. By carefully analyzing the two types of corpora, we find that though the latter achieves more robust generalization performance, the former yields better-calibrated CSC models. We then provide a theoretical analysis of this empirical observation, based on which a corpus refining strategy is proposed. Specifically, OCR/ASR-based data samples are fed into a well-calibrated CSC model trained on random replacement-based corpora and then filtered based on prediction confidence. By learning a simple BERT-based model on the refined OCR/ASR-based corpus, we set up impressive state-of-the-art performance on three widely-used benchmarks, while significantly alleviating over-correction (e.g., lowering false positive predictions).
- 2024.findings-acl.914
+ 2024.findings-acl.914
yu-etal-2024-refining
10.18653/v1/2024.findings-acl.914
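The corpus-refining strategy above feeds OCR/ASR-based samples to a well-calibrated CSC model and keeps only the confident ones. A small sketch of that filtering loop follows; `calibrated_model` returning a (prediction, confidence) pair and the 0.9 threshold are assumptions about the interface, not the paper's code.

```python
def refine_corpus(samples, calibrated_model, threshold=0.9):
    """Filter OCR/ASR-generated CSC samples by a calibrated model's confidence.

    samples: (source, target) sentence pairs from OCR/ASR-based generation.
    calibrated_model(source) -> (prediction, confidence) is assumed to wrap a
    CSC model trained on random-replacement corpora, which the paper found to
    be better calibrated. Low-confidence pairs are treated as probable noise
    (e.g. false spelling errors) and dropped.
    """
    refined = []
    for source, target in samples:
        _, confidence = calibrated_model(source)
        if confidence >= threshold:
            refined.append((source, target))
    return refined
```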
@@ -18956,7 +18956,7 @@
Yong Jae Lee, Department of Computer Sciences, University of Wisconsin - Madison and Cruise
15481-15495
We propose CounterCurate, a framework to comprehensively improve the visio-linguistic compositional reasoning capability of both contrastive and generative multimodal models. In particular, we identify two critical under-explored problems: the neglect of physically grounded reasoning (counting and position understanding) and the potential of using highly capable text and image generation models for semantic counterfactual fine-tuning. Our work pioneers an approach to addressing these gaps. We first spotlight the near-chance performance of multimodal models like CLIP and LLaVA in physically grounded compositional reasoning. We then apply simple data augmentation using the grounded image generation model GLIGEN to generate fine-tuning data, resulting in significant performance improvements: +33% and +37% for CLIP and LLaVA, respectively, on our newly curated Flickr30k-Positions benchmark. Moreover, we exploit the capabilities of high-performing text generation and image generation models, specifically GPT-4V and DALLE-3, to curate challenging semantic counterfactuals, thereby further enhancing compositional reasoning capabilities on benchmarks such as SugarCrepe, where CounterCurate outperforms GPT-4V. To facilitate future research, we release our code, dataset, benchmark, and checkpoints at https://countercurate.github.io/
- 2024.findings-acl.915
+ 2024.findings-acl.915
zhang-etal-2024-countercurate
10.18653/v1/2024.findings-acl.915
@@ -18975,7 +18975,7 @@
Carl Yang, Emory University
15496-15523
Clinical natural language processing faces challenges like complex medical terminology and clinical contexts. Recently, large language models (LLMs) have shown promise in this domain. Yet, their direct deployment can lead to privacy issues and are constrained by resources. To address this challenge, we delve into synthetic clinical text generation with LLMs for clinical NLP tasks. We propose an innovative, resource-efficient approach, ClinGen, which infuses knowledge into the process. Our model involves clinical knowledge extraction and context-informed LLM prompting. Both clinical topics and writing styles are drawn from external domain-specific knowledge graphs and LLMs to guide data generation. Our extensive empirical study across 8 clinical NLP tasks and 18 datasets reveals that ClinGen consistently enhances performance across various tasks by 7.7%-8.7% on average, effectively aligning the distribution of real datasets and enriching the diversity of generated training instances.
- 2024.findings-acl.916
+ 2024.findings-acl.916
xu-etal-2024-knowledge
10.18653/v1/2024.findings-acl.916
@@ -18989,7 +18989,7 @@
Ann Lee (Facebook)
15524-15541
In this paper, we propose a textless acoustic model with a self-supervised distillation strategy for noise-robust expressive speech-to-speech translation (S2ST). Recently proposed expressive S2ST systems have achieved impressive expressivity preservation performance by cascading a unit-to-speech (U2S) generator to the speech-to-unit translation model. However, these systems are vulnerable to the presence of noise in input speech, which is to be expected in real-world translation scenarios. To address this limitation, we propose a U2S generator that incorporates a distillation with no label (DINO) self-supervised training strategy into its pretraining process. Because the proposed method captures noise-agnostic expressivity representations, it can generate quality speech even in noisy environments. Objective and subjective evaluation results verified that the proposed method significantly improved the performance of the expressive S2ST system in noisy environments while maintaining competitive performance in clean environments.
- 2024.findings-acl.917
+ 2024.findings-acl.917
hwang-etal-2024-textless
10.18653/v1/2024.findings-acl.917
@@ -19002,7 +19002,7 @@
Xiaozhong Liu (Worcester Polytechnic Institute)
15542-15555
The integration of generative Large Language Models (LLMs) into various applications, including the legal domain, has been accelerated by their expansive and versatile nature. However, when facing a legal case, users without a legal background often struggle to formulate professional queries and may inadvertently overlook critical legal factors when presenting their case narrative to LLMs. To address this issue, we propose the Diagnostic Legal Large Language Model (D3LM), which utilizes adaptive lawyer-like diagnostic questions to collect additional case information and then provides high-quality feedback. D3LM incorporates an innovative graph-based Positive-Unlabeled Reinforcement Learning (PURL) algorithm, enabling the generation of critical questions and enhancing user-LLM interactions. Moreover, an integrated LLM-based stopping criterion facilitates precise Court Views Generation (CVG). Our research also introduces a new English-language CVG dataset based on the US case law database, enriching the realm of LLM research and deployment with a vital dimension. D3LM surpasses classical LLMs by delivering outstanding performance and a remarkable user experience in the legal domain.
- 2024.findings-acl.918
+ 2024.findings-acl.918
wu-etal-2024-knowledge
10.18653/v1/2024.findings-acl.918
@@ -19014,7 +19014,7 @@
Haoliang Li (City University of Hong Kong)
15556-15583
The proliferation of fake news has emerged as a severe societal problem, raising significant interest from industry and academia. While existing deep-learning based methods have made progress in detecting fake news accurately, their reliability may be compromised by non-transparent reasoning processes, poor generalization abilities and inherent risks of integration with large language models (LLMs). To address this challenge, we propose TELLER, a novel framework for trustworthy fake news detection that prioritizes explainability, generalizability and controllability of models. This is achieved via a dual-system framework that integrates cognition and decision systems, adhering to the principles above. The cognition system harnesses human expertise to generate logical predicates, which guide LLMs in generating human-readable logic atoms. Meanwhile, the decision system deduces generalizable logic rules to aggregate these atoms, enabling the identification of the truthfulness of the input news across diverse domains and enhancing transparency in the decision-making process. Finally, we present comprehensive evaluation results on four datasets, demonstrating the feasibility and trustworthiness of our proposed framework.
- 2024.findings-acl.919
+ 2024.findings-acl.919
liu-etal-2024-teller
10.18653/v1/2024.findings-acl.919
@@ -19025,7 +19025,7 @@
Lu Wang (Northeastern University, Northeastern University and University of Michigan)
15584-15596
Verifiable generation requires large language models (LLMs) to cite source documents supporting their outputs, thereby improving output transparency and trustworthiness. Yet, previous work mainly targets the generation of sentence-level citations, lacking specificity about which parts of a sentence are backed by the cited sources. This work studies verifiable generation with subsentence-level fine-grained citations for more precise location of generated content supported by the cited sources. We first present a dataset, SCiFi, comprising 10K Wikipedia paragraphs with subsentence-level citations. Each paragraph is paired with a set of candidate source documents for citation and a query that triggers the generation of the paragraph content. On SCiFi, we evaluate the performance of state-of-the-art LLMs and strategies for processing long documents designed for these models. Our experimental results reveal key factors that could enhance the quality of citations, including the expansion of the source documents’ context accessible to the models and the implementation of specialized model tuning.
- 2024.findings-acl.920
+ 2024.findings-acl.920
cao-wang-2024-verifiable
10.18653/v1/2024.findings-acl.920
@@ -19040,7 +19040,7 @@
Niket Tandon (Allen Institute for Artificial Intelligence)
15597-15611
How-to procedures, such as how to plant a garden, are now used by millions of users, but sometimes need customizing to meet a user’s specific needs, e.g., planting a garden without pesticides. Our goal is to measure and improve an LLM’s ability to perform such customization. Our approach is to test several simple multi-LLM-agent architectures for customization, as well as an end-to-end LLM, using a new evaluation set, called CustomPlans, of over 200 WikiHow procedures each with a customization need. We find that a simple architecture with two LLM agents used sequentially performs best, one that edits a generic how-to procedure and one that verifies its executability, significantly outperforming (10.5% absolute) an end-to-end prompted LLM. This suggests that LLMs can be configured reasonably effectively for procedure customization. This also suggests that multi-agent editing architectures may be worth exploring further for other customization applications (e.g., coding, creative writing) in the future.
- 2024.findings-acl.921
+ 2024.findings-acl.921
lal-etal-2024-tailoring
10.18653/v1/2024.findings-acl.921
@@ -19051,7 +19051,7 @@
Lav Varshney (University of Illinois at Urbana-Champaign)
15612-15622
The Transformer architecture has become prominent in developing large causal language models. However, mechanisms to explain its capabilities are not well understood. Focused on the training process, here we establish a meta-learning view of the Transformer architecture when trained for the causal language modeling task, by explicating an inner optimization process that may happen within the Transformer. Further, from within the inner optimization, we discover and theoretically analyze a special characteristic of the norms of learned token representations within Transformer-based causal language models. Our analysis is supported by experiments conducted on pre-trained large language models and real-world data.
- 2024.findings-acl.922
+ 2024.findings-acl.922
wu-varshney-2024-meta
10.18653/v1/2024.findings-acl.922
@@ -19070,7 +19070,7 @@
Chao Zhang (Georgia Institute of Technology)
15623-15636
Large Language Models (LLMs) have exhibited impressive capabilities in various tasks, yet their vast parameter sizes restrict their applicability in resource-constrained settings. Knowledge distillation (KD) offers a viable solution by transferring expertise from large teacher models to compact student models. However, traditional KD techniques face specific challenges when applied to LLMs, including restricted access to LLM outputs, significant teacher-student capacity gaps, and the inherited mis-calibration issue. In this work, we present PLaD, a novel preference-based LLM distillation framework. PLaD exploits the teacher-student capacity discrepancy to generate pseudo-preference pairs where teacher outputs are preferred over student outputs. Then, PLaD leverages a ranking loss to re-calibrate the student’s estimation of sequence likelihood, which steers the student’s focus towards understanding the relative quality of outputs instead of simply imitating the teacher. PLaD bypasses the need for access to teacher LLM’s internal states, tackles the student’s expressivity limitations, and mitigates the student mis-calibration issue. Through extensive experiments on two sequence generation tasks and with various LLMs, we demonstrate the effectiveness of our proposed PLaD framework.
- 2024.findings-acl.923
+ 2024.findings-acl.923
zhang-etal-2024-plad
10.18653/v1/2024.findings-acl.923
@@ -19085,7 +19085,7 @@
Lu Wang (Northeastern University, Northeastern University and University of Michigan)
15637-15653
Self-correction has emerged as a promising solution to boost the reasoning performance of large language models (LLMs), where LLMs refine their solutions using self-generated critiques that pinpoint the errors. This work explores whether small (≤13B) language models (LMs) have the ability of self-correction on reasoning tasks with minimal inputs from stronger LMs. We propose a novel pipeline that prompts smaller LMs to collect self-correction data that supports the training of self-refinement abilities. First, we leverage correct solutions to guide the model in critiquing their incorrect responses. Second, the generated critiques, after filtering, are used for supervised fine-tuning of the self-correcting reasoner through solution refinement. Our experimental results show improved self-correction abilities of two models on five datasets spanning math and commonsense reasoning, with notable performance gains when paired with a strong GPT-4-based verifier, though limitations are identified when using a weak self-verifier for determining when to correct.
- 2024.findings-acl.924
+ 2024.findings-acl.924
zhang-etal-2024-small
10.18653/v1/2024.findings-acl.924
@@ -19100,7 +19100,7 @@
Lei Li (School of Computer Science, Carnegie Mellon University)
15654-15669
How can large language models (LLMs) process and translate endangered languages? Many languages lack a large corpus to train a decent LLM; therefore, existing LLMs rarely perform well in unseen, endangered languages. In contrast, we observe that 2000 endangered languages, though without a large corpus, have a grammar book or a dictionary. We propose LingoLLM, a training-free approach to enable an LLM to process unseen languages that hardly occur in its pre-training. Our key insight is to demonstrate linguistic knowledge of an unseen language in an LLM’s prompt, including a dictionary, a grammar book, and morphologically analyzed input text. We implement LingoLLM on top of two models, GPT-4 and Mixtral, and evaluate their performance on 5 tasks across 8 endangered or low-resource languages. Our results show that LingoLLM elevates translation capability from GPT-4’s 0 to 10.5 BLEU for 10 language directions. Our findings demonstrate the tremendous value of linguistic knowledge in the age of LLMs for endangered languages. Our data, code, and model generations can be found at https://github.com/LLiLab/llm4endangeredlang.
- 2024.findings-acl.925
+ 2024.findings-acl.925
zhang-etal-2024-hire
10.18653/v1/2024.findings-acl.925
@@ -19112,7 +19112,7 @@
Klinton Bicknell (Duolingo)
15670-15693
We study the problem of controlling the difficulty level of text generated by Large Language Models (LLMs) for contexts where end-users are not fully proficient, such as language learners. Using a novel framework, we evaluate the effectiveness of several key approaches for this task, including few-shot prompting, supervised finetuning, and reinforcement learning (RL), utilising both GPT-4 and open source alternatives like LLama2-7B and Mistral-7B. Our findings reveal a large performance gap between GPT-4 and the open source models when using prompt-based strategies. However, we show how to bridge this gap with a careful combination of finetuning and RL alignment. Our best model, CALM (CEFR-Aligned Language Model), surpasses the performance of GPT-4 and other strategies, at only a fraction of the cost. We further validate the quality of our results through a small-scale human study.
- 2024.findings-acl.926
+ 2024.findings-acl.926
malik-etal-2024-tarzan
10.18653/v1/2024.findings-acl.926
@@ -19127,7 +19127,7 @@
Golnoosh Farnadi
15694-15710
Recent progress in large language models (LLMs) has led to their widespread adoption in various domains. However, these advancements have also introduced additional safety risks and raised concerns regarding their detrimental impact on already marginalized populations. Despite growing mitigation efforts to develop safety safeguards, such as supervised safety-oriented fine-tuning and leveraging safe reinforcement learning from human feedback, multiple concerns regarding the safety and ingrained biases in these models remain. Furthermore, previous work has demonstrated that models optimized for safety often display exaggerated safety behaviors, such as a tendency to refrain from responding to certain requests as a precautionary measure. As such, a clear trade-off between the helpfulness and safety of these models has been documented in the literature. In this paper, we further investigate the effectiveness of safety measures by evaluating models on already mitigated biases. Using the case of Llama 2 as an example, we illustrate how LLMs’ safety responses can still encode harmful assumptions. To do so, we create a set of non-toxic prompts, which we then use to evaluate Llama models. Through our new taxonomy of LLM responses to users, we observe that the safety/helpfulness trade-offs are more pronounced for certain demographic groups, which can lead to different kinds of harms such as quality-of-service harms for marginalized populations.
- 2024.findings-acl.927
+ 2024.findings-acl.927
chehbouni-etal-2024-representational
10.18653/v1/2024.findings-acl.927
@@ -19139,7 +19139,7 @@
Deyi Xiong (Tianjin University)
15711-15724
Assessing the capabilities of large language models (LLMs) as agents in decision making and operational tasks is crucial for the development of LLM-as-agent services. We propose CToolEval, a benchmark designed to evaluate LLMs in the context of Chinese societal applications, featuring 398 APIs across 27 widely-used Apps (e.g., Apps for shopping, map, music, travel, etc.) that cover 14 domains. We further present an evaluation framework that simulates real-life scenarios, to facilitate the assessment of the tool invocation ability of LLMs for tool learning and the task completion ability for user interaction. Our extensive experiments with CToolEval evaluate 11 LLMs, revealing that while GPT-3.5-turbo excels in tool invocation, Chinese LLMs usually struggle with issues like hallucination and a lack of comprehensive tool understanding. Our findings highlight the need for further refinement in the decision-making capabilities of LLMs, offering insights into bridging the gap between current functionalities and agent-level performance. To promote further research for LLMs to fully act as reliable agents in complex, real-world situations, we release our data and codes at https://github.com/tjunlp-lab/CToolEval.
- 2024.findings-acl.928
+ 2024.findings-acl.928
guo-etal-2024-ctooleval
10.18653/v1/2024.findings-acl.928
@@ -19158,7 +19158,7 @@
Bing Xiang (Amazon)
15725-15738
Generative models, widely utilized in various applications, can often struggle with prompts corresponding to partial tokens. This struggle stems from tokenization, where partial tokens fall out of distribution during inference, leading to incorrect or nonsensical outputs. This paper examines a technique to alleviate the tokenization artifact on text completion in generative models, maintaining performance even in regular non-subword cases. The method, termed token alignment, involves backtracking to the last complete tokens and ensuring the model’s generation aligns with the prompt. This approach showcases marked improvement across many partial token scenarios, including nuanced cases like space-prefix and partial indentation, with only a minor time increase. The technique and analysis detailed in this paper contribute to the continuous advancement of generative models in handling partial inputs, bearing relevance for applications like code completion and text.
- 2024.findings-acl.929
+ 2024.findings-acl.929
athiwaratkun-etal-2024-token
10.18653/v1/2024.findings-acl.929
@@ -19171,7 +19171,7 @@
Arman Cohan (Yale University and Allen Institute for Artificial Intelligence)
15739-15746
Evaluating multilingual summarization evaluation metrics, i.e., meta-evaluation, is challenging because of the difficulty of human annotation collection. Therefore, we investigate an efficient multilingual meta-evaluation framework that uses machine translation systems to transform a monolingual meta-evaluation dataset into multilingual versions. To this end, we introduce a statistical test to verify the transformed dataset quality by checking the meta-evaluation result consistency on the original dataset and back-translated dataset. With this quality verification method, we transform an existing English summarization meta-evaluation dataset, RoSE, into 30 languages, and conduct a multilingual meta-evaluation of several representative automatic evaluation metrics. In our meta-evaluation, we find that metric performance varies in different languages and neural metrics generally outperform classical text-matching-based metrics in non-English languages. Moreover, we identify a two-stage evaluation method with superior performance, which first translates multilingual texts into English and then performs evaluation. We make the transformed datasets publicly available to facilitate future research.
- 2024.findings-acl.930
+ 2024.findings-acl.930
han-etal-2024-rethinking
10.18653/v1/2024.findings-acl.930
@@ -19186,7 +19186,7 @@
Xie Chen
15747-15760
We propose emotion2vec, a universal speech emotion representation model. emotion2vec is pre-trained on open-source unlabeled emotion data through self-supervised online distillation, combining utterance-level loss and frame-level loss during pre-training. emotion2vec outperforms state-of-the-art pre-trained universal models and emotion specialist models by only training linear layers for the speech emotion recognition task on the mainstream IEMOCAP dataset. In addition, emotion2vec shows consistent improvements across speech emotion recognition datasets in 10 different languages. emotion2vec also shows excellent results on other emotion tasks, such as song emotion recognition, emotion prediction in conversation, and sentiment analysis. Comparison experiments, ablation experiments, and visualization comprehensively demonstrate the universal capability of the proposed emotion2vec. To the best of our knowledge, emotion2vec is the first universal representation model in various emotion-related tasks, filling a gap in the field.
- 2024.findings-acl.931
+ 2024.findings-acl.931
ma-etal-2024-emotion2vec
10.18653/v1/2024.findings-acl.931
@@ -19197,7 +19197,7 @@
Prasad Tadepalli (Oregon State University and Oregon State University)
15761-15772
Beam search decoding is the de-facto method for decoding auto-regressive Neural Machine Translation (NMT) models, including multilingual NMT where the target language is specified as an input. However, decoding multilingual NMT models commonly produces off-target translations – yielding translation outputs not in the intended language. In this paper, we first conduct an error analysis of off-target translations for a strong multilingual NMT model and identify how these decodings are produced during beam search. We then propose Language-informed Beam Search (LiBS), a general decoding algorithm incorporating an off-the-shelf Language Identification (LiD) model into beam search decoding to reduce off-target translations. LiBS is an inference-time procedure that is NMT-model agnostic and does not require any additional parallel data. Results show that our proposed LiBS algorithm on average improves +1.1 BLEU and +0.9 BLEU on WMT and OPUS datasets, and reduces off-target rates from 22.9% to 7.7% and 65.8% to 25.3% respectively.
- 2024.findings-acl.932
+ 2024.findings-acl.932
yang-etal-2024-language-informed
10.18653/v1/2024.findings-acl.932
@@ -19209,7 +19209,7 @@
Jungwook Choi (Hanyang University)
15773-15786
Deploying large language models (LLMs) with their extensive parameters and high memory demands challenges computational efficiency, particularly in fine-tuning for specific applications with limited resources. Techniques like Low-Rank Adaptation (LoRA) help by training a smaller, modifiable extension of the base model to reduce memory usage. However, combining quantization with LoRA, especially in low-bit scenarios, can lead to performance losses due to quantization errors. Our innovative Rank-Adaptive LoRA (RA-LoRA) addresses this by dynamically adjusting the adapter’s rank using rank-subspace analysis, optimizing performance with fewer parameters. We tested RA-LoRA on state-of-the-art LLMs for 2-bit efficient fine-tuning, showing it can improve model accuracy with minimal trainable parameters, marking a leap forward in quantization-aware fine-tuning methods and highlighting the significance of rank dynamics in optimizing quantized LLMs.
- 2024.findings-acl.933
+ 2024.findings-acl.933
kim-etal-2024-ra
10.18653/v1/2024.findings-acl.933
@@ -19220,7 +19220,7 @@
Wei Wang (University of California, Los Angeles)
15787-15803
Social networks have become ideal vehicles for news dissemination because posted content can easily reach users beyond a news outlet’s direct audience. Understanding how information is transmitted among communities of users is a critical step towards understanding the impact social networks have on real-world events. Two significant barriers in this vein of work are identifying user clusters and meaningfully characterizing these communities. Thus, we propose the PGNSC benchmark, which builds information pathways based on the audiences of influential news sources and uses their content to characterize the communities. We present methods of aggregating these news-source-centric communities and for constructing the community feature representations that are used sequentially to construct information pathway prediction pipelines. Lastly, we perform extensive experiments to demonstrate the performance of baseline pipeline constructions and to highlight the possibilities for future work.
- 2024.findings-acl.934
+ 2024.findings-acl.934
taylor-wang-2024-pgnsc
10.18653/v1/2024.findings-acl.934
@@ -19232,7 +19232,7 @@
Shivam Ratnakar (International Business Machines)
15804-15819
Interactive fiction games have emerged as an important application to improve the generalization capabilities of language-based reinforcement learning (RL) agents. Existing environments for interactive fiction games are domain-specific or time-consuming to generate and do not train the RL agents to master a specific set of skills. In this work, we introduce an interactive environment for self-supervised RL, STARLING, for text-based games that bootstraps the text-based RL agents with automatically generated games (based on the seed set of game ideas) to boost the performance and generalization capabilities to reach a goal of the target environment. These games let the agent hone its skills on a predefined set of tasks. We create and test an environment with 100 games, generated using this automated framework that uses large language models (GPT-3) and an interactive fiction game engine (based on Inform7) to provide the user with the ability to generate more games under minimal human supervision. Experimental results based on both the human participants and baseline text-based RL agents reveal that current state-of-the-art text-based RL agents cannot use previously learned skills in new situations at the level humans can. These results reinforce STARLING’s potential to serve as a sandbox environment for further research in self-supervised text-based RL.
- 2024.findings-acl.935
+ 2024.findings-acl.935
basavatia-etal-2024-starling
10.18653/v1/2024.findings-acl.935
@@ -19245,7 +19245,7 @@
Jaegul Choo (Korea Advanced Institute of Science and Technology)
15820-15839
Although language models (LMs) demonstrate exceptional capabilities on various tasks, they are potentially vulnerable to extraction attacks, which represent a significant privacy risk. To mitigate the privacy concerns of LMs, machine unlearning has emerged as an important research area, which is utilized to induce the LM to selectively forget about some of its training data. While completely retraining the model will guarantee successful unlearning and privacy assurance, it is impractical for LMs, as it would be time-consuming and resource-intensive. Prior works efficiently unlearn the target token sequences, but upon subsequent iterations, the LM displays significant degradation in performance. In this work, we propose Privacy Protection via Optimal Parameters (POP), a novel unlearning method that effectively forgets the target token sequences from the pretrained LM by applying optimal gradient updates to the parameters. Inspired by the gradient derivation of complete retraining, we approximate the optimal training objective that successfully unlearns the target sequence while retaining the knowledge from the rest of the training data. Experimental results demonstrate that POP exhibits remarkable retention performance post-unlearning across 9 classification and 4 dialogue benchmarks, outperforming the state-of-the-art by a large margin. Furthermore, we introduce Remnant Memorization Accuracy that quantifies privacy risks based on token likelihood and validate its effectiveness through both qualitative and quantitative analyses.
- 2024.findings-acl.936
+ 2024.findings-acl.936
lee-etal-2024-protecting
10.18653/v1/2024.findings-acl.936
@@ -19258,7 +19258,7 @@
Chris Biemann (U Hamburg)
15840-15853
Large Vision-Language Models (LVLMs) are increasingly adept at generating contextually detailed and coherent responses from visual inputs. However, their application in multimodal decision-making and open-ended generation is hindered by a notable rate of hallucinations, where generated text inaccurately represents the visual contents. To address this issue, this paper introduces the Instruction Contrastive Decoding (ICD) method, a novel approach designed to reduce hallucinations during LVLM inference. Our method is inspired by our observation that what we call disturbance instructions significantly exacerbate hallucinations in multimodal fusion modules. ICD contrasts distributions from standard and instruction disturbance, thereby increasing alignment uncertainty and effectively subtracting hallucinated concepts from the original distribution. Through comprehensive experiments on discriminative benchmarks (POPE and MME) and a generative benchmark (LLaVa-Bench), we demonstrate that ICD significantly mitigates both object-level and attribute-level hallucinations. Moreover, our method not only addresses hallucinations but also significantly enhances the general perception and recognition capabilities of LVLMs.
- 2024.findings-acl.937
+ 2024.findings-acl.937
wang-etal-2024-mitigating
10.18653/v1/2024.findings-acl.937
@@ -19272,7 +19272,7 @@
Sheng Zha (Amazon)
15854-15868
Handling drafty partial code remains a notable challenge in real-time code suggestion applications. Previous work has demonstrated shortcomings of large language models of code (CodeLLMs) in completing partial code with potential bugs. In this study, we view partial code as implementation hints and fine-tune CodeLLMs to jointly rewrite and complete partial code into functional full programs. We explore two strategies: one-pass generation and multi-pass iterative refinement. We construct new training and testing datasets using semantic-altering code transformations and iterative self-generations. We conduct comprehensive experiments over three representative open-sourced CodeLLMs – InCoder, CodeGen, and StarCoder. Results show that CodeLLMs fine-tuned using our approach achieve superior pass rates compared to the previous baselines across existing and newly-created benchmarks, effectively handle both potentially buggy and clean code, and largely preserve the integrity of the original partial implementations. We further present findings on the properties of the potential bugs we tested and on the design choices of our methods.
- 2024.findings-acl.938
+ 2024.findings-acl.938
wang-etal-2024-fine-tuning
10.18653/v1/2024.findings-acl.938
@@ -19284,7 +19284,7 @@
Mira Mezini (Technische Universität Darmstadt)
15869-15889
Large Language Models trained on code corpora (code-LLMs) have demonstrated impressive performance in various coding assistance tasks. However, despite their increased size and training dataset, code-LLMs still have limitations such as suggesting code with syntactic errors, variable misuse, etc. Some studies argue that code-LLMs perform well on coding tasks because they use self-attention and hidden representations to encode relations among input tokens. However, previous works have not studied what code properties are not encoded by code-LLMs. In this paper, we conduct a fine-grained analysis of attention maps and hidden representations of code-LLMs. Our study indicates that code-LLMs only encode relations among specific subsets of input tokens. Specifically, by categorizing input tokens into syntactic tokens and identifiers, we found that models encode relations among syntactic tokens and among identifiers, but they fail to encode relations between syntactic tokens and identifiers. We also found that fine-tuned models encode these relations poorly compared to their pre-trained counterparts. Additionally, larger models with billions of parameters encode significantly less information about code than models with only a few hundred million parameters.
- 2024.findings-acl.939
+ 2024.findings-acl.939
anand-etal-2024-critical
10.18653/v1/2024.findings-acl.939
@@ -19297,7 +19297,7 @@
Jianbing Shen (University of Macau)
15890-15902
In Large Visual Language Models (LVLMs), the efficacy of In-Context Learning (ICL) remains limited by challenges in cross-modal interactions and representation disparities. To overcome these challenges, we introduce a novel Visual In-Context Learning (VICL) method comprising Visual Demonstration Retrieval, Intent-Oriented Image Summarization, and Intent-Oriented Demonstration Composition. Our approach retrieves images via a “Retrieval & Rerank” paradigm, summarises images with task intent and task-specific visual parsing, and composes language-based demonstrations that reduce token count and alleviate the cross-modal interaction problem. Experimental evaluations on five visual reasoning datasets demonstrate the effectiveness of our method. Moreover, our extensive experiments leverage information flow analysis to elucidate the effectiveness of our method, and investigate the impact of the length and position of demonstrations for LVLMs. The use of in-context unlearning further shows promise in resetting specific model knowledge without retraining.
- 2024.findings-acl.940
+ 2024.findings-acl.940
zhou-etal-2024-visual
10.18653/v1/2024.findings-acl.940
@@ -19312,7 +19312,7 @@
Rui Yan (Renmin University of China)
15903-15918
In this paper, we introduce SCALE, a collaborative framework that connects a compact Specialized Translation Model (STM) and a general-purpose Large Language Model (LLM) as one unified translation engine. By introducing translation from the STM into the triplet in-context demonstrations, SCALE unlocks the refinement and pivoting abilities of the LLM, thus 1) mitigating the language bias of LLMs and the parallel data bias of STMs, 2) enhancing LLM speciality without sacrificing generality, and 3) facilitating continual learning in an LLM-tuning-free way. Our comprehensive experiments show that SCALE significantly outperforms both LLMs (GPT-4, GPT-3.5) and supervised models (NLLB, M2M) in either high-resource or challenging low-resource settings. Moreover, SCALE shows great scalability: by only updating the lightweight STM, it achieves consistent system improvement, with an average gain of 4 BLEURT points across 4 languages, without tuning the LLM. Interestingly, SCALE could also effectively exploit the existing language bias of LLMs by using an English-centric STM as a pivot to conduct translation between any language pairs, outperforming GPT-4 by an average of 6 COMET points across eight translation directions. Furthermore, we provide an in-depth analysis of SCALE’s robustness, translation characteristics, latency costs and inherent language bias, providing a solid foundation for future studies exploring the potential synergy between LLMs and more specialized models.
- 2024.findings-acl.941
+ 2024.findings-acl.941
cheng-etal-2024-scale
10.18653/v1/2024.findings-acl.941
@@ -19324,7 +19324,7 @@
Md Shad Akhtar (Indraprastha Institute of Information Technology, Delhi)
15919-15932
Healthcare Community Question Answering (CQA) forums offer an accessible platform for individuals seeking information on various healthcare-related topics. People find such platforms suitable for self-disclosure, seeking medical opinions, finding simplified explanations for their medical conditions, and answering others’ questions. However, answers on these forums are typically diverse and prone to off-topic discussions. It can be challenging for readers to sift through numerous answers and extract meaningful insights, making answer summarization a crucial task for CQA forums. While several efforts have been made to summarize the community answers, most of them are limited to the open domain and overlook the different perspectives offered by these answers. To address this problem, this paper proposes a novel task of perspective-specific answer summarization. We identify various perspectives within healthcare-related responses and frame a perspective-driven abstractive summary covering all responses. To achieve this, we annotate 3167 CQA threads with 6193 perspective-aware summaries in our PUMA dataset. Further, we propose PLASMA, a prompt-driven controllable summarization model. To encapsulate the perspective-specific conditions, we design an energy-controlled loss function for the optimization. We also leverage the prefix tuner to learn the intricacies of healthcare perspective summarization. Our evaluation against five baselines suggests the superior performance of PLASMA by a margin of ~1.5-21%. We supplement our experiments with ablation and qualitative analysis.
- 2024.findings-acl.942
+ 2024.findings-acl.942
naik-etal-2024-perspective
10.18653/v1/2024.findings-acl.942
@@ -19341,7 +19341,7 @@
Daxin Jiang (Microsoft)
15933-15946
We propose a simple method that applies a large language model (LLM) to large-scale retrieval in zero-shot scenarios. Our method, the Large language model as Retriever (LameR), is built upon no other neural models but an LLM in a retrieval-augmented retrieval fashion, while breaking brute-force combinations of retrievers with LLMs and lifting the performance of zero-shot retrieval to be very competitive on benchmark datasets. Essentially, we propose to augment a query with its potential answers by prompting LLMs with a composition of the query and the query’s in-domain candidates. The candidates, regardless of whether they are correct or wrong, are obtained by a vanilla retrieval procedure on the target collection. As a part of the prompts, they are likely to help the LLM generate more precise answers by pattern imitation or candidate summarization. Even if all the candidates are wrong, the prompts at least make the LLM aware of in-collection patterns and genres. Moreover, due to the low performance of a self-supervised retriever, the LLM-based query augmentation becomes less effective as the retriever bottlenecks the whole pipeline. Therefore, we propose to leverage a non-parametric lexicon-based method (e.g., BM25) as the retrieval module to capture query-document overlap in a literal fashion. As such, LameR makes the retrieval procedure transparent to the LLM, thus circumventing the bottleneck.
- 2024.findings-acl.943
+ 2024.findings-acl.943
shen-etal-2024-retrieval
10.18653/v1/2024.findings-acl.943
@@ -19351,11 +19351,11 @@
Jisun An (Indiana University)
Haewoon Kwak (Indiana University)
Muhammad Arslan Manzoor
- Zain Muhammad Mujahid
+ Zain Mujahid
Husrev Sencar (QCRI)
15947-15962
The present level of proliferation of fake, biased, and propagandistic content online has made it impossible to fact-check every single suspicious claim or article, either manually or automatically. An increasing number of scholars are focusing on a coarser granularity, aiming to profile entire news outlets, which allows fast identification of potential “fake news” by checking the reliability of their source. Source factuality is also an important element of systems for automatic fact-checking and “fake news” detection, as they need to assess the reliability of the evidence they retrieve online. Political bias detection, which in the Western political landscape is about predicting left-center-right bias, is an equally important topic, which has experienced a similar shift toward profiling entire news outlets. Moreover, there is a clear connection between the two, as highly biased media are less likely to be factual; yet, the two problems have been addressed separately. In this survey, we review the state of the art on media profiling for factuality and bias, arguing for the need to model them jointly. We also shed light on some of the major challenges for modeling bias and factuality jointly. We further discuss interesting recent advances in using different information sources and modalities, which go beyond the text of the articles the target news outlet has published. Finally, we discuss current challenges and outline future research directions.
- 2024.findings-acl.944
+ 2024.findings-acl.944
nakov-etal-2024-survey
10.18653/v1/2024.findings-acl.944
@@ -19367,7 +19367,7 @@
Mona Diab (Carnegie Mellon University)
15963-15977
Wavelet transforms, a powerful mathematical tool, have been widely used in different domains, including Signal and Image processing, to unravel intricate patterns, enhance data representation, and extract meaningful features from data. Tangible results from their application suggest that Wavelet transforms can be applied to NLP, capturing a variety of linguistic and semantic properties. In this paper, we empirically leverage the application of Discrete Wavelet Transforms (DWT) to word and sentence embeddings. We aim to showcase the capabilities of DWT in analyzing embedding representations at different levels of resolution and compressing them while maintaining their overall quality. We assess the effectiveness of DWT embeddings on semantic similarity tasks to show how DWT can be used to consolidate important semantic information in an embedding vector. We show the efficacy of the proposed paradigm using different embedding models, including large language models, on downstream tasks. Our results show that DWT can reduce the dimensionality of embeddings by 50-93% with almost no change in performance for semantic similarity tasks, while achieving superior accuracy in most downstream tasks. Our findings pave the way for applying DWT to improve NLP applications.
- 2024.findings-acl.945
+ 2024.findings-acl.945
salama-etal-2024-semantic
10.18653/v1/2024.findings-acl.945
@@ -19380,7 +19380,7 @@
Hogun Park (Sungkyunkwan University)
15978-15991
Multi-hop logical reasoning on knowledge graphs is a pivotal task in natural language processing, with numerous approaches aiming to answer First-Order Logic (FOL) queries. Recent geometry (e.g., box, cone) and probability (e.g., beta distribution)-based methodologies have effectively addressed complex FOL queries. However, a common challenge across these methods lies in determining accurate geometric bounds or probability parameters for these queries. The challenge arises because existing methods rely on linear sequential operations within their computation graphs, overlooking the logical structure of the query and the relation-induced information that can be gleaned from the relations of the query, which we call the context of the query. To address the problem, we propose a model-agnostic methodology that enhances the effectiveness of existing multi-hop logical reasoning approaches by fully integrating the context of the FOL query graph. Our approach distinctively discerns (1) the structural context inherent to the query structure and (2) the relation-induced context unique to each node in the query graph as delineated in the corresponding knowledge graph. This dual-context paradigm helps nodes within a query graph attain refined internal representations throughout the multi-hop reasoning steps. Through experiments on two datasets, our method consistently enhances the three multi-hop reasoning foundation models, achieving performance improvements of up to 19.5%. Our codes are available at https://github.com/kjh9503/caqr.
- 2024.findings-acl.946
+ 2024.findings-acl.946
kim-etal-2024-improving-multi
10.18653/v1/2024.findings-acl.946
@@ -19396,7 +19396,7 @@
Chao Zhang (Georgia Institute of Technology)
15992-16030
Although Large Language Models (LLMs) exhibit remarkable adaptability across domains, these models often fall short in structured knowledge extraction tasks such as named entity recognition (NER). This paper explores an innovative, cost-efficient strategy to harness LLMs with modest NER capabilities for producing superior NER datasets. Our approach diverges from the basic class-conditional prompts by instructing LLMs to self-reflect on the specific domain, thereby generating domain-relevant attributes (such as category and emotions for movie reviews), which are utilized for creating attribute-rich training data. Furthermore, we preemptively generate entity terms and then develop NER context data around these entities, effectively bypassing the LLMs’ challenges with complex structures. Our experiments across both general and niche domains reveal significant performance enhancements over conventional data generation methods while being more cost-effective than existing alternatives.
- 2024.findings-acl.947
+ 2024.findings-acl.947
heng-etal-2024-proggen
10.18653/v1/2024.findings-acl.947
@@ -19409,7 +19409,7 @@
Cho-Jui Hsieh (Google and University of California, Los Angeles)
16031-16046
Although many large language models (LLMs) have been trained to refuse harmful requests, they are still vulnerable to jailbreaking attacks, which rewrite the original prompt to conceal its harmful intent. In this paper, we propose a new method for defending LLMs against jailbreaking attacks by “backtranslation”. Specifically, given an initial response generated by the target LLM from an input prompt, our backtranslation prompts a language model to infer an input prompt that can lead to the response. The inferred prompt is called the backtranslated prompt, which tends to reveal the actual intent of the original prompt, since it is generated based on the LLM’s response and not directly manipulated by the attacker. We then run the target LLM again on the backtranslated prompt, and we refuse the original prompt if the model refuses the backtranslated prompt. We explain that the proposed defense provides several benefits in terms of effectiveness and efficiency. We empirically demonstrate that our defense significantly outperforms the baselines in the cases that are hard for them, and our defense also has little impact on the generation quality for benign input prompts. Our implementation is based on our library for LLM jailbreaking defense algorithms at https://github.com/YihanWang617/llm-jailbreaking-defense, and the code for reproducing our experiments is available at https://github.com/YihanWang617/LLM-Jailbreaking-Defense-Backtranslation.
- 2024.findings-acl.948
+ 2024.findings-acl.948
wang-etal-2024-defending
10.18653/v1/2024.findings-acl.948
@@ -19421,7 +19421,7 @@
Kentaro Inui (Mohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University)
16047-16062
Mitigating the generation of contradictory responses poses a substantial challenge in dialogue response generation. The quality and quantity of available contradictory response data play a vital role in suppressing these contradictions, offering two significant benefits. First, having access to large contradiction data enables a comprehensive examination of their characteristics. Second, data-driven methods to mitigate contradictions may be enhanced with large-scale contradiction data for training. Nevertheless, no attempt has been made to build an extensive collection of model-generated contradictory responses. In this paper, we build a large dataset of response generation models’ contradictions for the first time. Then, we acquire valuable insights into the characteristics of model-generated contradictions through an extensive analysis of the collected responses. Lastly, we also demonstrate how this dataset substantially enhances the performance of data-driven contradiction suppression methods.
- 2024.findings-acl.949
+ 2024.findings-acl.949
sato-etal-2024-large
10.18653/v1/2024.findings-acl.949
@@ -19436,7 +19436,7 @@
Mitsuhiro Okada
16063-16077
This paper explores the question of how accurately current large language models can perform logical reasoning in natural language, with an emphasis on whether these models exhibit reasoning biases similar to humans. Specifically, our study focuses on syllogistic reasoning, a form of deductive reasoning extensively studied in cognitive science as a natural form of human reasoning. We present a syllogism dataset called NeuBAROCO, which consists of syllogistic reasoning problems in English and Japanese. This dataset was originally designed for psychological experiments to assess human reasoning capabilities using various forms of syllogisms. Our experiments with leading large language models indicate that these models exhibit reasoning biases similar to humans, along with other error tendencies. Notably, there is significant room for improvement in reasoning problems where the relationship between premises and hypotheses is neither entailment nor contradiction. We also present experimental results and in-depth analysis using a new Chain-of-Thought prompting method, which asks LLMs to translate syllogisms into abstract logical expressions and then explain their reasoning process. Our analysis using this method suggests that the primary limitations of LLMs lie in the reasoning process itself rather than the interpretation of syllogisms.
- 2024.findings-acl.950
+ 2024.findings-acl.950
ozeki-etal-2024-exploring
10.18653/v1/2024.findings-acl.950
@@ -19452,7 +19452,7 @@
Arman Cohan (Yale University and Allen Institute for Artificial Intelligence)
16078-16092
Data contamination has garnered increased attention in the era of Large language models (LLMs) due to the reliance on extensive internet-derived training corpora. The issue of training corpus overlap with evaluation benchmarks—referred to as contamination—has been the focus of significant recent research. This body of work aims to identify contamination, understand its impacts, and explore mitigation strategies from diverse perspectives. However, comprehensive studies that provide a clear pathway from foundational concepts to advanced insights are lacking in this nascent field. Therefore, we present the first survey in the field of data contamination. We begin by examining the effects of data contamination across various stages and forms. We then provide a detailed analysis of current contamination detection methods, categorizing them to highlight their focus, assumptions, strengths, and limitations. We also discuss mitigation strategies, offering a clear guide for future research. This survey serves as a succinct overview of the most recent advancements in data contamination research, providing a straightforward guide for the benefit of future research endeavors.
- 2024.findings-acl.951
+ 2024.findings-acl.951
deng-etal-2024-unveiling
10.18653/v1/2024.findings-acl.951
@@ -19465,7 +19465,7 @@
Aravindan Raghuveer (Google)
16093-16109
Self-correction techniques have recently emerged as a promising framework to improve the quality of responses generated by large language models (LLMs). Few-shot prompted LLMs act as critics to produce feedback for an input, which is further fed to a refiner (also an LLM) to produce an output. However, these critique-refine steps require multiple expensive LLM calls. To circumvent this large inference cost, we borrow inspiration from prior work on knowledge distillation and propose the use of critique distillation to train critic models. These are smaller sequence-to-sequence models that are trained on input-critique pairs generated by an LLM. We focus on the problem of text simplification for three Indian languages: Hindi, Bengali and Marathi. This task is a good fit for self-correction style techniques. It also hasn’t been systematically explored for Indian languages before. We train two separate critics that focus on lexical and structure complexity, and show that it is surprisingly more effective than using an LLM directly as a critic in both 0-shot and few-shot settings. We also show the benefits of training multilingual critics, as opposed to monolingual critics. Extensive human evaluations show that on average, raters find 80% of DIMSIM’s output to be simple and easy to read.
- 2024.findings-acl.952
+ 2024.findings-acl.952
mondal-etal-2024-dimsim
10.18653/v1/2024.findings-acl.952
@@ -19477,7 +19477,7 @@
Jens Lehmann (Amazon, Technische Universität Dresden, University of Bonn and Fraunhofer IAIS)
16110-16121
Leveraging external knowledge is crucial for achieving high performance in knowledge-intensive tasks, such as question answering. The retrieve-and-read approach is widely adopted for integrating external knowledge into a language model. However, this approach suffers from increased computational cost and latency due to the long context length, which grows proportionally with the number of retrieved knowledge. Furthermore, existing retrieval-augmented models typically retrieve information from a single type of knowledge source, limiting their scalability to diverse knowledge sources with varying structures. In this work, we introduce an efficient memory-augmented transformer called MATTER, designed to retrieve relevant knowledge from multiple heterogeneous knowledge sources. Specifically, our model retrieves and reads from both unstructured sources (paragraphs) and semi-structured sources (QA pairs) in the form of fixed-length neural memories. We demonstrate that our model outperforms existing efficient retrieval-augmented models on popular QA benchmarks in terms of both accuracy and speed. Furthermore, MATTER achieves competitive results compared to conventional read-and-retrieve models while having 100x throughput during inference.
- 2024.findings-acl.953
+ 2024.findings-acl.953
lee-etal-2024-matter
10.18653/v1/2024.findings-acl.953
@@ -19491,7 +19491,7 @@
Jong Park (Korea Advanced Institute of Science and Technology)
16122-16143
Social bias is shaped by the accumulation of social perceptions towards targets across various demographic identities. To fully understand such social bias in large language models (LLMs), it is essential to consider the composite of social perceptions from diverse perspectives among identities. Previous studies have either evaluated biases in LLMs by indirectly assessing the presence of sentiments towards demographic identities in the generated text or measured the degree of alignment with given stereotypes. These methods have limitations in directly quantifying social biases at the level of distinct perspectives among identities. In this paper, we aim to investigate how social perceptions from various viewpoints contribute to the development of social bias in LLMs. To this end, we propose a novel strategy to intuitively quantify these social perceptions and suggest metrics that can evaluate the social biases within LLMs by aggregating diverse social perceptions. The experimental results provide a quantitative demonstration of the social attitudes in LLMs by examining their social perceptions. The analysis we conducted shows that our proposed metrics capture the multi-dimensional aspects of social bias, enabling a fine-grained and comprehensive investigation of bias in LLMs.
- 2024.findings-acl.954
+ 2024.findings-acl.954
shin-etal-2024-ask
10.18653/v1/2024.findings-acl.954
@@ -19506,7 +19506,7 @@
Xiao-Yu Zhang (Institute of Information Engineering, Chinese Academy of Sciences)
16144-16159
Temporal Knowledge Graph (TKG) forecasting aims to predict future facts based on given histories. Most recent graph-based models excel at capturing structural information within TKGs but lack semantic comprehension abilities. Nowadays, with the surge of LLMs, the LLM-based TKG prediction model has emerged. However, the existing LLM-based model exhibits three shortcomings: (1) It only focuses on the first-order history for prediction while ignoring high-order historical information, resulting in the provided information for LLMs being extremely limited. (2) LLMs struggle with optimal reasoning performance under heavy historical information loads. (3) For TKG prediction, the temporal reasoning capability of LLM alone is limited. To address the first two challenges, we propose Chain-of-History (CoH) reasoning which explores high-order histories step-by-step, achieving effective utilization of high-order historical information for LLMs on TKG prediction. To address the third issue, we design CoH as a plug-and-play module to enhance the performance of graph-based models for TKG prediction. Extensive experiments on three datasets and backbones demonstrate the effectiveness of CoH.
- 2024.findings-acl.955
+ 2024.findings-acl.955
xia-etal-2024-chain
10.18653/v1/2024.findings-acl.955
@@ -19519,7 +19519,7 @@
Tianyi Zhou (University of Maryland, College Park)
16160-16176
Making LLMs speak for different, especially minority groups of people, and generate statements supporting their diverse or even controversial perspectives is critical to creating an inclusive environment. However, existing LLMs lack sufficient controllability to the stance of their generated content, which often contains inconsistent, neutral, or biased statements. In this paper, we improve the controllability of LLMs in generating statements supporting an argument the user defined in the prompt. We find that multi-round debates between two LLMs with opposite stances generate higher-quality and more salient statements for each, which are important training data to improve the controllability of LLMs. Motivated by this, we develop a novel debate & tuning (“DEBATUNE”) pipeline finetuning LLMs to generate the statements obtained via debate. To examine DEBATUNE, we curate the largest dataset of debate topics so far, which covers 710 controversial topics and corresponding arguments for each topic. Evaluations by the GPT-4 judge with a novel controversy controllability metric show that LLMs’ capability of generating diverse perspectives is significantly improved by DEBATUNE. Moreover, such controllability can be generalized to unseen topics, generating high-quality statements supporting controversial arguments.
- 2024.findings-acl.956
+ 2024.findings-acl.956
li-etal-2024-llms-speak
10.18653/v1/2024.findings-acl.956
@@ -19533,7 +19533,7 @@
Kyungsik Han (Hanyang University)
16177-16188
Detecting implicit hate speech that is not directly hateful remains a challenge. Recent research has attempted to detect implicit hate speech by applying contrastive learning to pre-trained language models such as BERT and RoBERTa, but the proposed models still do not have a significant advantage over cross-entropy loss-based learning. We found that contrastive learning based on randomly sampled batch data does not encourage the model to learn hard negative samples. In this work, we propose Label-aware Hard Negative sampling strategies (LAHN) that encourage the model to learn detailed features from hard negative samples, instead of naive negative samples in random batch, using momentum-integrated contrastive learning. LAHN outperforms the existing models for implicit hate speech detection both in- and cross-datasets. The code is available at https://github.com/Hanyang-HCC-Lab/LAHN
- 2024.findings-acl.957
+ 2024.findings-acl.957
kim-etal-2024-label
10.18653/v1/2024.findings-acl.957
@@ -19548,7 +19548,7 @@
TianyiZhouUniversity of Maryland, College Park
16189-16211
Instruction tuning is critical to large language models (LLMs) for achieving better instruction following and task adaptation capabilities, but its success heavily relies on the training data quality. Many recent methods focus on improving the data quality but often overlook the compatibility of the data with the student model being finetuned. This paper introduces Selective Reflection-Tuning, a novel paradigm that synergizes a teacher LLM’s reflection and introspection for improving existing data quality with the data selection capability of the student LLM, to automatically refine existing instruction-tuning data. This teacher-student collaboration produces high-quality and student-compatible instruction-response pairs, resulting in sample-efficient instruction tuning and LLMs of superior performance. Selective Reflection-Tuning is a data augmentation and synthesis approach that generally improves LLM finetuning and self-improvement without collecting brand-new data. We apply our method to Alpaca and WizardLM data and achieve much stronger and top-tier 7B and 13B LLMs.
- 2024.findings-acl.958
+ 2024.findings-acl.958
li-etal-2024-selective
10.18653/v1/2024.findings-acl.958
@@ -19564,7 +19564,7 @@
LilianTangUniversity of Surrey
16212-16226
In conversational AI, personalizing dialogues with persona profiles and contextual understanding is essential. Despite large language models’ (LLMs) improved response coherence, effective persona integration remains a challenge. In this work, we first study two common approaches for personalizing LLMs: textual prompting and direct fine-tuning. We observed that textual prompting often struggles to yield responses that are similar to the ground truths in datasets, while direct fine-tuning tends to produce repetitive or overly generic replies. To alleviate those issues, we propose **S**elective **P**rompt **T**uning (SPT), which softly prompts LLMs for personalized conversations in a selective way. Concretely, SPT initializes a set of soft prompts and uses a trainable dense retriever to adaptively select suitable soft prompts for LLMs according to different input contexts, where the prompt retriever is dynamically updated through feedback from the LLMs. Additionally, we propose context-prompt contrastive learning and prompt fusion learning to encourage the SPT to enhance the diversity of personalized conversations. Experiments on the CONVAI2 dataset demonstrate that SPT significantly enhances response diversity by up to 90%, along with improvements in other critical performance indicators. Those results highlight the efficacy of SPT in fostering engaging and personalized dialogue generation. The SPT model code is [publicly available](https://github.com/hqsiswiliam/SPT) for further exploration.
- 2024.findings-acl.959
+ 2024.findings-acl.959
huang-etal-2024-selective
10.18653/v1/2024.findings-acl.959
@@ -19577,7 +19577,7 @@
SoujanyaPoriaSingapore University of Technology and Design
16227-16239
In the rapidly advancing field of artificial intelligence, the concept of ‘Red-Teaming’ or ‘Jailbreaking’ large language models (LLMs) has emerged as a crucial area of study. This approach is especially significant in terms of assessing and enhancing the safety and robustness of these models. This paper investigates the intricate consequences of such modifications through model editing, uncovering a complex relationship between enhancing model accuracy and preserving its ethical integrity. Our in-depth analysis reveals a striking paradox: while injecting accurate information is crucial for model reliability, it can paradoxically destabilize the model’s foundational framework, resulting in unpredictable and potentially unsafe behaviors. Additionally, we propose a benchmark dataset, NicheHazardQA, to investigate this unsafe behavior both within the same topical domain and across domains. This aspect of our research sheds light on how the edits impact the model’s safety metrics and guardrails. Our findings show that model editing serves as a cost-effective tool for topical red-teaming by methodically applying targeted edits and evaluating the resultant model behavior.
- 2024.findings-acl.960
+ 2024.findings-acl.960
hazra-etal-2024-sowing
10.18653/v1/2024.findings-acl.960
@@ -19596,7 +19596,7 @@
XiaofengTao
16240-16258
Image retrieval from contextual descriptions (IRCD) aims to identify an image within a set of minimally contrastive candidates based on linguistically complex text. Despite the success of VLMs, they still significantly lag behind human performance in IRCD. The main challenges lie in aligning key contextual cues in two modalities, where these subtle cues are concealed in tiny areas of multiple contrastive images and within the complex linguistics of textual descriptions. This motivates us to propose ContextBLIP, a simple yet effective method that relies on a doubly contextual alignment scheme for challenging IRCD. Specifically, 1) our model comprises a multi-scale adapter, a matching loss, and a text-guided masking loss. The adapter learns to capture fine-grained visual cues. The two losses enable iterative supervision for the adapter, gradually aligning the focal patches of a single image with the key textual cues. We term this intra-contextual alignment. 2) ContextBLIP then employs an inter-context encoder to learn dependencies among candidates, facilitating alignment between the text and multiple images. We term this step inter-contextual alignment. Consequently, the nuanced cues concealed in each modality can be effectively aligned. Experiments on two benchmarks show the superiority of our method. We observe that ContextBLIP can yield results comparable to GPT-4V, despite involving about 7,500 times fewer parameters.
- 2024.findings-acl.961
+ 2024.findings-acl.961
lin-etal-2024-contextblip
10.18653/v1/2024.findings-acl.961
@@ -19625,7 +19625,7 @@
WonjaeLeeKorea Advanced Institute of Science and Technology
16274-16289
Understanding the interplay between emotions in language and user behaviors is critical. We study how moral emotions shape the political participation of users based on cross-cultural online petition data. To quantify moral emotions, we employ a context-aware NLP model that is designed to capture the subtle nuances of emotions across cultures. For model training, we construct and share a moral emotion dataset comprising nearly 50,000 petition sentences in Korean and English each, along with emotion labels annotated by a fine-tuned LLM. We examine two distinct types of user participation: general support (i.e., registered signatures of petitions) and active support (i.e., sharing petitions on social media). We discover that moral emotions like other-suffering increase both forms of participation and help petitions go viral, while self-conscious emotions have the opposite effect. The most prominent moral emotion, other-condemning, led to polarizing responses among the audience. In contrast, other-praising was perceived differently across cultures; it led to a rise in active support in Korea but a decline in the UK. Our findings suggest that both moral emotions embedded in language and cultural perceptions are critical to shaping the public’s political discourse.
- 2024.findings-acl.963
+ 2024.findings-acl.963
kim-etal-2024-moral
10.18653/v1/2024.findings-acl.963
@@ -19639,7 +19639,7 @@
YiYangZhejiang University
16290-16314
In this paper, we aim to evaluate multi-agent systems against complex dependencies, including spatial, causal, and temporal constraints. First, we construct a new benchmark, named VillagerBench, within the Minecraft environment. VillagerBench comprises diverse tasks crafted to test various aspects of multi-agent collaboration, from workload distribution to dynamic adaptation and synchronized task execution. Second, we introduce a Directed Acyclic Graph Multi-Agent Framework (VillagerAgent) to resolve complex inter-agent dependencies and enhance collaborative efficiency. This solution incorporates a task decomposer that creates a directed acyclic graph (DAG) for structured task management, an agent controller for task distribution, and a state manager for tracking environmental and agent data. Our empirical evaluation on VillagerBench demonstrates that VillagerAgent outperforms the existing AgentVerse model, reducing hallucinations and improving task decomposition efficacy. The results underscore VillagerAgent’s potential in advancing multi-agent collaboration, offering a scalable and generalizable solution in dynamic environments. Source code is open-source on GitHub.
- 2024.findings-acl.964
+ 2024.findings-acl.964
dong-etal-2024-villageragent
10.18653/v1/2024.findings-acl.964
@@ -19651,7 +19651,7 @@
YanfengWangShanghai Jiao Tong University
16315-16325
In text-conditioned image retrieval (TCIR), the combination of a reference image and modification text forms a query tuple, aiming to locate the most congruent target image within a dataset. The advantages of rich image semantic information and text flexibility are combined in this manner for more accurate retrieval. While traditional techniques often employ attention-driven compositors to craft a unified image-text representation, our paper introduces a compositor-free framework, CF-TCIR, which eschews the standard compositor. Compositor-based methods are designed to learn a joint representation of images and text, but they struggle to directly capture the correlations between attributes across the image and text modalities. Instead, we reformulate the retrieval process as a cross-modal interaction between a synthesized image feature and its corresponding text descriptor. This novel methodology offers advantages in terms of computational efficiency, scalability, and superior performance. To optimize retrieval performance, we advocate a tiered retrieval mechanism that blends both coarse-grained and fine-grained paradigms. Moreover, to enrich the contextual relationship within the query tuple, we integrate a generative cross-modal alignment technique, ensuring synchronization of sequential attributes between image and text data.
- 2024.findings-acl.965
+ 2024.findings-acl.965
yang-etal-2024-cf
10.18653/v1/2024.findings-acl.965
@@ -19663,7 +19663,7 @@
JiaweiChen
16326-16338
Conversational Aspect-based Sentiment Quadruple Analysis (DiaASQ) aims to extract fine-grained sentiment quadruples from dialogues. Previous research has primarily concentrated on enhancing token-level interactions and still lacks sufficient modeling of the discourse structure information in dialogue. Firstly, it does not incorporate interactions among different utterances in the encoding stage, resulting in limited token-level context understanding for subsequent modules. Secondly, it ignores the critical fact that discourse information is naturally organized at the utterance level, and learning it solely at the token level is incomplete. In this work, we strengthen the token-level encoder by utilizing a discourse structure called “thread” and graph convolutional networks to enhance the token interaction among different utterances. Moreover, we propose an utterance-level encoder to learn the structured speaker and reply information, providing a macro understanding of dialogue discourse. Furthermore, we introduce a novel Multi-granularities Integrator to integrate token-level and utterance-level representations, resulting in a comprehensive and cohesive dialogue contextual understanding. Experiments on two datasets demonstrate that our model achieves state-of-the-art performance. Our codes are publicly available at https://github.com/SIGSDSscau/DMIN.
- 2024.findings-acl.966
+ 2024.findings-acl.966
huang-etal-2024-dmin
10.18653/v1/2024.findings-acl.966
@@ -19675,7 +19675,7 @@
Hwee TouNgNational University of Singapore
16339-16347
The natural language processing field has been evolving around language models for the past few years, from the usage of n-gram language models for re-ranking, to transfer learning with encoder-only (BERT-like) language models, and finally to large language models (LLMs) as general solvers. LLMs are dominated by the decoder-only type, and they are popular for their efficacy in numerous tasks. LLMs are regarded as having strong comprehension abilities and strong capabilities to solve new unseen tasks. As such, people may quickly assume that decoder-only LLMs always perform better than the encoder-only ones, especially for understanding word meaning. In this paper, we demonstrate that decoder-only LLMs perform worse on word meaning comprehension than an encoder-only language model that has vastly fewer parameters.
- 2024.findings-acl.967
+ 2024.findings-acl.967
qorib-etal-2024-decoder
10.18653/v1/2024.findings-acl.967
@@ -19687,7 +19687,7 @@
YiYangZhejiang University
16348-16361
To process contexts of unlimited length using Large Language Models (LLMs), recent studies explore hierarchically managing the long text. Only several text fragments are taken from the external memory and passed into the temporary working memory, i.e., the LLM’s context window. However, existing approaches handle the text fragments in isolation, without considering their structural connections, and thus have limited capability on texts with intensive inter-relations, e.g., coherent stories and code repositories. This work attempts to resolve this by exploiting the fragment-level relations in external memory. First, we formulate the fragment-level relations and present several instantiations for different text types. Next, we introduce a relation-aware fragment assessment criterion built upon previous independent fragment assessment. Finally, we present the fragment-connected Hierarchical Memory based LLM. We validate the benefits of involving these relations on long story understanding, repository-level code generation, and long-term chatting.
- 2024.findings-acl.968
+ 2024.findings-acl.968
yue-etal-2024-fragrel
10.18653/v1/2024.findings-acl.968
@@ -19703,7 +19703,7 @@
LijieWenSchool of Software, Tsinghua University
16362-16374
Driven by the demand for cross-sentence and large-scale relation extraction, document-level relation extraction (DocRE) has attracted increasing research interest. Despite the continuous improvement in performance, we find that existing DocRE models which initially perform well may make more mistakes when merely changing the entity names in the document, hindering the generalization to novel entity names. To this end, we systematically investigate the robustness of DocRE models to entity name variations in this work. We first propose a principled pipeline to generate entity-renamed documents by replacing the original entity names with names from Wikidata. By applying the pipeline to the DocRED and Re-DocRED datasets, we construct two novel benchmarks named Env-DocRED and Env-Re-DocRED for robustness evaluation. Experimental results show that three representative DocRE models and two large language models used with in-context learning consistently lack sufficient robustness to entity name variations, particularly on cross-sentence relation instances and documents with more entities. Finally, we propose an entity variation robust training method which not only improves the robustness of DocRE models but also enhances their understanding and reasoning capabilities. We further verify that the basic idea of this method can be effectively transferred to in-context learning for DocRE as well.
- 2024.findings-acl.969
+ 2024.findings-acl.969
meng-etal-2024-robustness
10.18653/v1/2024.findings-acl.969
@@ -19717,7 +19717,7 @@
ZhendongMaoUniversity of Science and Technology of China
16375-16387
On social media platforms, users’ emotions are triggered when they encounter particular content from other users, where such emotions differ from those that emerge spontaneously, owing to their “responsive” nature. Analyzing these responsive emotions from user interactions is a task of significant importance for understanding human cognition, the mechanisms of emotion generation, and behavior on the Internet. Performing the task with artificial intelligence generally requires human-annotated data to help train a well-performing system, while existing data resources do not cover this specific area, with none of them focusing on responsive emotion analysis. In this paper, we propose a Chinese dataset named ResEmo for responsive emotion analysis, including 3813 posts with 68,781 comments collected from Weibo, the largest social media platform in China. ResEmo contains three types of human annotations with respect to responsive emotions, namely, responsive relationship, responsive emotion cause, and responsive emotion category. Moreover, to test this dataset, we build large language model (LLM) baseline methods for responsive relation extraction, responsive emotion cause extraction, and responsive emotion detection, which show the potential of the proposed ResEmo to serve as a benchmark for future studies on responsive emotions.
- 2024.findings-acl.970
+ 2024.findings-acl.970
hu-etal-2024-resemo
10.18653/v1/2024.findings-acl.970
@@ -19729,7 +19729,7 @@
EdwardChoiKorea Advanced Institute of Science and Technology
16388-16407
In this paper, we introduce EHR-SeqSQL, a novel sequential text-to-SQL dataset for Electronic Health Record (EHR) databases. EHR-SeqSQL is designed to address critical yet underexplored aspects in text-to-SQL parsing: interactivity, compositionality, and efficiency. To the best of our knowledge, EHR-SeqSQL is not only the largest but also the first medical text-to-SQL dataset benchmark to include sequential and contextual questions. We provide a data split and a new test set designed to assess compositional generalization ability. Our experiments demonstrate the superiority of a multi-turn approach over a single-turn approach in learning compositionality. Additionally, our dataset integrates specially crafted tokens into SQL queries to improve execution efficiency. With EHR-SeqSQL, we aim to bridge the gap between practical needs and academic research in the text-to-SQL domain.
- 2024.findings-acl.971
+ 2024.findings-acl.971
ryu-etal-2024-ehr
10.18653/v1/2024.findings-acl.971
@@ -19744,7 +19744,7 @@
WeipengChen
16408-16414
Ongoing chatting is an important step for conversational agents to build long-term connections with people. However, people tend to quickly lose interest in chatting if the conversational agent’s words are not engaging enough. In this paper, we present a novel task of increasing users’ willingness to continue talking to the agent. We collect a dataset named ContinuousChat by: (i) collecting personas and revising them, and then expanding the personas into detailed personas through experiences, daily life, future plans, or interesting stories; (ii) expanding the detailed personas into dialogues, and injecting emotions and feelings into them; (iii) rewriting the dialogues in specific styles through few-shot prompting, conditioning on handwritten style-specific examples. We benchmark LLMs on the ContinuousChat dataset using both fine-tuning and in-context learning settings. Experiments over publicly available models demonstrate that although there is substantial room for improvement in generating style-specific dialogues, our ContinuousChat dataset is valuable in guiding conversational agents to generate more attractive dialogues and increase users’ willingness to continue the conversations.
- 2024.findings-acl.972
+ 2024.findings-acl.972
wang-etal-2024-keep
10.18653/v1/2024.findings-acl.972
@@ -19761,7 +19761,7 @@
YuSu
16415-16429
The gap between concerns over program reliability and the expense of repairs underscores the indispensability of Automated Program Repair (APR). APR is instrumental in transforming vulnerable programs into more robust ones, bolstering program reliability while simultaneously diminishing the financial burden of manual repairs. Commercial-scale language models (LMs) have taken APR to unprecedented levels. However, because model capability is constrained by parameter count, a one-step substantial modification may not achieve the desired effect for models with fewer than 100B parameters. Moreover, humans interact with the LLM through explicit prompts, which hinders the LLM from receiving feedback from the compiler and test cases to automatically optimize its repair policies. Explicit prompts from humans not only incur additional manpower costs but also risk misunderstandings between human intent and the LM. Based on the above considerations, we explore how to ensure that small-scale LMs still excel through process supervision and feedback. We start by constructing a dataset named CodeNet4Repair, replete with multiple repair records, which supervises the fine-tuning of a foundational model. Building upon the encouraging outcomes of reinforcement learning, we develop a reward model that serves as a critic, providing feedback for the fine-tuned LM’s actions and progressively optimizing its policy. During inference, we require the LM to generate solutions iteratively until the repair effect no longer improves or the maximum step limit is reached. The experimental results show that this process-based feedback not only outperforms larger outcome-based generation methods, but also nearly matches the performance of closed-source commercial large-scale LMs.
- 2024.findings-acl.973
+ 2024.findings-acl.973
zhao-etal-2024-repair
10.18653/v1/2024.findings-acl.973
@@ -19782,7 +19782,7 @@
WanxiangCheHarbin Institute of Technology
16430-16441
Through reading the documentation in the context, tool-using language models can dynamically extend their capability using external tools. The cost is that we have to input lengthy documentation every time the model needs to use the tool, occupying the input window as well as slowing down the decoding process. Given the progress in general-purpose compression, soft context compression is a suitable approach to alleviate the problem. However, when compressing tool documentation, existing methods suffer from the weaknesses of key information loss (specifically, tool/parameter name errors) and difficulty in adjusting the length of compressed sequences based on documentation lengths. To address these problems, we propose two strategies for compressing tool documentation into concise and precise summary sequences for tool-using language models. 1) The selective compression strategy mitigates key information loss by deliberately retaining key information as raw text tokens. 2) The block compression strategy divides tool documentation into short chunks and then employs a fixed-length compression model to achieve variable-length compression. This strategy facilitates the flexible adjustment of the compression ratio. Results on API-Bank and APIBench show that our approach reaches performance comparable to the upper-bound baseline under compression ratios of up to 16x.
- 2024.findings-acl.974
+ 2024.findings-acl.974
xu-etal-2024-concise
10.18653/v1/2024.findings-acl.974
diff --git a/data/xml/2024.hcinlp.xml b/data/xml/2024.hcinlp.xml
index 3d0f99576b..367167e991 100644
--- a/data/xml/2024.hcinlp.xml
+++ b/data/xml/2024.hcinlp.xml
@@ -4,7 +4,7 @@
Proceedings of the Third Workshop on Bridging Human--Computer Interaction and Natural Language Processing
Su LinBlodgett
- Amanda CercasCurry
+ AmandaCercas Curry
SunipaDev
MichaelMadaio
AniNenkova
diff --git a/data/xml/2024.icnlsp.xml b/data/xml/2024.icnlsp.xml
new file mode 100644
index 0000000000..e8129aa7f1
--- /dev/null
+++ b/data/xml/2024.icnlsp.xml
@@ -0,0 +1,473 @@
+
+
+
+
+ Proceedings of the 7th International Conference on Natural Language and Speech Processing (ICNLSP 2024)
+ MouradAbbas
+ Abed AlhakimFreihat
+ Association for Computational Linguistics
+ Trento
+ October
+ 2024
+ 2024.icnlsp-1
+ icnlsp
+
+
+ 2024.icnlsp-1.0
+ icnlsp-2024-1
+
+
+ Leveraging Annotator Disagreement for Text Classification
+ JinXu
+ MariëtTheune
+ DanielBraun
+ 1–10
+ 2024.icnlsp-1.1
+ xu-etal-2024-leveraging
+
+
+ Native language Identification for Arabic Language Learners using Pre-trained Language Models
+ MouradAbbas
+ Mohamed AmineCheragui
+ MohammedMediani
+ 11–18
+ 2024.icnlsp-1.2
+ abbas-etal-2024-native
+
+
+ Generative Adversarial Network based Neural Vocoder for Myanmar End-to-End Speech Synthesis
+ Aye MyaHlaing
+ Win PaPa
+ 19–24
+ 2024.icnlsp-1.3
+ hlaing-pa-2024-generative
+
+
+ Detecting ChatGPT-Generated Text with GZIP-KNN: A No-Training, Low-Resource Approach
+ MatthiasBerchtold
+ SandraMitrovic
+ DavideAndreoletti
+ DanielePuccinelli
+ OmranAyoub
+ 25–33
+ 2024.icnlsp-1.4
+ berchtold-etal-2024-detecting
+
+
+ Data Bias According to Bipol: Men are Naturally Right and It is the Role of Women to Follow Their Lead
+ IrenePagliai
+ Goyavan Boven
+ TosinAdewumi
+ LamaAlkhaled
+ NamrataGurung
+ IsabellaSödergren
+ Elisa H BarneySmith
+ 34–46
+ 2024.icnlsp-1.5
+ pagliai-etal-2024-data
+
+
+ Improved Spoken Emotion Recognition With Combined Segment-Based Processing And Triplet Loss
+ DejanPorjazovski
+ TamasGrosz
+ MikkoKurimo
+ 47–54
+ 2024.icnlsp-1.6
+ porjazovski-etal-2024-improved
+
+
+ EEG Signal Analysis for Multimodal Simple Concepts Decoding
+ Sergio GuillénJiménez
+ Lorenzo J.Tardón
+ Ana MBarbancho
+ IsabelBarbancho
+ 55–63
+ 2024.icnlsp-1.7
+ jimenez-etal-2024-eeg
+
+
+ Modeling Score Estimation for Japanese Essays with Generative Pre-trained Transformers
+ BoagoOkgetheng
+ KoichiTakeuchi
+ 64–73
+ 2024.icnlsp-1.8
+ okgetheng-takeuchi-2024-modeling
+
+
+ CliqueCorex: A Self-supervised Clique-based Anchored Topic Model
+ SamiDiaf
+ 74–82
+ 2024.icnlsp-1.9
+ diaf-2024-cliquecorex
+
+
+ Double Decoder: Improving latency for Streaming End-to-end ASR Models
+ RiqiangWang
+ ShreekanthaNadig
+ DaniilKulko
+ SimonVandieken
+ Chia-tienChang
+ Seyyed SaeedSarfjoo
+ JonasRobertson
+ 83–91
+ 2024.icnlsp-1.10
+ wang-etal-2024-double
+
+
+ Personalised Abusive Language Detection Using LLMs and Retrieval-Augmented Generation
+ TsungchengYao
+ ErnestFoo
+ SebastianBinnewies
+ 92–98
+ 2024.icnlsp-1.11
+ yao-etal-2024-personalised
+
+
+ Large-scale Summarization of Chat Transcripts in the Absence of Annotated Summaries
+ Pratik K.Biswas
+ 99–108
+ 2024.icnlsp-1.12
+ biswas-2024-large
+
+
+ Real-Time Speech-Driven Avatar Animation by Predicting Facial landmarks and Deformation Blendshapes
+ Juan CamiloVasquez-Correa
+ SantiagoMoreno-Acevedo
+ AnderGonzalez-Docasal
+ AritzLasarguren
+ JoneLòpez
+ EgoitzRodriguez
+ AitorÁlvarez
+ 109–118
+ 2024.icnlsp-1.13
+ vasquez-correa-etal-2024-real
+
+
+ Speech Emotion Recognition for Call Centers using Self-supervised Models: A Complete Pipeline for Industrial Applications
+ Juan M.Martín-Doñas
+ Asier LópezZorrilla
+ MikeldeVelasco
+ Juan CamiloVasquez-Correa
+ AitorÁlvarez
+ Maria InésTorres
+ PazDelgado
+ AneLazpiur
+ BlancaRomero
+ IratiAlkorta
+ 119–128
+ 2024.icnlsp-1.14
+ martin-donas-etal-2024-speech
+
+
+ Probing Whisper Predictions for French, English and Persian Transcriptions
+ NicolasBallier
+ LéaBurin
+ BehnooshNamdarzadeh
+ Sara BNg
+ RichardWright
+ Jean-BaptisteYunès
+ 129–138
+ 2024.icnlsp-1.15
+ ballier-etal-2024-probing
+
+
+ Sawaal: A Framework for Automatic Question Generation in Urdu
+ MariaRahim
+ Shakeel AhmedKhoja
+ 139–148
+ 2024.icnlsp-1.16
+ rahim-khoja-2024-sawaal
+
+
+ Thonburian Whisper: Robust Fine-tuned and Distilled Whisper for Thai
+ Zaw HtetAung
+ ThanachotThavornmongkol
+ AtirutBoribalburephan
+ VittavasTangsriworakan
+ KnotPipatsrisawat
+ TitipatAchakulvisut
+ 149–156
+ 2024.icnlsp-1.17
+ aung-etal-2024-thonburian
+
+
+ Dual-Task Learning for AI-Generated Medical Text Detection and Named Entity Recognition
+ SajaAl-Dabet
+ BanAlomar
+ SherzodTuraev
+ Abdelkader NasreddineBelkacem
+ 157–167
+ 2024.icnlsp-1.18
+ al-dabet-etal-2024-dual
+
+
+ Comparative Analysis of Modality Fusion Approaches for Audio-Visual Person Identification and Verification
+ ArefFarhadipour
+ MasoumehChapariniya
+ TeodoraVukovic
+ VolkerDellwo
+ 168–177
+ 2024.icnlsp-1.19
+ farhadipour-etal-2024-comparative
+
+
+ PoliTun: Tunisian Political Dataset for Detecting Public Opinions and Categories Orientation
+ ChaymaFourati
+ RouaHammami
+ ChirazLatiri
+ HatemHaddad
+ 178–185
+ 2024.icnlsp-1.20
+ fourati-etal-2024-politun
+
+
+ Efficient Few-shot Learning for Multi-label Classification of Scientific Documents with Many Classes
+ TimSchopf
+ AlexanderBlatzheim
+ NektariosMachner
+ FlorianMatthes
+ 186–198
+ 2024.icnlsp-1.21
+ schopf-etal-2024-efficient
+
+
+ A Comparison of Different Tokenization Methods for the Georgian Language
+ BesoMikaberidze
+ TemoSaghinadze
+ GuramMikaberidze
+ RaphaelKalandadze
+ KonstantinePkhakadze
+ Josefvan Genabith
+ SimonOstermann
+ Lonnekevan der Plas
+ PhilippMüller
+ 199–208
+ 2024.icnlsp-1.22
+ mikaberidze-etal-2024-comparison
+
+
+ GemST: Continual Learning for End-to-End Speech-to-Text Translation
+ PranavKarande
+ BalaramSarkar
+ Chandresh KMaurya
+ 209–213
+ 2024.icnlsp-1.23
+ karande-etal-2024-gemst
+
+
+ CASCA: Leveraging Role-Based Lexical Cues for Robust Multimodal Speaker Diarization via Large Language Models
+ William KNehrboss
+ 214–224
+ 2024.icnlsp-1.24
+ nehrboss-2024-casca
+
+
+ Deep Information Maximisation to Mitigate Information Loss in Text Independent Speaker Verification
+ Nipun ThejanFonseka
+ NirmalSankalana
+ 225–232
+ 2024.icnlsp-1.25
+ fonseka-sankalana-2024-deep
+
+
+ Context-Aware Question Answering in Urdu
+ SamreenKazi
+ Shakeel AhmedKhoja
+ 233–242
+ 2024.icnlsp-1.26
+ kazi-khoja-2024-context
+
+
+ Human and Machine: Language Processing in Translation Tasks
+ HeningWang
+ LeixinZhang
+ OndrejBojar
+ 243–250
+ 2024.icnlsp-1.27
+ wang-etal-2024-human
+
+
+ Asking the Right Questions: Exploiting Hidden Interactions in a Generative Framework for Multilingual, Multitask Classification
+ Sebastian-AntonioToma
+ CameliaLemnaru
+ Vlad AndreiNegru
+ RodicaPotolea
+ 251–266
+ 2024.icnlsp-1.28
+ toma-etal-2024-asking
+
+
+ Resolving Gender Biases in LLMs at Inference Time with Novel Dijkstra’s-based K-Explorers Neural Network Traversal (KeNNT)
+ HanavModasiya
+ 267–278
+ 2024.icnlsp-1.29
+ modasiya-2024-resolving
+
+
+ Semantically Enriched Text Generation for QA through Dense Paraphrasing
+ TimothyObiso
+ BingyangYe
+ KyeongminRim
+ JamesPustejovsky
+ 279–286
+ 2024.icnlsp-1.30
+ obiso-etal-2024-semantically
+
+
+ Introducing wwm-german-18k - Can LLMs Crack the Million? (Or Win at Least 500 Euros?)
+ MatthiasAßenmacher
+ LuisKarrlein
+ PhilippSchiele
+ ChristianHeumann
+ 287–296
+ 2024.icnlsp-1.31
+ assenmacher-etal-2024-introducing
+
+
+ Native Language Identification Improves Authorship Attribution
+ Ahmet YavuzUluslu
+ GeroldSchneider
+ CanYildizli
+ 297–303
+ 2024.icnlsp-1.32
+ uluslu-etal-2024-native
+
+
+ Design and Comparison of Arabic Negotiation Bots Using LLMs versus Seq2Seq Models with Reinforcement Learning
+ AhmadHajj
+ Yasmine A AbuAdla
+ SamahAlbast
+ HazemHajj
+ ShadyElbassuoni
+ Wassim ElHajj
+ KhaledShaban
+ 304–318
+ 2024.icnlsp-1.33
+ hajj-etal-2024-design
+
+
+ Enhancing LLM-based Arabic Negotiation by Fine Tuning on Dialogue Shortcomings
+ Yasmine A AbuAdla
+ HazemHajj
+ ShadyElbassuoni
+ KhaledShaban
+ Wassim ElHajj
+ 319–342
+ 2024.icnlsp-1.34
+ adla-etal-2024-enhancing
+
+
+ The Qiyas Benchmark: Measuring ChatGPT Mathematical and Language Understanding in Arabic
+ ShahadAl-Khalifa
+ HendAl-Khalifa
+ 343–351
+ 2024.icnlsp-1.35
+ al-khalifa-al-khalifa-2024-qiyas
+
+
+ Analyzing Politeness in Arabic Tweets: A Preliminary Study
+ HendAl-Khalifa
+ NadiaGhezaiel
+ MariaBounnit
+ 352–359
+ 2024.icnlsp-1.36
+ al-khalifa-etal-2024-analyzing
+
+
+ FeruzaSpeech: A 60 Hour Uzbek Read Speech Corpus with Punctuation, Casing, and Context
+ AnnaPovey
+ KatherinePovey
+ 360–364
+ 2024.icnlsp-1.37
+ povey-povey-2024-feruzaspeech
+
+
+ Bulgarian Grammar Error Correction with Data Augmentation and Machine Translation Techniques
+ BozhidarKlouchek
+ RizaBatista-Navarro
+ 365–376
+ 2024.icnlsp-1.38
+ klouchek-batista-navarro-2024-bulgarian
+
+
+ On Barriers to Archival Audio Processing
+ Peter RSullivan
+ MuhammadAbdul-Mageed
+ 377–383
+ 2024.icnlsp-1.39
+ sullivan-abdul-mageed-2024-barriers
+
+
+ Conversational Exploratory Search of Scholarly Publications Using Knowledge Graphs
+ PhillipSchneider
+ FlorianMatthes
+ 384–396
+ 2024.icnlsp-1.40
+ schneider-matthes-2024-conversational
+
+
+ A Hybrid Retrieval Approach for Advancing Retrieval-Augmented Generation Systems
+ Nguyen NamDoan
+ AkiHärmä
+ RemziCelebi
+ ValeriaGottardo
+ 397–409
+ 2024.icnlsp-1.41
+ doan-etal-2024-hybrid
+
+
+ Investigating Gender Bias in Large Language Models Through Text Generation
+ ShwetaSoundararajan
+ Sarah JaneDelany
+ 410–424
+ 2024.icnlsp-1.42
+ soundararajan-delany-2024-investigating
+
+
+ Improving Long-term F0 representation using post-processing techniques
+ Crisron RudolfLucas
+ DiptasreeDebnath
+ AndrewHines
+ 425–433
+ 2024.icnlsp-1.43
+ lucas-etal-2024-improving
+
+
+ Text-to-Speech in Azerbaijani Language via Transfer Learning in a Low Resource Environment
+ DzhavidanZeinalov
+ BugraSen
+ FirangizAslanova
+ 434–438
+ 2024.icnlsp-1.44
+ zeinalov-etal-2024-text
+
+
+ SG-RAG: Multi-Hop Question Answering With Large Language Models Through Knowledge Graphs
+ Ahmmad O. M.Saleh
+ GokhanTur
+ YucelSaygin
+ 439–448
+ 2024.icnlsp-1.45
+ saleh-etal-2024-sg
+
+
+ Linking Quran and Hadith Topics in an Ontology using Word Embeddings and Cellfie Plugin
+ Ibtisam KhalafAlshammari
+ EricAtwell
+ Mohammad AmmarAlsalka
+ 449–455
+ 2024.icnlsp-1.46
+ alshammari-etal-2024-linking
+
+
+ Medical Information Extraction with Large Language Models
+ RaffaelloFornasiere
+ NicolòBrunello
+ VincenzoScotti
+ MarkCarman
+ 456–466
+ 2024.icnlsp-1.47
+ fornasiere-etal-2024-medical
+
+
+
diff --git a/data/xml/2024.lrec.xml b/data/xml/2024.lrec.xml
index d014c43cbd..a80a1d4ada 100644
--- a/data/xml/2024.lrec.xml
+++ b/data/xml/2024.lrec.xml
@@ -14849,7 +14849,7 @@
sedykh-etal-2024-searching
- Sebastian, Basti, Wastl?! Recognizing Named Entities in Bavarian Dialectal Data
+ Sebastian, Basti, Wastl?! Recognizing Named Entities in Bavarian Dialectal Data
SiyaoPeng
ZihangSun
HuangyanShan
diff --git a/data/xml/2024.nlp4call.xml b/data/xml/2024.nlp4call.xml
index 365a480ff4..326bdb140e 100644
--- a/data/xml/2024.nlp4call.xml
+++ b/data/xml/2024.nlp4call.xml
@@ -112,10 +112,10 @@
Jingle BERT, Jingle BERT, Frozen All the Way: Freezing Layers to Identify CEFR Levels of Second Language Learners Using BERT
- Ricardo MuñozSánchez
+ RicardoMuñoz Sánchez
DavidAlfter
SimonDobnik
- MariaSzawerna
+ Maria IrenaSzawerna
ElenaVolodina
137–152
2024.nlp4call-1.11
diff --git a/data/xml/2024.safety4convai.xml b/data/xml/2024.safety4convai.xml
index 71769df7a8..8ce230e7c7 100644
--- a/data/xml/2024.safety4convai.xml
+++ b/data/xml/2024.safety4convai.xml
@@ -5,7 +5,7 @@
Proceedings of Safety4ConvAI: The Third Workshop on Safety for Conversational AI @ LREC-COLING 2024
TanviDinkar
GiuseppeAttanasio
- Amanda CercasCurry
+ AmandaCercas Curry
IoannisKonstas
DirkHovy
VerenaRieser
diff --git a/data/xml/C18.xml b/data/xml/C18.xml
index 3779cfbce1..2b401a5e48 100644
--- a/data/xml/C18.xml
+++ b/data/xml/C18.xml
@@ -990,7 +990,7 @@
Generating natural language requires conveying content in an appropriate style. We explore two related tasks on generating text of varying formality: monolingual formality transfer and formality-sensitive machine translation. We propose to solve these tasks jointly using multi-task learning, and show that our models achieve state-of-the-art performance for formality transfer and are able to perform formality-sensitive translation without being explicitly trained on style-annotated translation examples.
C18-1086
niu-etal-2018-multi
- xingniu/multitask-ft-fsmt
+ xingniu/multitask-ft-fsmt
GYAFC
diff --git a/data/xml/D18.xml b/data/xml/D18.xml
index ba07224caf..94840f6349 100644
--- a/data/xml/D18.xml
+++ b/data/xml/D18.xml
@@ -743,7 +743,7 @@
10.18653/v1/D18-1052
seo-etal-2018-phrase
- uwnlp/piqa
+ uwnlp/piqa
Ranking Paragraphs for Improving Answer Recall in Open-Domain Question Answering
diff --git a/data/xml/P19.xml b/data/xml/P19.xml
index a2652ed418..ca1bb0efd2 100644
--- a/data/xml/P19.xml
+++ b/data/xml/P19.xml
@@ -9098,7 +9098,7 @@
In the main paper (attached), in Table 3, the row for "Cardinality (soft)" has incorrect values under the NLVR and NLVR2 columns. The respective values should be 16 and 23.6. The value of 16 was reported in Suhr et al. 2017 (P17-2034; the error occurred when I accidentally overwrote the NLVR cell value).
suhr-etal-2019-corpus
- lil-lab/nlvr
+ lil-lab/nlvr
CLEVR
CLEVR-Humans
MS COCO