references.bib

@inproceedings{adamic_etal_2005,
  author    = {Adamic, Lada A. and Glance, Natalie},
  title     = {The Political Blogosphere and the 2004 {U.S.} Election: Divided They Blog},
  booktitle = {Proceedings of the 3rd International Workshop on Link Discovery},
  series    = {LinkKDD '05},
  pages     = {36--43},
  numpages  = {8},
  year      = {2005},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Chicago, Illinois},
  isbn      = {1595932151},
  doi       = {10.1145/1134271.1134277},
  url       = {https://doi.org/10.1145/1134271.1134277},
  keywords  = {social networks, political blogs, link analysis},
  abstract  = {In this paper, we study the linking patterns and discussion topics of political bloggers. Our aim is to measure the degree of interaction between liberal and conservative blogs, and to uncover any differences in the structure of the two communities. Specifically, we analyze the posts of 40 "A-list" blogs over the period of two months preceding the U.S. Presidential Election of 2004, to study how often they referred to one another and to quantify the overlap in the topics they discussed, both within the liberal and conservative communities, and also across communities. We also study a single day snapshot of over 1,000 political blogs. This snapshot captures blogrolls (the list of links to other blogs frequently found in sidebars), and presents a more static picture of a broader blogosphere. Most significantly, we find differences in the behavior of liberal and conservative blogs, with conservative blogs linking to each other more frequently and in a denser pattern.}
}

% Blog post: the site name is stored in howpublished because misc-type styles do not print a journal field.
@misc{alammar_2019,
  author       = {Alammar, Jay},
  title        = {The Illustrated Word2vec},
  howpublished = {Jay Alammar -- Visualizing machine learning one concept at a time},
  year         = {2019},
  url          = {http://jalammar.github.io/illustrated-word2vec/}
}

@article{allport_1942,
  author   = {Allport, G. W.},
  title    = {The use of personal documents in psychological science},
  journal  = {Social Science Research Council Bulletin},
  volume   = {49},
  pages    = {xix + 210},
  year     = {1942},
  abstract = {The psychological use of personal documents is traced from its uncritical beginnings at the turn of the century to its emergence in the last 20 years as a method in its own right. Its uses in molecular and molar research; in teaching; in suggesting new items for questionnaires; in inductive studies, often with the construction of typologies; in social psychology; etc. are examined. The place of the personal document in an idiographic rather than a nomothetic scheme is stressed: "Lawful happenings may be one-time events. Frequency is not a necessary test of validity." The forms of personal documents are presented with examples and discussion. Essentially, documents are reducible to autobiographies, questionnaire responses, verbatim recordings, diaries, letters, or expressive and projective productions. The evaluation of personal documents (65 pages) examines the case for and against their use. "It can be shown that{\ldots} critical tests of science are met by personal documents properly handled," and personal documents may be superior to actuarial methods by themselves in achieving the scientific goals of understanding, prediction, and control. Bibliography of 198 references; indices. (PsycINFO Database Record (c) 2016 APA, all rights reserved)}
}

@article{anderson_etal_1991,
  author   = {Anne H. Anderson and Miles Bader and Ellen Gurman Bard and Elizabeth Boyle and Gwyneth Doherty and Simon Garrod and Stephen Isard and Jacqueline Kowtko and Jan McAllister and Jim Miller and Catherine Sotillo and Henry S. Thompson and Regina Weinert},
  title    = {The {HCRC} {Map Task} Corpus},
  journal  = {Language and Speech},
  volume   = {34},
  number   = {4},
  pages    = {351--366},
  year     = {1991},
  doi      = {10.1177/002383099103400404},
  url      = {https://doi.org/10.1177/002383099103400404},
  abstract = { This paper describes a corpus of unscripted, task-oriented dialogues which has been designed, digitally recorded, and transcribed to support the study of spontaneous speech on many levels. The corpus uses the Map Task (Brown, Anderson, Yule, and Shillcock, 1983) in which speakers must collaborate verbally to reproduce on one participant's map a route printed on the other's. In all, the corpus includes four conversations from each of 64 young adults and manipulates the following variables: familiarity of speakers, eye contact between speakers, matching between landmarks on the participants' maps, opportunities for contrastive stress, and phonological characteristics of landmark names. The motivations for the design are set out and basic corpus statistics are presented. }
}

@article{araque_etal_2018,
  author       = {Oscar Araque and
                  Lorenzo Gatti and
                  Jacopo Staiano and
                  Marco Guerini},
  title        = {{DepecheMood++}: a Bilingual Emotion Lexicon Built Through Simple Yet
                  Powerful Techniques},
  journal      = {CoRR},
  volume       = {abs/1810.03660},
  year         = {2018},
  url          = {http://arxiv.org/abs/1810.03660},
  eprinttype   = {arXiv},
  eprint       = {1810.03660},
  timestamp    = {Tue, 30 Oct 2018 10:49:09 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-1810-03660.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

@article{ashokkumar_pennebaker_2022,
    author   = {Ashokkumar, Ashwini and Pennebaker, James W},
    title    = {Tracking group identity through natural language within groups},
    journal  = {PNAS Nexus},
    volume   = {1},
    number   = {2},
    pages    = {pgac022},
    year     = {2022},
    month    = jun,
    issn     = {2752-6542},
    doi      = {10.1093/pnasnexus/pgac022},
    url      = {https://doi.org/10.1093/pnasnexus/pgac022},
    abstract = {To what degree can we determine people's connections with groups through the language they use? In recent years, large archives of behavioral data from social media communities have become available to social scientists, opening the possibility of tracking naturally occurring group identity processes. A feature of most digital groups is that they rely exclusively on the written word. Across 3 studies, we developed and validated a language-based metric of group identity strength and demonstrated its potential in tracking identity processes in online communities. In Studies 1a–1c, 873 people wrote about their connections to various groups (country, college, or religion). A total of 2 language markers of group identity strength were found: high affiliation (more words like we, togetherness) and low cognitive processing or questioning (fewer words like think, unsure). Using these markers, a language-based unquestioning affiliation index was developed and applied to in-class stream-of-consciousness essays of 2,161 college students (Study 2). Greater levels of unquestioning affiliation expressed in language predicted not only self-reported university identity but also students’ likelihood of remaining enrolled in college a year later. In Study 3, the index was applied to naturalistic Reddit conversations of 270,784 people in 2 online communities of supporters of the 2016 presidential candidates—Hillary Clinton and Donald Trump. The index predicted how long people would remain in the group (3a) and revealed temporal shifts mirroring members’ joining and leaving of groups (3b). Together, the studies highlight the promise of a language-based approach for tracking and studying group identity processes in online groups.}
}

@misc{atari_etal_2023,
  author    = {Atari, Mohammad and Omrani, Ali and Dehghani, Morteza},
  title     = {Contextualized Construct Representation: Leveraging Psychometric Scales to Advance Theory-Driven Text Analysis},
  publisher = {PsyArXiv},
  year      = {2023},
  month     = feb,
  doi       = {10.31234/osf.io/m93pd},
  url       = {https://osf.io/preprints/psyarxiv/m93pd}
}

@book{baayen_2001,
  author    = {Baayen, R. H.},
  title     = {Word Frequency Distributions},
  series    = {Text, Speech and Language Technology},
  publisher = {Springer Netherlands},
  year      = {2001},
  isbn      = {9780792370178},
  lccn      = {2001029823},
  url       = {https://link.springer.com/book/10.1007/978-94-010-0844-0}
}

@article{bandhakavi_etal_2021,
  author    = {Bandhakavi, Anil and Wiratunga, Nirmalie and Massie, Stewart and P., Deepak},
  title     = {Emotion-aware polarity lexicons for {Twitter} sentiment analysis},
  journal   = {Expert Systems},
  volume    = {38},
  number    = {7},
  year      = {2021},
  publisher = {Blackwell Publishing Ltd},
  address   = {Oxford},
  issn      = {0266-4720},
  language  = {eng},
  copyright = {2018 John Wiley \& Sons, Ltd.},
  keywords  = {Sentiment analysis ; Dirichlet problem ; Data mining ; Psychology ; Emotions ; Vocabulary ; Social media ; Grammar, Generative ; Generative grammar ; Computer science ; Natural Language Processing ; Information storage and retrieval systems ; Artificial intelligence}
}

@inproceedings{badaro_etal_2018,
  author    = {Badaro, Gilbert and Jundi, Hussein and Hajj, Hazem and El-Hajj, Wassim},
  title     = {{E}mo{W}ord{N}et: Automatic Expansion of Emotion Lexicon Using {E}nglish {W}ord{N}et},
  editor    = {Nissim, Malvina and Berant, Jonathan and Lenci, Alessandro},
  booktitle = {Proceedings of the Seventh Joint Conference on Lexical and Computational Semantics},
  pages     = {86--93},
  year      = {2018},
  month     = jun,
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana},
  doi       = {10.18653/v1/S18-2009},
  url       = {https://aclanthology.org/S18-2009}
}

% Zenodo data deposit rather than a proceedings volume; typed as misc so that the author list is used.
@misc{baumgartner_etal_2020,
  author       = {Baumgartner, Jason and Zannettou, Savvas and Keegan, Brian and Squire, Megan and Blackburn, Jeremy},
  title        = {The {Pushshift} {Reddit} Dataset},
  howpublished = {Zenodo},
  publisher    = {Zenodo},
  year         = 2020,
  month        = jan,
  doi          = {10.5281/zenodo.3608135},
  url          = {https://doi.org/10.5281/zenodo.3608135}
}

@article{bellezza_etal_1986,
  author    = {Bellezza, Francis S and Greenwald, Anthony G and Banaji, Mahzarin R},
  title     = {Words high and low in pleasantness as rated by male and female college students},
  journal   = {Behavior Research Methods, Instruments, \& Computers},
  publisher = {Springer},
  year      = {1986},
  volume    = {18},
  pages     = {299--303}
}

@article{biester_etal_2022,
    author    = {Biester, Laura and Pennebaker, James and Mihalcea, Rada},
    title     = {Emotional and cognitive changes surrounding online depression identity claims},
    journal   = {PLOS ONE},
    publisher = {Public Library of Science},
    volume    = {17},
    number    = {12},
    pages     = {1--20},
    year      = {2022},
    month     = dec,
    doi       = {10.1371/journal.pone.0278179},
    url       = {https://doi.org/10.1371/journal.pone.0278179},
    abstract  = {As social media has proliferated, a key aspect to making meaningful connections with people online has been revealing important parts of one’s identity. In this work, we study changes that occur in people’s language use after they share a specific piece of their identity: a depression diagnosis. To do so, we collect data from over five thousand users who have made such a statement, which we refer to as an identity claim. Prior to making a depression identity claim, the Reddit user’s language displays evidence of increasingly higher rates of anxiety, sadness, and cognitive processing language compared to matched controls. After the identity claim, these language markers decrease and more closely match the controls. Similarly, first person singular pronoun usage decreases following the identity claim, which was previously previously found to be indicative of self-focus and associated with depression. By further considering how and to whom people express their identity, we find that the observed longitudinal changes are larger for those who do so in ways that are more correlated with seeking help (sharing in a post instead of a comment; sharing in a mental health support forum). This work suggests that there may be benefits to sharing one’s depression diagnosis, especially in a semi-anonymous forum where others are likely to be empathetic.}
}

@article{blei_etal_2003,
  author     = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},
  title      = {Latent {Dirichlet} Allocation},
  journal    = {Journal of Machine Learning Research},
  volume     = {3},
  pages      = {993--1022},
  numpages   = {30},
  year       = {2003},
  month      = mar,
  issue_date = {3/1/2003},
  publisher  = {JMLR.org},
  issn       = {1532-4435},
  doi        = {10.5555/944919.944937},
  abstract   = {We describe latent Dirichlet allocation (LDA), a generative probabilistic model for collections of discrete data such as text corpora. LDA is a three-level hierarchical Bayesian model, in which each item of a collection is modeled as a finite mixture over an underlying set of topics. Each topic is, in turn, modeled as an infinite mixture over an underlying set of topic probabilities. In the context of text modeling, the topic probabilities provide an explicit representation of a document. We present efficient approximate inference techniques based on variational methods and an EM algorithm for empirical Bayes parameter estimation. We report results in document modeling, text classification, and collaborative filtering, comparing to a mixture of unigrams model and the probabilistic LSI model.}
}

@misc{blei_mcauliffe_2010,
  author        = {Blei, David M. and McAuliffe, Jon D.},
  title         = {Supervised Topic Models},
  year          = {2010},
  archivePrefix = {arXiv},
  eprint        = {1003.0783},
  primaryClass  = {stat.ML}
}

% arXiv preprint record: only eprint data is recorded here, so it is typed as misc (an article entry requires a journal).
@misc{bojanowski_etal_2017,
  author        = {Piotr Bojanowski and Edouard Grave and Armand Joulin and Tomas Mikolov},
  title         = {Enriching Word Vectors with Subword Information},
  year          = {2017},
  eprint        = {1607.04606},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

% Exported with a non-standard entry type; typed as misc, which is the fallback standard styles define.
@misc{boyd_etal_2022,
  author = {Boyd, Ryan and Ashokkumar, Ashwini and Seraj, Sarah and Pennebaker, James},
  title  = {The Development and Psychometric Properties of {LIWC-22}},
  year   = {2022},
  month  = feb,
  doi    = {10.13140/RG.2.2.23890.43205}
}

@article{buechel_etal_2018,
  author     = {Buechel, Sven and Buffone, Anneke and Slaff, Barry and Ungar, Lyle H. and Sedoc, Jo{\~{a}}o},
  title      = {Modeling Empathy and Distress in Reaction to News Stories},
  journal    = {CoRR},
  volume     = {abs/1808.10399},
  year       = {2018},
  eprinttype = {arXiv},
  eprint     = {1808.10399},
  url        = {http://arxiv.org/abs/1808.10399},
  timestamp  = {Mon, 03 Sep 2018 13:36:40 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1808-10399.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{buechel_etal_2020,
  author    = {Buechel, Sven and R{\"u}cker, Susanna and Hahn, Udo},
  title     = {Learning and Evaluating Emotion Lexicons for 91 Languages},
  editor    = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  pages     = {1202--1217},
  year      = {2020},
  month     = jul,
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  doi       = {10.18653/v1/2020.acl-main.112},
  url       = {https://aclanthology.org/2020.acl-main.112},
  abstract  = {Emotion lexicons describe the affective meaning of words and thus constitute a centerpiece for advanced sentiment and emotion analysis. Yet, manually curated lexicons are only available for a handful of languages, leaving most languages of the world without such a precious resource for downstream applications. Even worse, their coverage is often limited both in terms of the lexical units they contain and the emotional variables they feature. In order to break this bottleneck, we here introduce a methodology for creating almost arbitrarily large emotion lexicons for any target language. Our approach requires nothing but a source language emotion lexicon, a bilingual word translation model, and a target language embedding model. Fulfilling these requirements for 91 languages, we are able to generate representationally rich high-coverage lexicons comprising eight emotional variables with more than 100k lexical entries each. We evaluated the automatically generated lexicons against human judgment from 26 datasets, spanning 12 typologically diverse languages, and found that our approach produces results in line with state-of-the-art monolingual approaches to lexicon creation and even surpasses human reliability for some languages and variables. Code and data are available at \url{https://github.com/JULIELab/MEmoLon} archived under DOI 10.5281/zenodo.3779901.}
}

@inproceedings{burger_etal_2011,
  author    = {Burger, John D. and Henderson, John and Kim, George and Zarrella, Guido},
  title     = {Discriminating Gender on {Twitter}},
  booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP '11},
  pages     = {1301--1309},
  numpages  = {9},
  year      = {2011},
  publisher = {Association for Computational Linguistics},
  address   = {USA},
  location  = {Edinburgh, United Kingdom},
  isbn      = {9781937284114},
  abstract  = {Accurate prediction of demographic attributes from social media and other informal online content is valuable for marketing, personalization, and legal investigation. This paper describes the construction of a large, multilingual dataset labeled with gender, and investigates statistical models for determining the gender of uncharacterized Twitter users. We explore several different classifier types on this dataset. We show the degree to which classifier accuracy varies based on tweet volumes as well as when various kinds of profile metadata are included in the models. We also perform a large-scale human assessment using Amazon Mechanical Turk. Our methods significantly out-perform both baseline models and almost all humans on the same task.}
}

@article{brysbaert_etal_2014,
  author    = {Brysbaert, Marc and Warriner, Amy Beth and Kuperman, Victor},
  title     = {Concreteness ratings for 40 thousand generally known {English} word lemmas},
  journal   = {Behavior Research Methods},
  volume    = {46},
  pages     = {904--911},
  year      = {2014},
  publisher = {Springer}
}

@inproceedings{cai_etal_2021,
  author    = {Cai, Xingyu and Huang, Jiaji and Bian, Yuchen and Church, Kenneth},
  title     = {Isotropy in the Contextual Embedding Space: Clusters and Manifolds},
  booktitle = {International Conference on Learning Representations},
  year      = {2021},
  url       = {https://openreview.net/forum?id=xYGNO86OWDH}
}

@article{chatterjee_etal_2023,
  author  = {Chatterjee, Promothesh and Mishra, Himanshu and Mishra, Arul},
  title   = {Does the first letter of one's name affect life decisions? A natural language processing examination of nominative determinism},
  journal = {Journal of Personality and Social Psychology},
  volume  = {125},
  year    = {2023},
  month   = may,
  doi     = {10.1037/pspa0000347}
}

@article{chersoni_etal_2021,
  author    = {Chersoni, Emmanuele and Santus, Enrico and Huang, Chu-Ren and Lenci, Alessandro},
  title     = {Decoding Word Embeddings with Brain-Based Semantic Features},
  journal   = {Computational Linguistics},
  volume    = {47},
  number    = {3},
  pages     = {663--698},
  year      = {2021},
  month     = nov,
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/coli_a_00412},
  url       = {https://aclanthology.org/2021.cl-3.20},
  abstract  = {Word embeddings are vectorial semantic representations built with either counting or predicting techniques aimed at capturing shades of meaning from word co-occurrences. Since their introduction, these representations have been criticized for lacking interpretable dimensions. This property of word embeddings limits our understanding of the semantic features they actually encode. Moreover, it contributes to the {``}black box{''} nature of the tasks in which they are used, since the reasons for word embedding performance often remain opaque to humans. In this contribution, we explore the semantic properties encoded in word embeddings by mapping them onto interpretable vectors, consisting of explicit and neurobiologically motivated semantic features (Binder et al. 2016). Our exploration takes into account different types of embeddings, including factorized count vectors and predict models (Skip-Gram, GloVe, etc.), as well as the most recent contextualized representations (i.e., ELMo and BERT). In our analysis, we first evaluate the quality of the mapping in a retrieval task, then we shed light on the semantic features that are better encoded in each embedding type. A large number of probing tasks is finally set to assess how the original and the mapped embeddings perform in discriminating semantic categories. For each probing task, we identify the most relevant semantic features and we show that there is a correlation between the embedding performance and how they encode those features. This study sets itself as a step forward in understanding which aspects of meaning are captured by vector spaces, by proposing a new and simple method to carve human-interpretable semantic representations from distributional vectors.}
}

@article{choi_choi_2010,
  author   = {Choi, Dong-Won and Choi, Incheol},
  title    = {A Comparison of Hindsight Bias in Groups and Individuals: The Moderating Role of Plausibility},
  journal  = {Journal of Applied Social Psychology},
  volume   = {40},
  number   = {2},
  pages    = {325--343},
  year     = {2010},
  issn     = {0021-9029},
  url      = {https://search.ebscohost.com/login.aspx?direct=true&db=sxi&AN=48116256&site=ehost-live},
  keywords = {SOCIAL psychology research, HINDSIGHT bias (Psychology), PREJUDICES, MEMORY, HUMAN behavior research, SOCIAL groups, PLAUSIBILITY (Logic)},
  abstract = {We compared the magnitude of the hindsight bias in individuals and groups with the prediction that the plausibility of an outcome would affect the magnitude of the group–individual difference. We provided groups and individuals with outcomes of scientific studies, and asked them to predict the probability of those outcomes as if they did not know the given outcomes and to report their level of surprise at the outcomes. Overall, groups were more prone to hindsight bias than were individuals, but the group–individual difference was present only when the given outcomes were relatively implausible (Study 1). Moreover, this difference was not eliminated even when participants were asked to consider alternative outcomes (Study 2). Implications are discussed. [ABSTRACT FROM AUTHOR]}
}

@article{choi_nisbett_2000,
  author    = {Choi, Incheol and Nisbett, Richard E},
  title     = {Cultural Psychology of Surprise: Holistic Theories and Recognition of Contradiction},
  journal   = {Journal of Personality and Social Psychology},
  volume    = {79},
  number    = {6},
  pages     = {890--905},
  year      = {2000},
  publisher = {American Psychological Association},
  address   = {Washington, DC},
  issn      = {0022-3514},
  language  = {eng},
  copyright = {2000 American Psychological Association},
  keywords  = {Adult ; Asians ; Attitude formation ; Attitudes ; Behavior ; Biological and medical sciences ; Cognition ; Concept Formation ; Cross Cultural Differences ; Cross-Cultural Comparison ; Cultural differences ; Culture ; Emotional Responses ; Emotions ; Ethnic Groups - psychology ; Expectations ; Female ; Fundamental and applied biological sciences. Psychology ; Helping Behavior ; Holism ; Human ; Humans ; Korea ; Logic ; Male ; Probability Learning ; Psychology ; Psychology. Psychoanalysis. Psychiatry ; Psychology. Psychophysiology ; Response Bias ; Social attribution, perception and cognition ; Social Perception ; Social psychology ; U.S.A ; United States},
  abstract  = {The authors tested the hypothesis that East Asians, because of their holistic reasoning, take contradiction and inconsistency for granted and consequently are less likely than Americans to experience surprise. Studies 1 and 2 showed that Korean participants displayed less surprise and greater hindsight bias than American participants did when a target's behavior contradicted their expectations. Studies 3 and 4 further demonstrated that even when contradiction was created in highly explicit ways, Korean participants experienced little surprise, whereas American participants reported substantial surprise. We discuss the implications of these findings for various issues, including the psychology of conviction, cognitive dissonance, and the development of science.}
}

@article{chung_pennebaker_2008,
  author   = {Cindy K. Chung and James W. Pennebaker},
  title    = {Revealing dimensions of thinking in open-ended self-descriptions: An automated meaning extraction method for natural language},
  journal  = {Journal of Research in Personality},
  volume   = {42},
  number   = {1},
  pages    = {96--132},
  year     = {2008},
  issn     = {0092-6566},
  doi      = {10.1016/j.jrp.2007.04.006},
  url      = {https://www.sciencedirect.com/science/article/pii/S0092656607000451},
  keywords = {LIWC, Meaning extraction method, Natural language, Self-descriptions},
  abstract = {A new method for extracting common themes from written text is introduced and applied to 1165 open-ended self-descriptive narratives. Drawing on a lexical approach to personality, the most commonly-used adjectives within narratives written by college students were identified using computerized text analytic tools. A factor analysis on the use of these adjectives in the self-descriptions produced a 7-factor solution consisting of psychologically meaningful dimensions. Some dimensions were unipolar (e.g., Negativity factor, wherein most loaded items were negatively valenced adjectives); others were dimensional in that semantically opposite words clustered together (e.g., Sociability factor, wherein terms such as shy, outgoing, reserved, and loud all loaded in the same direction). The factors exhibited modest reliability across different types of writing samples and were correlated with self-reports and behaviors consistent with the dimensions. Similar analyses with additional content words (adjectives, adverbs, nouns, and verbs) yielded additional psychological dimensions associated with physical appearance, school, relationships, etc. in which people contextualize their self-concepts. The results suggest that the meaning extraction method is a promising strategy that determines the dimensions along which people think about themselves.}
}

@inproceedings{cohan_etal_2018,
  author    = {Cohan, Arman and Desmet, Bart and Yates, Andrew and Soldaini, Luca and MacAvaney, Sean and Goharian, Nazli},
  title     = {{SMHD}: a Large-Scale Resource for Exploring Online Language Usage for Multiple Mental Health Conditions},
  editor    = {Bender, Emily M. and Derczynski, Leon and Isabelle, Pierre},
  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
  pages     = {1485--1497},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Santa Fe, New Mexico, USA},
  url       = {https://aclanthology.org/C18-1126},
  abstract  = {Mental health is a significant and growing public health concern. As language usage can be leveraged to obtain crucial insights into mental health conditions, there is a need for large-scale, labeled, mental health-related datasets of users who have been diagnosed with one or more of such conditions. In this paper, we investigate the creation of high-precision patterns to identify self-reported diagnoses of nine different mental health conditions, and obtain high-quality labeled data without the need for manual labelling. We introduce the SMHD (Self-reported Mental Health Diagnoses) dataset and make it available. SMHD is a novel large dataset of social media posts from users with one or multiple mental health conditions along with matched control users. We examine distinctions in users{'} language, as measured by linguistic and psychological variables. We further explore text classification methods to identify individuals with mental conditions through their language.}
}

@article{creegan_1944,
  author    = {Creegan, Robert F.},
  title     = {The phenomenological analysis of personal documents},
  journal   = {The Journal of Abnormal and Social Psychology},
  volume    = {39},
  number    = {2},
  pages     = {244--266},
  year      = {1944},
  publisher = {American Psychological Association},
  address   = {US},
  issn      = {0096-851X},
  doi       = {10.1037/h0062816},
  url       = {https://doi.org/10.1037/h0062816},
  keywords  = {*Narratives; *Personality; Phenomenology},
  abstract  = {The diary of a college boy is analyzed to illustrate the method of integral phenomenology. In comparing the personal worlds revealed by such documents, topical analysis is considered superficial. The content is analyzed, rather, by categories of form (complexity of expression and evaluation); change (in personal world); ideas of causation; values; plenitude (intensity of experience); direction (origin of values and actions); and distance (from objects and topics of interest and from conventional norms). The writer also considers the content symbolic of three complexes, apparently unrelated to these categories but suggested by the case. (PsycInfo Database Record (c) 2021 APA, all rights reserved)}
}

@book{crystal_1997,
  author    = {Crystal, David},
  title     = {The {Cambridge} Encyclopedia of Language},
  edition   = {Second},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = {1997},
  isbn      = {0521550505},
  lccn      = {96003104},
  language  = {eng}
}

@article{curini_valerio_2021,
  author   = {Curini, Luigi and Vignoli, Valerio},
  title    = {Committed Moderates and Uncommitted Extremists: Ideological Leaning and Parties’ Narratives on Military Interventions in {Italy}},
  journal  = {Foreign Policy Analysis},
  volume   = {17},
  number   = {3},
  pages    = {orab016},
  year     = {2021},
  month    = may,
  issn     = {1743-8586},
  doi      = {10.1093/fpa/orab016},
  url      = {https://doi.org/10.1093/fpa/orab016},
  abstract = {Current research highlights that ideology decisively affects political contestation concerning peace and security operations in European countries. In particular, recent studies suggest that party preferences on this issue follow a curvilinear distribution along the left-right axis, delineating a conflict between moderate and extreme parties. However, the impact of this cleavage on the use of strategic narratives to either support or criticize these missions requires more attention. This article aims to fill this gap by employing seeded latent Dirichlet allocation, a semi-supervised automated text analysis method, to analyze parliamentary debates on Italy's most significant troop deployments between 1994 and 2013. We expect to find that while moderates express a supportive narrative aimed at justifying the use of force, extremists attempt to delegitimize military interventions. Accordingly, we hypothesize that moderate parties emphasize more on the multilateral and humanitarian framework of a mission, while extremist parties focus more on its military means. The empirical findings largely confirm our hypotheses. By means of its method and results, the article contributes both empirically and methodologically to the debate on the party politics of military interventions in Europe.}
}

@article{dideriksen_etal_2023,
  author  = {Dideriksen, C. and Christiansen, M. H. and Tyl{\'e}n, K. and Dingemanse, M. and Fusaroli, R.},
  title   = {Quantifying the interplay of conversational devices in building mutual understanding},
  journal = {Journal of Experimental Psychology: General},
  volume  = {152},
  number  = {3},
  pages   = {864--889},
  year    = {2023},
  doi     = {10.1037/xge0001301}
}

@inproceedings{dingemanse_liesenfeld_2022,
  author    = {Dingemanse, Mark and Liesenfeld, Andreas},
  title     = {From text to talk: {H}arnessing conversational corpora for humane and diversity-aware language technology},
  editor    = {Muresan, Smaranda and Nakov, Preslav and Villavicencio, Aline},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Dublin, Ireland},
  year      = {2022},
  month     = may,
  pages     = {5614--5633},
  doi       = {10.18653/v1/2022.acl-long.385},
  url       = {https://aclanthology.org/2022.acl-long.385},
  abstract  = {Informal social interaction is the primordial home of human language. Linguistically diverse conversational corpora are an important and largely untapped resource for computational linguistics and language technology. Through the efforts of a worldwide language documentation movement, such corpora are increasingly becoming available. We show how interactional data from 63 languages (26 families) harbours insights about turn-taking, timing, sequential structure and social action, with implications for language technology, natural language understanding, and the design of conversational interfaces. Harnessing linguistically diverse conversational corpora will provide the empirical foundations for flexible, localizable, humane language technologies of the future.},
}

@article{davies_2009,
  author  = {Davies, Mark},
  title   = {The 385+ million word {Corpus of Contemporary American English} (1990--2008+): Design, architecture, and linguistic insights},
  journal = {International Journal of Corpus Linguistics},
  year    = {2009},
  volume  = {14},
  number  = {2},
  pages   = {159--190},
  doi     = {10.1075/ijcl.14.2.02dav},
  url     = {https://www.english-corpora.org/coca/}
}

@article{deerwester_etal_1990,
  author    = {Deerwester, Scott and Dumais, Susan T. and Furnas, George W. and Landauer, Thomas K. and Harshman, Richard A.},
  title     = {Indexing by Latent Semantic Analysis},
  journal   = {Journal of the American Society for Information Science},
  year      = {1990},
  volume    = {41},
  number    = {6},
  pages     = {391--407},
  publisher = {John Wiley \& Sons, Ltd},
  doi       = {10.1002/(SICI)1097-4571(199009)41:6<391::AID-ASI1>3.0.CO;2-9},
}

@misc{devlin_etal_2019,
  author        = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  year          = {2019},
  eprint        = {1810.04805},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@article{diveica_etal_2023,
  author    = {Diveica, Veronica and Pexman, Penny M and Binney, Richard J},
  title     = {Quantifying social semantics: An inclusive definition of socialness and ratings for 8388 {English} words},
  journal   = {Behavior Research Methods},
  year      = {2023},
  volume    = {55},
  number    = {2},
  pages     = {461--473},
  publisher = {Springer}
}

@article{downs_etal_2017,
  author  = {Downs, Johnny M and Velupillai, Sumithra and Gkotsis, George and Holden, Rachel and Kikoler, Maxim and Dean, Harry and Fernandes, Andrea C. and Dutta, Rina},
  title   = {Detection of Suicidality in Adolescents with Autism Spectrum Disorders: Developing a Natural Language Processing Approach for Use in Electronic Health Records},
  journal = {AMIA Annual Symposium Proceedings},
  year    = {2017},
  volume  = {2017},
  pages   = {641--649},
  url     = {https://api.semanticscholar.org/CorpusID:7388358}
}

@article{duran_etal_2019,
  author    = {Duran, Nicholas D. and Paxton, Alexandra and Fusaroli, Riccardo},
  title     = {{ALIGN}: Analyzing Linguistic Interactions With Generalizable {techNiques}---A {Python} Library},
  journal   = {Psychological Methods},
  year      = {2019},
  volume    = {24},
  number    = {4},
  pages     = {419--438},
  publisher = {American Psychological Association},
  address   = {United States},
  issn      = {1082-989X},
  copyright = {2019 American Psychological Association},
  abstract  = {Linguistic alignment (LA) is the tendency during a conversation to reuse each other's linguistic expressions, including lexical, conceptual, or syntactic structures. LA is often argued to be a crucial driver in reciprocal understanding and interpersonal rapport, though its precise dynamics and effects are still controversial. One barrier to more systematic investigation of these effects lies in the diversity in the methods employed to analyze LA, which makes it difficult to integrate and compare results of individual studies. To overcome this issue, we have developed ALIGN (Analyzing Linguistic Interactions with Generalizable techNiques), an open-source Python package to measure LA in conversation (https://pypi.python.org/pypi/align) along with in-depth open-source tutorials hosted on ALIGN's GitHub repository (https://github.com/nickduran/align-linguistic-alignment). Here, we first describe the challenges in the study of LA and outline how ALIGN can address them. We then demonstrate how our analytical protocol can be applied to theory-driven questions using a complex corpus of dialogue (the Devil's Advocate corpus; Duran \& Fusaroli, 2017). We close by identifying further challenges and point to future developments of the field.},
}

@article{eichstaedt_etal_2015,
  author   = {Johannes C. Eichstaedt and H. Andrew Schwartz and Margaret L. Kern and Gregory Park and Darwin R. Labarthe and Raina M. Merchant and Sneha Jha and Megha Agrawal and Lukasz A. Dziurzynski and Maarten Sap and Christopher Weeg and Emily E. Larson and Lyle H. Ungar and Martin E. P. Seligman},
  title    = {Psychological Language on {Twitter} Predicts County-Level Heart Disease Mortality},
  journal  = {Psychological Science},
  year     = {2015},
  volume   = {26},
  number   = {2},
  pages    = {159--169},
  doi      = {10.1177/0956797614557867},
  url      = {https://osf.io/rt6w2/},
  note     = {PMID: 25605707},
  abstract = {Hostility and chronic stress are known risk factors for heart disease, but they are costly to assess on a large scale. We used language expressed on Twitter to characterize community-level psychological correlates of age-adjusted mortality from atherosclerotic heart disease (AHD). Language patterns reflecting negative social relationships, disengagement, and negative emotions—especially anger—emerged as risk factors; positive emotions and psychological engagement emerged as protective factors. Most correlations remained significant after controlling for income and education. A cross-sectional regression model based only on Twitter language predicted AHD mortality significantly better than did a model that combined 10 common demographic, socioeconomic, and health risk factors, including smoking, diabetes, hypertension, and obesity. Capturing community psychological characteristics through social media is feasible, and these characteristics are strong markers of cardiovascular mortality at the community level.}
}

@article{engelthaler_hills_2018,
  author    = {Engelthaler, Tomas and Hills, Thomas T},
  title     = {Humor norms for 4,997 {English} words},
  journal   = {Behavior Research Methods},
  year      = {2018},
  volume    = {50},
  pages     = {1116--1124},
  publisher = {Springer}
}

@misc{ethayarajh_2019,
  author        = {Ethayarajh, Kawin},
  title         = {How Contextual are Contextualized Word Representations? Comparing the Geometry of {BERT}, {ELMo}, and {GPT-2} Embeddings},
  year          = {2019},
  eprint        = {1909.00512},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@misc{ethayarajh_etal_2019,
  author        = {Ethayarajh, Kawin and Duvenaud, David and Hirst, Graeme},
  title         = {Towards Understanding Linear Word Analogies},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1810.04882},
  primaryClass  = {cs.CL}
}

@article{fedorenko_varley_2016,
  author  = {Fedorenko, Evelina and Varley, Rosemary},
  title   = {Language and thought are not the same thing: Evidence from neuroimaging and neurological patients},
  journal = {Annals of the New York Academy of Sciences},
  year    = {2016},
  month   = apr,
  volume  = {1369},
  number  = {1},
  pages   = {132--153},
  doi     = {10.1111/nyas.13046}
}

@article{feng_etal_2015,
  author    = {Feng, Shi and Song, Kaisong and Wang, Daling and Yu, Ge},
  title     = {A word-emoticon mutual reinforcement ranking model for building sentiment lexicon from massive collection of microblogs},
  journal   = {World Wide Web},
  year      = {2015},
  volume    = {18},
  number    = {4},
  pages     = {949--967},
  publisher = {Springer US},
  address   = {New York},
  issn      = {1386-145X},
  language  = {eng},
  copyright = {Springer Science+Business Media New York 2014},
  keywords  = {Computer science ; Database management ; Digital media ; Collection ; Social networks ; Thesauri ; Data mining ; Microblogs ; Sentiment analysis ; Social media ; Data Science ; Natural Language Processing ; Artificial intelligence},
}

@unpublished{frimer_etal_2019,
  author = {Frimer, Jeremy A and Boghrati, Reihane and Haidt, Jonathan and Graham, Jesse and Dehghani, Morteza},
  title  = {Moral foundations dictionary for linguistic analyses 2.0},
  year   = {2019},
  note   = {Unpublished manuscript}
}

@article{gagne_etal_2005,
  author  = {Gagné, Christina and Spalding, Thomas and Ji, Hongbo},
  title   = {Re-examining evidence for the use of independent relational representations during conceptual combination},
  journal = {Journal of Memory and Language},
  year    = {2005},
  month   = sep,
  volume  = {53},
  pages   = {445--455},
  doi     = {10.1016/j.jml.2005.03.006}
}

@article{gale_sampson_1995,
  author    = {Gale, William A. and Sampson, Geoffrey},
  title     = {{Good-Turing} frequency estimation without tears},
  journal   = {Journal of Quantitative Linguistics},
  year      = {1995},
  month     = jan,
  volume    = {2},
  number    = {3},
  pages     = {217--237},
  publisher = {Routledge},
  issn      = {0929-6174},
  doi       = {10.1080/09296179508590051},
  url       = {https://doi.org/10.1080/09296179508590051},
  urldate   = {2024-04-08},
  abstract  = {Linguists and speech researchers who use statistical methods often need to estimate the frequency of some type of item in a population containing items of various types. A common approach is to divide the number of cases observed in a sample by the size of the sample; sometimes small positive quantities are added to divisor and dividend in order to avoid zero estimates for types missing from the sample. These approaches are obvious and simple, but they lack principled justification, and yield estimates that can be wildly inaccurate. I.J. Good and Alan Turing developed a family of theoretically well‐founded techniques appropriate to this domain. Some versions of the Good‐Turing approach are very demanding computationally, but we define a version, the Simple Good‐Turing estimator, which is straightforward to use. Tested on a variety of natural‐language‐related data sets, the Simple Good‐Turing estimator performs well, absolutely and relative both to the approaches just discussed and to other, more sophisticated techniques.},
}

@inproceedings{ganesan_etal_2021,
  author    = {Ganesan, Adithya and Matero, Matthew and Ravula, Aravind Reddy and Vu, Huy and Schwartz, H. Andrew},
  title     = {Empirical Evaluation of Pre-trained Transformers for Human-Level {NLP}: The Role of Sample Size and Dimensionality},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher = {Association for Computational Linguistics},
  year      = {2021},
  doi       = {10.18653/v1/2021.naacl-main.357},
  url       = {https://doi.org/10.18653/v1/2021.naacl-main.357},
}

@misc{gao_etal_2019,
  author        = {Gao, Jun and He, Di and Tan, Xu and Qin, Tao and Wang, Liwei and Liu, Tie-Yan},
  title         = {Representation Degeneration Problem in Training Natural Language Generation Models},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1907.12009},
  primaryClass  = {cs.CL}
}

@article{garcia_sikstrom_2013,
  author   = {Garcia, Danilo and Sikström, Sverker},
  title    = {Quantifying the Semantic Representations of Adolescents’ Memories of Positive and Negative Life Events},
  journal  = {Journal of Happiness Studies},
  year     = {2013},
  month    = aug,
  volume   = {14},
  number   = {4},
  pages    = {1309--1323},
  keywords = {Quantitative semantic; Adolescence; Affect; Autobiographical memory; Happiness; Latent semantic analysis},
  doi      = {10.1007/s10902-012-9385-8},
  abstract = {We quantified the semantic content in adolescents’ descriptions of positive and negative life events and studied how these descriptions are related to the assessment subjective well-being (SWB) at two points in time. The semantic content of the descriptions was quantified by latent semantic analysis (LSA). LSA is a computational method based on algorithms stemming from computational linguistics, where a high dimensional semantic representation of words can be generated from co-occurrence of words in huge text corpora. We investigated if the semantic content of written autobiographical memories of positive and negative life events predicted traditionally ranked measures of SWB, i.e., self-reports of Positive and Negative Affect, and thus created semantic measures of SWB. Such measures can be used to investigate the relationship between semantic content and SWB, which could only indirectly be accomplished by the ranked data. Pupils wrote down positive or negative life events during the last 3 months and self-reported SWB. Four weeks later, participants were presented with their own description and asked to report current SWB. The results showed that the semantic representation predicted SWB and experimental conditions. The agreement between semantic and ranked measures supports the validity of the semantic scores. We argue that our proposed method for studying SWB provides new and essential information about well-being by the quantification of a richer set of information from adolescents’ own memories. Copyright Springer Science+Business Media B.V. 2013},
  url      = {https://ideas.repec.org/a/spr/jhappi/v14y2013i4p1309-1323.html}
}

@article{garten_etal_2018,
  author    = {Garten, Justin and Hoover, Joe and Johnson, Kate M and Boghrati, Reihane and Iskiwitch, Carol and Dehghani, Morteza},
  title     = {Dictionaries and distributions: Combining expert knowledge and large scale textual data content analysis: Distributed dictionary representation},
  journal   = {Behavior Research Methods},
  year      = {2018},
  volume    = {50},
  pages     = {344--361},
  publisher = {Springer}
}

@article{giuntini_etal_2020,
  author    = {Giuntini, Felipe T. and Cazzolato, Mirela T. and dos Reis, Maria de Jesus Dutra and Campbell, Andrew T. and Traina, Agma J. M. and Ueyama, Jó},
  title     = {A review on recognizing depression in social networks: challenges and opportunities},
  journal   = {Journal of Ambient Intelligence and Humanized Computing},
  year      = {2020},
  volume    = {11},
  number    = {11},
  pages     = {4713--4729},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin/Heidelberg},
  issn      = {1868-5137},
  language  = {eng},
  copyright = {Springer-Verlag GmbH Germany, part of Springer Nature 2020},
  keywords  = {Engineering ; Computational intelligence ; Artificial intelligence ; Digital media ; Mental illness ; Psychology, Pathological ; Mental health ; Affective disorders ; Social networks ; Depression, Mental ; Emotions ; Appetite ; Data Science ; Data mining ; Sleep disorders ; Suicidal behavior ; Insomnia ; Self-perception},
  abstract  = {Social networks have become another resource for supporting mental health specialists in making inferences and finding indications of mental disorders, such as depression. This paper addresses the state-of-the-art regarding studies on recognition of depressive mood disorders in social networks through approaches and techniques of sentiment and emotion analysis. The systematic research conducted focused on social networks, social media, and the most employed techniques, feelings, and emotions were analyzed to find predecessors of a depressive disorder. Discussions on the research gaps identified aimed at improving the effectiveness of the analysis process, bringing the analysis close to the user’s reality. Twitter, Facebook, Blogs and Forums, Reddit, Live Journal, and Instagram are the most employed social networks regarding the identification of depressive mood disorders, and the most used information was text, followed by emoticons, user log information, and images. The selected studies usually employ classic off-the-shelf classifiers for the analysis of the available information, combined with lexicons such as NRC Word-Emoticon Association Lexicon, WordNet-Affect, Anew, and LIWC tool. The challenges include the analysis of temporal information and a combination of different types of information.},
}

@misc{goldberg_levy_2014,
  author        = {Goldberg, Yoav and Levy, Omer},
  title         = {{word2vec} Explained: deriving {Mikolov} et al.'s negative-sampling word-embedding method},
  year          = {2014},
  eprint        = {1402.3722},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@article{golder_macy_2011,
  author   = {Golder, Scott A. and Macy, Michael W.},
  title    = {Diurnal and Seasonal Mood Vary with Work, Sleep, and Daylength Across Diverse Cultures},
  journal  = {Science},
  year     = {2011},
  volume   = {333},
  number   = {6051},
  pages    = {1878--1881},
  doi      = {10.1126/science.1202775},
  url      = {https://www.science.org/doi/abs/10.1126/science.1202775},
  abstract = {Across the world the collective mood heightens at breakfast time and during the weekend. We identified individual-level diurnal and seasonal mood rhythms in cultures across the globe, using data from millions of public Twitter messages. We found that individuals awaken in a good mood that deteriorates as the day progresses—which is consistent with the effects of sleep and circadian rhythm—and that seasonal change in baseline positive affect varies with change in daylength. People are happier on weekends, but the morning peak in positive affect is delayed by 2 hours, which suggests that people awaken later on weekends.}
}

@article{good_1953,
  author    = {Good, I. J.},
  title     = {The {Population} {Frequencies} of {Species} and the {Estimation} of {Population} {Parameters}},
  journal   = {Biometrika},
  year      = {1953},
  volume    = {40},
  number    = {3/4},
  pages     = {237--264},
  publisher = {Oxford University Press and Biometrika Trust},
  issn      = {0006-3444},
  doi       = {10.2307/2333344},
  url       = {https://www.jstor.org/stable/2333344},
  urldate   = {2024-04-08},
  abstract  = {A random sample is drawn from a population of animals of various species. (The theory may also be applied to studies of literary vocabulary, for example.) If a particular species is represented r times in the sample of size N, then r/N is not a good estimate of the population frequency, p, when r is small. Methods are given for estimating p, assuming virtually nothing about the underlying population. The estimates are expressed in terms of smoothed values of the numbers nr (r = 1, 2, 3...), where nr is the number of distinct species that are each represented r times in the sample. (nr may be described as `the frequency of the frequency r'.) Turing is acknowledged for the most interesting formula in this part of the work. An estimate of the proportion of the population represented by the species occurring in the sample is an immediate corollary. Estimates are made of measures of heterogeneity of the population, including Yule's characteristic' and Shannon's entropy'. Methods are then discussed that do depend on assumptions about the underlying population. It is here that most work has been done by other writers. It is pointed out that a hypothesis can give a good fit to the numbers nr but can give quite the wrong value for Yule's characteristic. An example of this is Fisher's fit to some data of Williams's on Macrolepidoptera.},
  file      = {JSTOR Full Text PDF:/Users/louisteitelbaum/Zotero/storage/E5Q73MJK/Good - 1953 - The Population Frequencies of Species and the Esti.pdf:application/pdf},
}

@article{good_2000,
  author    = {Good, I. J.},
  title     = {{Turing}’s anticipation of empirical {Bayes} in connection with the cryptanalysis of the naval {Enigma}},
  journal   = {Journal of Statistical Computation and Simulation},
  year      = {2000},
  month     = may,
  volume    = {66},
  number    = {2},
  pages     = {101--111},
  publisher = {Taylor \& Francis},
  issn      = {0094-9655},
  doi       = {10.1080/00949650008812016},
  url       = {https://doi.org/10.1080/00949650008812016},
  urldate   = {2024-04-09},
  keywords  = {bletchley park, coverage of a sample, cryptology, empirical bayes, enigma, language statistics, probabilities of unobserved events, species sampling, Turing, word frequencies},
  abstract  = {The Enigma was a cryptographic (enciphering) machine used by the German military during WWII. The German navy changed part of the Enigma keys every other day. One of the important cryptanalytic attacks against the naval usage was called Banburismus, a sequentiai Bayesian procedure (anticipating sequential analysis) which was used from the sorine of 1941 until the middle of 1943. It was invented mainlv bv A. M. Turina and was perhaps the first important sequential Bayesian IE is unnecessab to describe it here. Before Banburismus could be started on a given day it was necessary to identifv which of nine ‘biaram’ (or ‘diaraph’) tables was in use on that day. In Turing’s approach to this identification hk had io istimate the probabilities of certain ‘trigraphs’. rrhese trigraphs were used. as described below. for determinine the initial wheel settings of messages). For estimatidg the probabilities, Turing inventedin important special case o the nonparametric (nonhypermetric) Empirid Bayes method independently of Herbert Robbins. The techniaue is the sumxisine form of Emdrical Baves in which a physical prior is assumed to eist but no apbroxiGate functional fonn is assumed for it.},
}

@book{gottschalk_etal_1969,
  author    = {Gottschalk, Louis A. and Gleser, Goldine C. and Levine, Maurice},
  title     = {The Measurement of Psychological States Through the Content Analysis of Verbal Behavior},
  publisher = {University of California Press},
  address   = {Berkeley, CA},
  year      = {1969},
  note      = {Reprinted 2020},
  isbn      = {0-520-37676-5}
}

@article{grand_etal_2022,
  author    = {Grand, Gabriel and Blank, Idan Asher and Pereira, Francisco and Fedorenko, Evelina},
  title     = {Semantic projection recovers rich human knowledge of multiple object features from word embeddings},
  journal   = {Nature Human Behaviour},
  year      = {2022},
  month     = jul,
  volume    = {6},
  number    = {7},
  pages     = {975--987},
  publisher = {Nature Publishing Group},
  issn      = {2397-3374},
  doi       = {10.1038/s41562-022-01316-8},
  url       = {https://www.nature.com/articles/s41562-022-01316-8},
  urldate   = {2024-04-07},
  language  = {en},
  copyright = {2022 The Author(s), under exclusive licence to Springer Nature Limited},
  keywords  = {Human behaviour, Language and linguistics, Psychology},
  abstract  = {How is knowledge about word meaning represented in the mental lexicon? Current computational models infer word meanings from lexical co-occurrence patterns. They learn to represent words as vectors in a multidimensional space, wherein words that are used in more similar linguistic contexts—that is, are more semantically related—are located closer together. However, whereas inter-word proximity captures only overall relatedness, human judgements are highly context dependent. For example, dolphins and alligators are similar in size but differ in dangerousness. Here, we use a domain-general method to extract context-dependent relationships from word embeddings: ‘semantic projection’ of word-vectors onto lines that represent features such as size (the line connecting the words ‘small’ and ‘big’) or danger (‘safe’ to ‘dangerous’), analogous to ‘mental scales’. This method recovers human judgements across various object categories and properties. Thus, the geometry of word embeddings explicitly represents a wealth of context-dependent world knowledge.}
}

@inproceedings{grave_etal_2018,
  author    = {Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
  title     = {Learning Word Vectors for 157 Languages},
  year      = {2018},
  booktitle = {Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)}
}

@misc{grootendorst_2022,
  author        = {Grootendorst, Maarten},
  title         = {{BERTopic}: Neural topic modeling with a class-based {TF-IDF} procedure},
  year          = {2022},
  eprint        = {2203.05794},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@article{gunther_etal_2019,
  author   = {Günther, Fritz and Rinaldi, Luca and Marelli, Marco},
  title    = {Vector-Space Models of Semantic Representation From a Cognitive Perspective: A Discussion of Common Misconceptions},
  journal  = {Perspectives on Psychological Science},
  year     = {2019},
  volume   = {14},
  number   = {6},
  pages    = {1006--1033},
  doi      = {10.1177/1745691619861372},
  url      = {https://doi.org/10.1177/1745691619861372},
  note     = {PMID: 31505121},
  abstract = {Models that represent meaning as high-dimensional numerical vectors—such as latent semantic analysis (LSA), hyperspace analogue to language (HAL), bound encoding of the aggregate language environment (BEAGLE), topic models, global vectors (GloVe), and word2vec—have been introduced as extremely powerful machine-learning proxies for human semantic representations and have seen an explosive rise in popularity over the past 2 decades. However, despite their considerable advancements and spread in the cognitive sciences, one can observe problems associated with the adequate presentation and understanding of some of their features. Indeed, when these models are examined from a cognitive perspective, a number of unfounded arguments tend to appear in the psychological literature. In this article, we review the most common of these arguments and discuss (a) what exactly these models represent at the implementational level and their plausibility as a cognitive theory, (b) how they deal with various aspects of meaning such as polysemy or compositionality, and (c) how they relate to the debate on embodied and grounded cognition. We identify common misconceptions that arise as a result of incomplete descriptions, outdated arguments, and unclear distinctions between theory and implementation of the models. We clarify and amend these points to provide a theoretical basis for future research and discussions on vector models of semantic representation.}
}


@article{harris_1954,
  author    = {Harris, Zellig S.},
  title     = {Distributional {Structure}},
  journal   = {WORD},
  year      = {1954},
  month     = aug,
  volume    = {10},
  number    = {2-3},
  pages     = {146--162},
  publisher = {Routledge},
  issn      = {0043-7956},
  doi       = {10.1080/00437956.1954.11659520},
  url       = {https://doi.org/10.1080/00437956.1954.11659520},
  urldate   = {2024-05-01},
  file      = {Full Text PDF:/Users/louisteitelbaum/Zotero/storage/XGZVWYEZ/Harris - 1954 - Distributional Structure.pdf:application/pdf},
}

@article{hellman_2011,
  author    = {Hellman, Andrea B.},
  title     = {Vocabulary size and depth of word knowledge in adult-onset second language acquisition},
  journal   = {International Journal of Applied Linguistics},
  year      = {2011},
  volume    = {21},
  number    = {2},
  pages     = {162--182},
  publisher = {Blackwell Publishing Ltd},
  address   = {Oxford, UK},
  issn      = {0802-6106},
  language  = {eng},
  copyright = {2010 Blackwell Publishing Ltd},
  abstract  = {This study investigated whether adult‐onset second language (L2) learners achieve native level vocabulary after decades of immersion. Vocabulary tests were given to three groups of participants: highly successful adult‐onset learners of English, monolingual English speakers, and bilingual native speakers of English. Overall, the native speakers outperformed the non‐native speakers; however, the rate of native like achievement was remarkably high among the successful adult‐onset learners, which indicated that native level L2 vocabulary size and depth of word knowledge were attainable in adulthood. Factors that correlated with native level L2 vocabulary were: childhood caregivers' education, verbal ability and literacy in the native language, and interest in word learning and daily reading. The findings suggest that the lexicon may be the potentially most successful area of adult‐onset L2 learning.},
}

@article{holtgraves_2011,
  author   = {Holtgraves, Thomas},
  title    = {Text messaging, personality, and the social context},
  journal  = {Journal of Research in Personality},
  year     = {2011},
  volume   = {45},
  number   = {1},
  pages    = {92--99},
  issn     = {0092-6566},
  doi      = {10.1016/j.jrp.2010.11.015},
  url      = {https://www.sciencedirect.com/science/article/pii/S0092656610001698},
  keywords = {Texting, Language, Personality, Language use},
  abstract = {The purpose of this research was to undertake some analyses of how the language used in text messaging varies as a function of personality traits and the interpersonal context. After completing personality questionnaires, participants provided their most recent text messages and indicated their relationship with the message recipient on several dimensions. Correlations between Linguistic Inquiry and Word Count (LIWC) categories and personality traits and relationship status were examined. There were significant correlations between certain LIWC categories and extraversion (e.g., personal pronouns), neuroticism (e.g., negative emotion words) and agreeableness (e.g., positive emotion words), suggesting that personality traits are displayed in how one texts. One of the defining features of texting – linguistic alterations (e.g., abbreviations) – varied as a function of both personality traits and relationship status. Overall, the results provide a snapshot of what text messages look like, and how they reflect the texter’s personality and the interpersonal context.}
}

@inproceedings{hovy_spruit_2016,
  author    = {Hovy, Dirk and Spruit, Shannon L.},
  title     = {The Social Impact of Natural Language Processing},
  editor    = {Erk, Katrin and Smith, Noah A.},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Berlin, Germany},
  year      = {2016},
  month     = aug,
  pages     = {591--598},
  doi       = {10.18653/v1/P16-2096},
  url       = {https://aclanthology.org/P16-2096},
}

@article{ireland_pennebaker_2010,
  author    = {Ireland, Molly E and Pennebaker, James W},
  title     = {Language Style Matching in Writing: Synchrony in Essays, Correspondence, and Poetry},
  journal   = {Journal of Personality and Social Psychology},
  volume    = {99},
  number    = {3},
  pages     = {549--571},
  year      = {2010},
  publisher = {American Psychological Association},
  address   = {Washington, DC},
  issn      = {0022-3514},
  language  = {eng},
  copyright = {2010 American Psychological Association},
  keywords  = {Social psychology ; Language ; Literature ; Human beings ; Letter writing ; Male ; Educational attainment ; Socioeconomic Factors ; Texas ; Writing ; Written communication ; Female ; Psychological aspects ; Language and languages ; Composition (Language arts) ; Social aspects ; Individual differences ; Linguistics ; Interpersonal relations ; Social interaction ; Correspondence ; Letters ; Personality ; College students ; Cognitive styles ; Essays ; Nonverbal communication ; Psychology ; Poetry},
  abstract  = {Each relationship has its own personality. Almost immediately after a social interaction begins, verbal and nonverbal behaviors become synchronized. Even in asocial contexts, individuals tend to produce utterances that match the grammatical structure of sentences they have recently heard or read. Three projects explore language style matching (LSM) in everyday writing tasks and professional writing. LSM is the relative use of 9 function word categories (e.g., articles, personal pronouns) between any 2 texts. In the first project, 2 samples totaling 1,744 college students answered 4 essay questions written in very different styles. Students automatically matched the language style of the target questions. Overall, the LSM metric was internally consistent and reliable across writing tasks. Women, participants of higher socioeconomic status, and students who earned higher test grades matched with targets more than others did. In the second project, 74 participants completed cliffhanger excerpts from popular fiction. Judges' ratings of excerpt-response similarity were related to content matching but not function word matching, as indexed by LSM. Further, participants were not able to intentionally increase style or content matching. In the final project, an archival study tracked the professional writing and personal correspondence of 3 pairs of famous writers across their relationships. Language matching in poetry and letters reflected fluctuations in the relationships of 3 couples: Sigmund Freud and Carl Jung, Elizabeth Barrett and Robert Browning, and Sylvia Plath and Ted Hughes. Implications for using LSM as an implicit marker of social engagement and influence are discussed.}
}

@misc{ji_etal_2021,
  author        = {Ji, Shaoxiong and Zhang, Tianlin and Ansari, Luna and Fu, Jie and Tiwari, Prayag and Cambria, Erik},
  title         = {{MentalBERT}: Publicly Available Pretrained Language Models for Mental Healthcare},
  year          = {2021},
  eprint        = {2110.15621},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@article{kauf_etal_2024,
  author   = {Kauf, Carina and Tuckute, Greta and Levy, Roger and Andreas, Jacob and Fedorenko, Evelina},
  title    = {Lexical-Semantic Content, Not Syntactic Structure, Is the Main Contributor to {ANN}-Brain Similarity of {fMRI} Responses in the Language Network},
  journal  = {Neurobiology of Language},
  volume   = {5},
  number   = {1},
  pages    = {7--42},
  year     = {2024},
  month    = apr,
  issn     = {2641-4368},
  doi      = {10.1162/nol_a_00116},
  url      = {https://doi.org/10.1162/nol_a_00116},
  eprint   = {https://direct.mit.edu/nol/article-pdf/5/1/7/2361108/nol_a_00116.pdf},
  abstract = {Representations from artificial neural network (ANN) language models have been shown to predict human brain activity in the language network. To understand what aspects of linguistic stimuli contribute to ANN-to-brain similarity, we used an fMRI data set of responses to n = 627 naturalistic English sentences (Pereira et al., 2018) and systematically manipulated the stimuli for which ANN representations were extracted. In particular, we (i) perturbed sentences’ word order, (ii) removed different subsets of words, or (iii) replaced sentences with other sentences of varying semantic similarity. We found that the lexical-semantic content of the sentence (largely carried by content words) rather than the sentence’s syntactic form (conveyed via word order or function words) is primarily responsible for the ANN-to-brain similarity. In follow-up analyses, we found that perturbation manipulations that adversely affect brain predictivity also lead to more divergent representations in the ANN’s embedding space and decrease the ANN’s ability to predict upcoming tokens in those stimuli. Further, results are robust as to whether the mapping model is trained on intact or perturbed stimuli and whether the ANN sentence representations are conditioned on the same linguistic context that humans saw. The critical result—that lexical-semantic content is the main contributor to the similarity between ANN representations and neural ones—aligns with the idea that the goal of the human language system is to extract meaning from linguistic strings. Finally, this work highlights the strength of systematic experimental manipulations for evaluating how close we are to accurate and generalizable models of the human language network.}
}

@incollection{kennedy_etal_2022,
  author    = {Kennedy, Brendan and Ashokkumar, Ashwini and Boyd, Ryan L. and Dehghani, Morteza},
  title     = {Text Analysis for Psychology: Methods, Principles, and Practices},
  booktitle = {Handbook of Language Analysis in Psychology},
  editor    = {Dehghani, Morteza and Boyd, Ryan L.},
  pages     = {3--64},
  publisher = {Guilford Press},
  year      = {2022},
  month     = jan,
  day       = {7},
  isbn      = {9781462548439},
  language  = {English}
}

@book{kenny_etal_2006,
  author    = {Kenny, David A. and Kashy, Deborah A. and Cook, William L.},
  title     = {Dyadic Data Analysis},
  series    = {Methodology in the Social Sciences},
  year      = {2006},
  publisher = {Guilford Press},
  address   = {New York, NY, US},
  pagetotal = {xix, 458},
  isbn      = {978-1-57230-986-9},
  keywords  = {*Analysis; *Data Collection; *Dyads; Interpersonal Interaction},
  abstract  = {Interpersonal phenomena such as attachment, conflict, person perception, learning, and influence have traditionally been studied by examining individuals in isolation, which falls short of capturing their truly interpersonal nature. This book offers state-of-the-art solutions to this age-old problem by presenting methodological and data-analytic approaches useful in investigating processes that take place among dyads: couples, coworkers, parent and child, teacher and student, or doctor and patient, to name just a few. Rich examples from psychology, sociology, family studies, and communication help build the researchers ability to conceptualize relationship processes; model and test for actor effects, partner effects, and relationship effects; and model and control for the statistical interdependence that can exist between partners. (PsycINFO Database Record (c) 2016 APA, all rights reserved)}
}

@inproceedings{kessler_2017,
  author    = {Kessler, Jason S.},
  title     = {{Scattertext}: a Browser-Based Tool for Visualizing how Corpora Differ},
  booktitle = {Proceedings of {ACL}-2017 System Demonstrations},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://github.com/JasonKessler/scattertext}
}

@misc{kjell_etal_2021,
  author    = {Kjell, Oscar and Giorgi, Salvatore and Schwartz, H. A.},
  title     = {The text-package: An {R}-package for Analyzing and Visualizing Human Language Using Natural Language Processing and Deep Learning},
  publisher = {PsyArXiv},
  year      = {2021},
  month     = apr,
  doi       = {10.31234/osf.io/293kt},
  url       = {https://osf.io/preprints/psyarxiv/293kt}
}

@article{kjell_etal_2022,
  author  = {Kjell, Oscar and Sikström, Sverker and Kjell, Katarina and Schwartz, H. Andrew},
  title   = {Natural language analyzed with {AI}-based transformers predict traditional subjective well-being measures approaching the theoretical upper limits in accuracy},
  journal = {Scientific Reports},
  volume  = {12},
  pages   = {3918},
  year    = {2022},
  month   = mar,
  doi     = {10.1038/s41598-022-07520-w}
}

@article{knief_forstmeier_2021,
  author  = {Knief, Ulrich and Forstmeier, Wolfgang},
  title   = {Violating the normality assumption may be the lesser of two evils},
  journal = {Behavior Research Methods},
  volume  = {53},
  year    = {2021},
  month   = may,
  doi     = {10.3758/s13428-021-01587-5}
}

@article{kosinski_etal_2013,
  author   = {Kosinski, Michal and Stillwell, David and Graepel, Thore},
  title    = {Private traits and attributes are predictable from digital records of human behavior},
  journal  = {Proceedings of the National Academy of Sciences},
  volume   = {110},
  number   = {15},
  pages    = {5802--5805},
  year     = {2013},
  doi      = {10.1073/pnas.1218772110},
  url      = {https://www.pnas.org/doi/abs/10.1073/pnas.1218772110},
  eprint   = {https://www.pnas.org/doi/pdf/10.1073/pnas.1218772110},
  abstract = {We show that easily accessible digital records of behavior, Facebook Likes, can be used to automatically and accurately predict a range of highly sensitive personal attributes including: sexual orientation, ethnicity, religious and political views, personality traits, intelligence, happiness, use of addictive substances, parental separation, age, and gender. The analysis presented is based on a dataset of over 58,000 volunteers who provided their Facebook Likes, detailed demographic profiles, and the results of several psychometric tests. The proposed model uses dimensionality reduction for preprocessing the Likes data, which are then entered into logistic/linear regression to predict individual psychodemographic profiles from Likes. The model correctly discriminates between homosexual and heterosexual men in 88\% of cases, African Americans and Caucasian Americans in 95\% of cases, and between Democrat and Republican in 85\% of cases. For the personality trait “Openness,” prediction accuracy is close to the test–retest accuracy of a standard personality test. We give examples of associations between attributes and Likes and discuss implications for online personalization and privacy.}
}

@article{kozlowski_etal_2019,
  author   = {Kozlowski, Austin C. and Taddy, Matt and Evans, James A.},
  title    = {The Geometry of Culture: Analyzing the Meanings of Class through Word Embeddings},
  journal  = {American Sociological Review},
  volume   = {84},
  number   = {5},
  pages    = {905--949},
  year     = {2019},
  doi      = {10.1177/0003122419877135},
  url      = {https://doi.org/10.1177/0003122419877135},
  eprint   = {https://doi.org/10.1177/0003122419877135},
  abstract = {We argue word embedding models are a useful tool for the study of culture using a historical analysis of shared understandings of social class as an empirical case. Word embeddings represent semantic relations between words as relationships between vectors in a high-dimensional space, specifying a relational model of meaning consistent with contemporary theories of culture. Dimensions induced by word differences (rich – poor) in these spaces correspond to dimensions of cultural meaning, and the projection of words onto these dimensions reflects widely shared associations, which we validate with surveys. Analyzing text from millions of books published over 100 years, we show that the markers of class continuously shifted amidst the economic transformations of the twentieth century, yet the basic cultural dimensions of class remained remarkably stable. The notable exception is education, which became tightly linked to affluence independent of its association with cultivated taste.}
}

@article{kronmal_1993,
  author   = {Kronmal, Richard A.},
  title    = {Spurious Correlation and the Fallacy of the Ratio Standard Revisited},
  journal  = {Journal of the Royal Statistical Society Series A},
  volume   = {156},
  number   = {3},
  pages    = {379--392},
  year     = {1993},
  url      = {https://www.jstor.org/stable/2983064},
  abstract = {Spurious correlation refers to the correlation between indices that have a common component. A ‘per ratio’ standard is based on a biological measurement adjusted for some physical measurement by division. Renowned statisticians and biologists (Pearson, Neyman and Tanner) have warned about the problems in interpretation that ratios cause. This warning has been largely ignored. The consequences of using a single ratio as either the dependent or one of the independent variables in a multiple‐regression analysis are described. It is shown that the use of ratios in regression analyses can lead to incorrect or misleading inferences. A recommendation is made that the use of ratios in regression analyses be avoided.}
}

@article{kulkarni_etal_2018,
  author    = {Kulkarni, Vivek and Kern, Margaret L. and Stillwell, David and Kosinski, Michal and Matz, Sandra and Ungar, Lyle and Skiena, Steven and Schwartz, H. Andrew},
  title     = {Latent human traits in the language of social media: An open-vocabulary approach},
  journal   = {PLOS ONE},
  publisher = {Public Library of Science},
  volume    = {13},
  number    = {11},
  pages     = {1--18},
  year      = {2018},
  month     = nov,
  doi       = {10.1371/journal.pone.0201703},
  url       = {https://doi.org/10.1371/journal.pone.0201703},
  abstract  = {Over the past century, personality theory and research has successfully identified core sets of characteristics that consistently describe and explain fundamental differences in the way people think, feel and behave. Such characteristics were derived through theory, dictionary analyses, and survey research using explicit self-reports. The availability of social media data spanning millions of users now makes it possible to automatically derive characteristics from behavioral data—language use—at large scale. Taking advantage of linguistic information available through Facebook, we study the process of inferring a new set of potential human traits based on unprompted language use. We subject these new traits to a comprehensive set of evaluations and compare them with a popular five factor model of personality. We find that our language-based trait construct is often more generalizable in that it often predicts non-questionnaire-based outcomes better than questionnaire-based traits (e.g. entities someone likes, income and intelligence quotient), while the factors remain nearly as stable as traditional factors. Our approach suggests a value in new constructs of personality derived from everyday human language use.}
}

@article{kuperman_etal_2012,
  author    = {Kuperman, Victor and Stadthagen-Gonzalez, Hans and Brysbaert, Marc},
  title     = {Age-of-acquisition ratings for 30,000 {English} words},
  journal   = {Behavior Research Methods},
  volume    = {44},
  pages     = {978--990},
  year      = {2012},
  publisher = {Springer}
}

@article{landauer_dumais_1997,
  author  = {Landauer, Thomas K. and Dumais, Susan T.},
  title   = {A Solution to {Plato's} Problem: The Latent Semantic Analysis Theory of Acquisition, Induction, and Representation of Knowledge},
  journal = {Psychological Review},
  volume  = {104},
  pages   = {211--240},
  year    = {1997},
  url     = {https://api.semanticscholar.org/CorpusID:1144461}
}

@book{laplace_1816,
  author       = {de Laplace, Pierre Simon},
  title        = {Essai philosophique sur les probabilités; par {M}. le comte {Laplace}},
  publisher    = {M.me v.e Courcier, impr.-libr. pour les mathématiques et la marine, quai des Augustins, no 57},
  year         = {1816},
  language     = {French},
  url          = {http://archive.org/details/bub_gb_vVzdR0tuoWAC},
  urldate      = {2024-04-08},
  copyright    = {http://creativecommons.org/publicdomain/mark/1.0/},
  collaborator = {{National Library of Naples}},
  keywords     = {bub\_upload}
}

@article{lazer_etal_2014,
  author   = {Lazer, David and Kennedy, Ryan and King, Gary and Vespignani, Alessandro},
  title    = {The Parable of {Google Flu}: Traps in Big Data Analysis},
  journal  = {Science},
  volume   = {343},
  number   = {6176},
  pages    = {1203--1205},
  year     = {2014},
  doi      = {10.1126/science.1248506},
  url      = {https://www.science.org/doi/abs/10.1126/science.1248506},
  eprint   = {https://www.science.org/doi/pdf/10.1126/science.1248506},
  abstract = {Large errors in flu prediction were largely avoidable, which offers lessons for the use of big data. In February 2013, Google Flu Trends (GFT) made headlines but not for a reason that Google executives or the creators of the flu tracking system would have hoped. Nature reported that GFT was predicting more than double the proportion of doctor visits for influenza-like illness (ILI) than the Centers for Disease Control and Prevention (CDC), which bases its estimates on surveillance reports from laboratories across the United States (1, 2). This happened despite the fact that GFT was built to predict CDC reports. Given that GFT is often held up as an exemplary use of big data (3, 4), what lessons can we draw from this error?}
}

@article{lazer_etal_2009,
  author  = {Lazer, David and Pentland, Alex and Adamic, Lada and Aral, Sinan and Barabási, Albert-László and Brewer, Devon and Christakis, Nicholas and Contractor, Noshir and Fowler, James and Gutmann, Myron and Jebara, Tony and King, Gary and Macy, Michael and Roy, Deb and Van Alstyne, Marshall},
  title   = {Computational Social Science},
  journal = {Science},
  volume  = {323},
  number  = {5915},
  pages   = {721--723},
  year    = {2009},
  doi     = {10.1126/science.1167742},
  url     = {https://www.science.org/doi/abs/10.1126/science.1167742},
  eprint  = {https://www.science.org/doi/pdf/10.1126/science.1167742}
}

@comment{NOTE(review): this inproceedings entry has no booktitle, so most styles
will warn about a missing required field. The title appears to match the
Golder and Macy (2011) article in Science, and the author list below may be a
metadata mix-up in the linked Semantic Scholar record -- TODO confirm the
authors and venue against the publisher before relying on this citation.}
@inproceedings{le_etal_2011,
  title={Diurnal and Seasonal Mood Vary with Work, Sleep, and Daylength Across Diverse Cultures},
  author={Hiep D. Le and Christopher Vollmers and Megumi Hatori and Michael Witcher and Julie Secombe},
  year={2011},
  url={https://api.semanticscholar.org/CorpusID:262247896}
}

@article{lee_etal_2023,
  author   = {Lee, Kihoon and Choi, Gyuho and Choi, Chang},
  title    = {Use all tokens method to improve semantic relationship learning},
  journal  = {Expert Systems with Applications},
  volume   = {233},
  pages    = {120911},
  year     = {2023},
  issn     = {0957-4174},
  doi      = {10.1016/j.eswa.2023.120911},
  url      = {https://www.sciencedirect.com/science/article/pii/S0957417423014136},
  keywords = {Natural language inference, Pretrained language model, Natural language understanding, Semantic relationship, Ensemble},
  abstract = {Recently, research on inference methods has been actively conducted to use language models more effectively for studying natural language understanding. Inference in language models that use bidirectional encoder representations from transformers (BERT) is performed using classification tokens that convey information from the input sentences. The use of single-token inference method for inference does not involve the hidden state vector that contains relevant connection information between the words, which in turn limits the ability to infer semantic relationships. This study proposes a use all tokens (UAT) method that combines unused tokens to improve inference methods through a single token. The UAT method effectively combines hidden state vectors and ensembles the global information of sentences with the local information between words. When the Stanford natural language inference (SNLI) corpus was solved using DeBERTaV3large, compared to the existing single token inference method, the UAT method improved the precision of the neutral relationship by 4.3\% (87.7\% vs. 92.0\%) and the recall of the entailment and contradiction relationship by an average of 2\% (93.5\% vs. 95.5\%). The UAT method proposed in this study can be readily implemented in BERT-based language models, and it enhances the accuracy and F1-score, thereby improving the learning of semantic relationships between sentences.}
}

@misc{lepennec_2023,
  author  = {Le Pennec, Erwan},
  title   = {{ggwordcloud}: A Word Cloud Geom for {ggplot2}},
  year    = {2023},
  url     = {https://lepennec.github.io/ggwordcloud/articles/ggwordcloud.html},
  journal = {lepennec.github.io/ggwordcloud/}
}

@inproceedings{levy_goldberg_2014,
  author    = {Levy, Omer and Goldberg, Yoav},
  title     = {Neural Word Embedding as Implicit Matrix Factorization},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Z. Ghahramani and M. Welling and C. Cortes and N. Lawrence and K. Q. Weinberger},
  volume    = {27},
  publisher = {Curran Associates, Inc.},
  year      = {2014},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2014/file/feab05aa91085b7a8012516bc3533958-Paper.pdf}
}

@misc{liu_etal_2019,
  author        = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin},
  title         = {{RoBERTa}: A Robustly Optimized {BERT} Pretraining Approach},
  year          = {2019},
  eprint        = {1907.11692},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@misc{li_etal_2020,
  author        = {Li, Bohan and Zhou, Hao and He, Junxian and Wang, Mingxuan and Yang, Yiming and Li, Lei},
  title         = {On the Sentence Embeddings from Pre-trained Language Models},
  year          = {2020},
  eprint        = {2011.05864},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@misc{mangalik_2023,
  author        = {Mangalik, Siddharth and Eichstaedt, Johannes C. and Giorgi, Salvatore and Mun, Jihu and Ahmed, Farhan and Gill, Gilvir and Ganesan, Adithya V. and Subrahmanya, Shashanka and Soni, Nikita and Clouston, Sean A. P. and Schwartz, H. Andrew},
  title         = {Robust language-based mental health assessments in time and space through social media},
  year          = {2023},
  eprint        = {2302.12952},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@misc{matero_etal_2022,
  author        = {Matero, Matthew and Hung, Albert and Schwartz, H. Andrew},
  title         = {Evaluating Contextual Embeddings and their Extraction Layers for Depression Assessment},
  year          = {2022},
  eprint        = {2112.13795},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@article{mehl_etal_2006,
  author  = {Mehl, Matthias R. and Gosling, Samuel D. and Pennebaker, James W.},
  title   = {Personality in its natural habitat: manifestations and implicit folk theories of personality in daily life},
  journal = {Journal of Personality and Social Psychology},
  volume  = {90},
  number  = {5},
  pages   = {862--877},
  year    = {2006},
  url     = {https://api.semanticscholar.org/CorpusID:2932332}
}

@article{mehta_etal_2020,
  author    = {Mehta, Yash and Majumder, Navonil and Gelbukh, Alexander and Cambria, Erik},
  title     = {Recent Trends in Deep Learning Based Personality Detection},
  journal   = {Artificial Intelligence Review},
  pages     = {2313--2339},
  year      = {2020},
  publisher = {Springer},
  doi       = {10.1007/s10462-019-09770-z},
  url       = {https://link.springer.com/article/10.1007/s10462-019-09770-z}
}

@article{mevik_wehrens_2007,
  author   = {Mevik, Björn-Helge and Wehrens, Ron},
  title    = {The {pls} Package: Principal Component and Partial Least Squares Regression in {R}},
  journal  = {Journal of Statistical Software},
  volume   = {18},
  number   = {2},
  pages    = {1--23},
  year     = {2007},
  doi      = {10.18637/jss.v018.i02},
  url      = {https://www.jstatsoft.org/index.php/jss/article/view/v018i02},
  abstract = {The pls package implements principal component regression (PCR) and partial least squares regression (PLSR) in R (R Development Core Team 2006b), and is freely available from the Comprehensive R Archive Network (CRAN), licensed under the GNU General Public License (GPL). The user interface is modelled after the traditional formula interface, as exemplified by lm. This was done so that people used to R would not have to learn yet another interface, and also because we believe the formula interface is a good way of working interactively with models. It thus has methods for generic functions like predict, update and coef. It also has more specialised functions like scores, loadings and RMSEP, and a flexible crossvalidation system. Visual inspection and assessment is important in chemometrics, and the pls package has a number of plot functions for plotting scores, loadings, predictions, coefficients and RMSEP estimates. The package implements PCR and several algorithms for PLSR. The design is modular, so that it should be easy to use the underlying algorithms in other functions. It is our hope that the package will serve well both for interactive data analysis and as a building block for other functions or packages using PLSR or PCR. We will here describe the package and how it is used for data analysis, as well as how it can be used as a part of other packages. Also included is a section about formulas and data frames, for people not used to the R modelling idioms.}
}

@article{michel_etal_2011,
  author   = {Michel, Jean-Baptiste and Shen, Yuan Kui and Aiden, Aviva Presser and Veres, Adrian and Gray, Matthew K. and {The Google Books Team} and Pickett, Joseph P. and Hoiberg, Dale and Clancy, Dan and Norvig, Peter and Orwant, Jon and Pinker, Steven and Nowak, Martin A. and Aiden, Erez Lieberman},
  title    = {Quantitative Analysis of Culture Using Millions of Digitized Books},
  journal  = {Science},
  volume   = {331},
  number   = {6014},
  pages    = {176--182},
  year     = {2011},
  doi      = {10.1126/science.1199644},
  url      = {https://www.science.org/doi/abs/10.1126/science.1199644},
  eprint   = {https://www.science.org/doi/pdf/10.1126/science.1199644},
  abstract = {Linguistic and cultural changes are revealed through the analyses of words appearing in books. We constructed a corpus of digitized texts containing about 4\% of all books ever printed. Analysis of this corpus enables us to investigate cultural trends quantitatively. We survey the vast terrain of ‘culturomics,’ focusing on linguistic and cultural phenomena that were reflected in the English language between 1800 and 2000. We show how this approach can provide insights about fields as diverse as lexicography, the evolution of grammar, collective memory, the adoption of technology, the pursuit of fame, censorship, and historical epidemiology. Culturomics extends the boundaries of rigorous quantitative inquiry to a wide array of new phenomena spanning the social sciences and the humanities.}
}

@inproceedings{mikolov_etal_2013,
  author    = {Mikolov, Tomas and Yih, Wen-tau and Zweig, Geoffrey},
  title     = {Linguistic Regularities in Continuous Space Word Representations},
  booktitle = {Proceedings of the 2013 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies},
  editor    = {Vanderwende, Lucy and Daum{\'e} III, Hal and Kirchhoff, Katrin},
  pages     = {746--751},
  publisher = {Association for Computational Linguistics},
  address   = {Atlanta, Georgia},
  year      = {2013},
  month     = jun,
  url       = {https://aclanthology.org/N13-1090}
}

@misc{mikolov_etal_2013b,
  author        = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  title         = {Efficient Estimation of Word Representations in Vector Space},
  year          = {2013},
  eprint        = {1301.3781},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@misc{mikolov_etal_2013c,
  author        = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  title         = {Distributed Representations of Words and Phrases and their Compositionality},
  year          = {2013},
  eprint        = {1310.4546},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@inproceedings{millet_etal_2022,
  title     = {Toward a realistic model of speech processing in the brain with self-supervised learning},
  author    = {Millet, Juliette and Caucheteux, Charlotte and Orhan, Pierre and Boubenec, Yves and Gramfort, Alexandre and Dunbar, Ewan and Pallier, Christophe and King, Jean-Remi},
  year      = {2022},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {35},
  pages     = {33428--33443},
  editor    = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},
  publisher = {Curran Associates, Inc.},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2022/file/d81ecfc8fb18e833a3fa0a35d92532b8-Paper-Conference.pdf}
}

@inproceedings{mohammad_2018,
  author    = {Mohammad, Saif M.},
  title     = {Obtaining Reliable Human Ratings of Valence, Arousal, and Dominance for 20,000 {English} Words},
  booktitle = {Proceedings of The Annual Conference of the Association for Computational Linguistics (ACL)},
  year      = {2018},
  address   = {Melbourne, Australia}
}

@inproceedings{mohammad_2018b,
  title     = {Word Affect Intensities},
  author    = {Mohammad, Saif M.},
  year      = {2018},
  booktitle = {Proceedings of the 11th Edition of the Language Resources and Evaluation Conference (LREC-2018)},
  address   = {Miyazaki, Japan}
}

@article{mohammad_kiritchenko_2015,
  author  = {Mohammad, Saif M. and Kiritchenko, Svetlana},
  title   = {Using Hashtags to Capture Fine Emotion Categories from Tweets},
  journal = {Computational Intelligence},
  volume  = {31},
  pages   = {301--326},
  year    = {2015},
  url     = {https://api.semanticscholar.org/CorpusID:2498838}
}

@article{mohammad_turney_2013,
  author  = {Mohammad, Saif M. and Turney, Peter D.},
  title   = {Crowdsourcing a Word-Emotion Association Lexicon},
  journal = {Computational Intelligence},
  volume  = {29},
  number  = {3},
  pages   = {436--465},
  year    = {2013}
}

@inproceedings{mohammad_turney_2010,
  author    = {Mohammad, Saif and Turney, Peter},
  title     = {Emotions Evoked by Common Words and Phrases: Using {M}echanical {T}urk to Create an Emotion Lexicon},
  booktitle = {Proceedings of the {NAACL} {HLT} 2010 Workshop on Computational Approaches to Analysis and Generation of Emotion in Text},
  pages     = {26--34},
  publisher = {Association for Computational Linguistics},
  address   = {Los Angeles, CA},
  year      = {2010},
  month     = jun,
  url       = {https://aclanthology.org/W10-0204}
}

@article{mooijman_etal_2018,
  author  = {Mooijman, Marlon and Hoover, Joe and Lin, Ying and Ji, Heng and Dehghani, Morteza},
  title   = {Moralization in social networks and the emergence of violence during protests},
  journal = {Nature Human Behaviour},
  volume  = {2},
  pages   = {389--396},
  year    = {2018},
  doi     = {10.1038/s41562-018-0353-0},
  url     = {https://doi.org/10.1038/s41562-018-0353-0}
}

@article{mosleh_etal_2021,
  author  = {Mosleh, Mohsen and Pennycook, Gordon and Arechar, Antonio},
  title   = {Cognitive reflection correlates with behavior on {Twitter}},
  journal = {Nature Communications},
  volume  = {12},
  year    = {2021},
  month   = feb,
  doi     = {10.1038/s41467-020-20043-0}
}

@article{moss_etal_2006,
  author     = {Moss, Jarrod and Kotovsky, Kenneth and Cagan, Jonathan},
  title      = {The {Role} of {Functionality} in the {Mental} {Representations} of {Engineering} {Students}: {Some} {Differences} in the {Early} {Stages} of {Expertise}},
  shorttitle = {The {Role} of {Functionality} in the {Mental} {Representations} of {Engineering} {Students}},
  journal    = {Cognitive Science},
  volume     = {30},
  number     = {1},
  pages      = {65--93},
  year       = {2006},
  issn       = {1551-6709},
  doi        = {10.1207/s15516709cog0000_45},
  url        = {https://onlinelibrary.wiley.com/doi/abs/10.1207/s15516709cog0000_45},
  urldate    = {2024-05-08},
  language   = {en},
  keywords   = {Engineering, Expertise, Psychology, Representation},
  abstract   = {As engineers gain experience and become experts in their domain, the structure and content of their knowledge changes. Two studies are presented that examine differences in knowledge representation among freshman and senior engineering students. The first study examines recall of mechanical devices and chunking of components, and the second examines whether seniors represent devices in a more abstract functional manner than do freshmen. The most prominent differences between these 2 groups involve their representation of the functioning of groups of electromechanical components and how these groups of components interact to produce device behavior. Seniors are better able to construct coherent representations of devices by focusing on the function of sets of components in the device. The findings from these studies highlight some ways in which the structure and content of mental representations of design knowledge differ during the early stages of expertise acquisition.},
  file       = {Full Text PDF:/Users/louisteitelbaum/Zotero/storage/MMYP9VZH/Moss et al. - 2006 - The Role of Functionality in the Mental Representa.pdf:application/pdf;Snapshot:/Users/louisteitelbaum/Zotero/storage/VUC3SGXM/s15516709cog0000_45.html:text/html},
}

@manual{mouselimis_2021,
  author = {Mouselimis, Lampros},
  title  = {{fastText}: Efficient Learning of Word Representations and Sentence Classification using R},
  note   = {R package version 1.0.3},
  year   = {2021},
  url    = {https://CRAN.R-project.org/package=fastText},
}

@article{munoz_iglesias_2022,
  author   = {Muñoz, Sergio and Iglesias, Carlos A.},
  title    = {A text classification approach to detect psychological stress combining a lexicon-based feature framework with distributional representations},
  journal  = {Information Processing \& Management},
  volume   = {59},
  number   = {5},
  pages    = {103011},
  year     = {2022},
  issn     = {0306-4573},
  doi      = {10.1016/j.ipm.2022.103011},
  url      = {https://www.sciencedirect.com/science/article/pii/S0306457322001212},
  keywords = {Stress detection, Stress framework, Distributional representations, Text classification, Affective computing},
  abstract = {Nowadays, stress has become a growing problem for society due to its high impact on individuals but also on health care systems and companies. In order to overcome this problem, early detection of stress is a key factor. Previous studies have shown the effectiveness of text analysis in the detection of sentiment, emotion, and mental illness. However, existing solutions for stress detection from text are focused on a specific corpus. There is still a lack of well-validated methods that provide good results in different datasets. We aim to advance state of the art by proposing a method to detect stress in textual data and evaluating it using multiple public English datasets. The proposed approach combines lexicon-based features with distributional representations to enhance classification performance. To help organize features for stress detection in text, we propose a lexicon-based feature framework that exploits affective, syntactic, social, and topic-related features. Also, three different word embedding techniques are studied for exploiting distributional representation. Our approach has been implemented with three machine learning models that have been evaluated in terms of performance through several experiments. This evaluation has been conducted using three public English datasets and provides a baseline for other researchers. The obtained results identify the combination of FastText embeddings with a selection of lexicon-based features as the best-performing model, achieving F-scores above 80\%.}
}

@misc{nielsen_2011,
  author        = {Nielsen, Finn Årup},
  title         = {A new {ANEW}: Evaluation of a word list for sentiment analysis in microblogs},
  year          = {2011},
  eprint        = {1103.2903},
  archivePrefix = {arXiv},
  primaryClass  = {cs.IR}
}

@article{nisbett_wilson_1977,
  author  = {Nisbett, Richard and Wilson, Timothy},
  title   = {Telling More Than We Can Know: Verbal Reports on Mental Processes},
  journal = {Psychological Review},
  volume  = {84},
  number  = {3},
  pages   = {231--259},
  year    = {1977},
  month   = may,
  doi     = {10.1037/0033-295X.84.3.231}
}

@article{oconnell_kowal_2003,
  author  = {O'Connell, Daniel and Kowal, Sabine},
  title   = {Psycholinguistics: A Half Century of Monologism},
  journal = {The American Journal of Psychology},
  volume  = {116},
  pages   = {191--212},
  year    = {2003},
  month   = feb,
  doi     = {10.2307/1423577}
}

@misc{oconnor_2012,
  author  = {O'Connor, Brendan},
  title   = {Cosine similarity, {Pearson} correlation, and {OLS} coefficients},
  journal = {AI and Social Science},
  year    = {2012},
  url     = {https://brenocon.com/blog/2012/03/cosine-similarity-pearson-correlation-and-ols-coefficients/},
  urldate = {2024-04-12}
}

@article{olteanu_etal_2016,
  author  = {Olteanu, Alexandra and Castillo, Carlos and Diaz, Fernando and Kiciman, Emre},
  title   = {Social Data: Biases, Methodological Pitfalls, and Ethical Boundaries},
  journal = {SSRN Electronic Journal},
  year    = {2016},
  month   = jan,
  doi     = {10.2139/ssrn.2886526}
}

@misc{oyama_etal_2023,
  author        = {Oyama, Momose and Yokoi, Sho and Shimodaira, Hidetoshi},
  title         = {Norm of Word Embedding Encodes Information Gain},
  eprint        = {2212.09663},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  year          = {2023}
}

@misc{pang_lee_2005,
  author        = {Pang, Bo and Lee, Lillian},
  title         = {Seeing stars: Exploiting class relationships for sentiment categorization with respect to rating scales},
  eprint        = {cs/0506075},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  year          = {2005}
}

@article{pennebaker_king_1999,
  author    = {Pennebaker, James W and King, Laura A},
  title     = {Linguistic styles: language use as an individual difference.},
  journal   = {Journal of Personality and Social Psychology},
  volume    = {77},
  number    = {6},
  pages     = {1296},
  year      = {1999},
  publisher = {American Psychological Association},
  url       = {https://doi.org/10.1037//0022-3514.77.6.1296}
}

@inproceedings{pennington_etal_2014,
  author    = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher D.},
  title     = {{GloVe}: Global Vectors for Word Representation},
  booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
  pages     = {1532--1543},
  year      = {2014},
  url       = {http://www.aclweb.org/anthology/D14-1162},
}

@article{peterson_seligman_1984,
  author    = {Peterson, Christopher and Seligman, Martin E},
  title     = {Causal explanations as a risk factor for depression: Theory and evidence},
  journal   = {Psychological Review},
  volume    = {91},
  number    = {3},
  pages     = {347--374},
  year      = {1984},
  publisher = {American Psychological Association},
  address   = {Washington, DC},
  issn      = {0033-295X},
  language  = {eng},
  copyright = {1984 American Psychological Association},
  abstract  = {The attributional reformulation of the learned helplessness model as outlined by L. Y. Abramson et al claims that an explanatory style in which bad events are explained by internal, stable, and global causes is associated with depressive symptoms. This style is claimed to be a risk factor for subsequent depression when bad events are encountered. A variety of new investigations of the helplessness reformulation are described that have employed 5 research strategies: cross-sectional correlational studies, longitudinal studies, experiments of nature, laboratory experiments, and case studies. Ss in these investigations included children, college students, poor women, depressed patients, and prisoners. Most of these studies involved the use of the Attributional Style Questionnaire and measures such as the Beck Depression Inventory and Multiple Affect Adjective Check List. These studies converge in their support for the learned helplessness reformulation. (120 ref)},
}

@article{placinski_zywiczynski_2023,
  author   = {Placiński, Marek and Żywiczyński, Przemysław},
  title    = {Modality effect in interactive alignment: Differences between spoken and text-based conversation},
  journal  = {Lingua},
  volume   = {293},
  pages    = {103592},
  year     = {2023},
  issn     = {0024-3841},
  doi      = {10.1016/j.lingua.2023.103592},
  url      = {https://www.sciencedirect.com/science/article/pii/S002438412300116X},
  keywords = {Interactive alignment, Computer-mediated communication, Priming, Processing effort},
  abstract = {A distinctive feature of dialogue is the convergence of linguistic choices. The Interactive Alignment Model posits that this convergence is obtained by a mechanism of priming that operates at all levels of linguistic representation. Under the model, priming is supposed to foster mutual understanding. Experimental research has confirmed that priming increases language production and comprehension ease. However, the influence of constraints caused by the medium of communication on alignment remains unexplored. In this paper, we look at structural alignment in face-to-face and synchronous text-based communication. In our study, we analysed the influence of different factors on structural alignment in two datasets, one of computer-mediated conversation and the other of spoken conversations, and found that the likelihood of structural alignment in both cases is determined by the same factors. However, when comparing the overall magnitude of structural alignment in the two datasets, we found a greater magnitude of structural alignment in text-based conversations. In our view, this result suggests that structural alignment is used to trade off the constraints of the text-based medium.}
}

@book{plutchik_1962,
  title     = {The Emotions},
  author    = {Plutchik, Robert},
  publisher = {Random House},
  address   = {New York},
  year      = {1962},
  isbn      = {0819182869}
}


@article{poldrack_etal_2012,
  author    = {Poldrack, Russell A. and Mumford, Jeanette A. and Schonberg, Tom and Kalar, Donald and Barman, Bishal and Yarkoni, Tal},
  title     = {Discovering {Relations} {Between} {Mind}, {Brain}, and {Mental} {Disorders} {Using} {Topic} {Mapping}},
  journal   = {PLOS Computational Biology},
  publisher = {Public Library of Science},
  volume    = {8},
  number    = {10},
  pages     = {e1002707},
  year      = {2012},
  month     = oct,
  issn      = {1553-7358},
  doi       = {10.1371/journal.pcbi.1002707},
  url       = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1002707},
  urldate   = {2024-05-08},
  language  = {en},
  keywords  = {Autism, Clinical psychology, Cognitive psychology, Emotions, Language, Memory recall, Neuroimaging, Schizophrenia},
  abstract  = {Neuroimaging research has largely focused on the identification of associations between brain activation and specific mental functions. Here we show that data mining techniques applied to a large database of neuroimaging results can be used to identify the conceptual structure of mental functions and their mapping to brain systems. This analysis confirms many current ideas regarding the neural organization of cognition, but also provides some new insights into the roles of particular brain systems in mental function. We further show that the same methods can be used to identify the relations between mental disorders. Finally, we show that these two approaches can be combined to empirically identify novel relations between mental disorders and mental functions via their common involvement of particular brain networks. This approach has the potential to discover novel endophenotypes for neuropsychiatric disorders and to better characterize the structure of these disorders and the relations between them.},
  file      = {Full Text PDF:/Users/louisteitelbaum/Zotero/storage/ZHZLPRGH/Poldrack et al. - 2012 - Discovering Relations Between Mind, Brain, and Men.pdf:application/pdf},
}

@article{poria_etal_2014,
  author  = {Poria, Soujanya and Cambria, Erik and Winterstein, Grégoire and Huang, Guang-Bin},
  title   = {Sentic patterns: Dependency-based rules for concept-level sentiment analysis},
  journal = {Knowledge-Based Systems},
  volume  = {69},
  year    = {2014},
  month   = oct,
  doi     = {10.1016/j.knosys.2014.05.005}
}

@article{porter_1980,
  author    = {Porter, Martin F},
  title     = {An algorithm for suffix stripping},
  journal   = {Program},
  publisher = {MCB UP Ltd},
  year      = {1980},
  volume    = {14},
  number    = {3},
  pages     = {130--137}
}

@article{proferes_etal_2021,
  author   = {Proferes, Nicholas and Jones, Naiyan and Gilbert, Sarah and Fiesler, Casey and Zimmer, Michael},
  title    = {Studying {Reddit}: A Systematic Overview of Disciplines, Approaches, Methods, and Ethics},
  journal  = {Social Media + Society},
  volume   = {7},
  number   = {2},
  pages    = {20563051211019004},
  year     = {2021},
  doi      = {10.1177/20563051211019004},
  url      = {https://doi.org/10.1177/20563051211019004},
  abstract = {This article offers a systematic analysis of 727 manuscripts that used Reddit as a data source, published between 2010 and 2020. Our analysis reveals the increasing growth in use of Reddit as a data source, the range of disciplines this research is occurring in, how researchers are getting access to Reddit data, the characteristics of the datasets researchers are using, the subreddits and topics being studied, the kinds of analysis and methods researchers are engaging in, and the emerging ethical questions of research in this space. We discuss how researchers need to consider the impact of Reddit’s algorithms, affordances, and generalizability of the scientific knowledge produced using Reddit data, as well as the potential ethical dimensions of research that draws data from subreddits with potentially sensitive populations.}
}

@article{rao_etal_2014,
  author    = {Rao, Yanghui and Lei, Jingsheng and Wenyin, Liu and Li, Qing and Chen, Mingliang},
  title     = {Building emotional dictionary for sentiment analysis of online news},
  journal   = {World wide web (Bussum)},
  volume    = {17},
  number    = {4},
  pages     = {723--742},
  year      = {2014},
  publisher = {Springer US},
  address   = {Boston},
  issn      = {1386-145X},
  language  = {eng},
  copyright = {Springer Science+Business Media New York 2013},
  keywords  = {Computer science ; Database management ; Construction ; Dictionaries ; News ; Strategy ; Data mining ; Emotions ; Sentiment analysis ; Web 2.0 ; Social media ; Information retrieval ; Microblogs ; Natural Language Processing ; Artificial intelligence},
  abstract  = {Sentiment analysis of online documents such as news articles, blogs and microblogs has received increasing attention in recent years. In this article, we propose an efficient algorithm and three pruning strategies to automatically build a word-level emotional dictionary for social emotion detection. In the dictionary, each word is associated with the distribution on a series of human emotions. In addition, a method based on topic modeling is proposed to construct a topic-level dictionary, where each topic is correlated with social emotions. Experiment on the real-world data sets has validated the effectiveness and reliability of the methods. Compared with other lexicons, the dictionary generated using our approach is language-independent, fine-grained, and volume-unlimited. The generated dictionary has a wide range of applications, including predicting the emotional distribution of news articles, identifying social emotions on certain entities and news events.},
}

@inproceedings{reif_etal_2019,
  author    = {Reif, Emily and Yuan, Ann and Wattenberg, Martin and Viegas, Fernanda B and Coenen, Andy and Pearce, Adam and Kim, Been},
  title     = {Visualizing and Measuring the Geometry of {BERT}},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {Wallach, H. and Larochelle, H. and Beygelzimer, A. and {d'Alch{\'e}-Buc}, F. and Fox, E. and Garnett, R.},
  publisher = {Curran Associates, Inc.},
  volume    = {32},
  year      = {2019},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2019/file/159c1ffe5b61b41b3c4d8f4c2150f6c4-Paper.pdf}
}

@inproceedings{reimers_gurevych_2019,
  author    = {Reimers, Nils and Gurevych, Iryna},
  title     = {Sentence-{BERT}: Sentence Embeddings using {S}iamese {BERT}-Networks},
  editor    = {Inui, Kentaro and Jiang, Jing and Ng, Vincent and Wan, Xiaojun},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  publisher = {Association for Computational Linguistics},
  address   = {Hong Kong, China},
  pages     = {3982--3992},
  year      = {2019},
  month     = nov,
  doi       = {10.18653/v1/D19-1410},
  url       = {https://aclanthology.org/D19-1410},
  abstract  = {BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has set a new state-of-the-art performance on sentence-pair regression tasks like semantic textual similarity (STS). However, it requires that both sentences are fed into the network, which causes a massive computational overhead: Finding the most similar pair in a collection of 10,000 sentences requires about 50 million inference computations ({\textasciitilde}65 hours) with BERT. The construction of BERT makes it unsuitable for semantic similarity search as well as for unsupervised tasks like clustering. In this publication, we present Sentence-BERT (SBERT), a modification of the pretrained BERT network that use siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine-similarity. This reduces the effort for finding the most similar pair from 65 hours with BERT / RoBERTa to about 5 seconds with SBERT, while maintaining the accuracy from BERT. We evaluate SBERT and SRoBERTa on common STS tasks and transfer learning tasks, where it outperforms other state-of-the-art sentence embeddings methods.},
}

@article{reitter_moore_2014,
  author   = {Reitter, David and Moore, Johanna D.},
  title    = {Alignment and task success in spoken dialogue},
  journal  = {Journal of Memory and Language},
  volume   = {76},
  pages    = {29--46},
  year     = {2014},
  issn     = {0749-596X},
  doi      = {10.1016/j.jml.2014.05.008},
  url      = {https://www.sciencedirect.com/science/article/pii/S0749596X14000576},
  keywords = {Dialogue, Interactive alignment, Syntactic priming, Structural priming, Entrainment, Task success},
  abstract = {Task-solving in dialogue depends on the convergence of the situation models held by the dialogue partners. The Interactive Alignment Model (Pickering \& Garrod, 2004) suggests that this convergence is the result of an interactive alignment process, which is based on mechanistic repetition at a number of linguistic levels. In this paper, we develop two predictions arising from the theory, along with two methods to quantify the known structural priming effects in the full inventory of syntactic choices found in text and speech corpora. (a) Under a rational perspective, we expect increased repetition in task-oriented dialogue compared to spontaneous conversation. We find within- and between-speaker priming in a corpus of spontaneous conversations, but stronger priming in task-oriented dialogue. (b) The Interactive Alignment Model predicts linguistic adaptation to be correlated with task success. We show this effect in a corpus of task-oriented dialogue, where we find a positive correlation of long-term adaptation and a quantifiable task success measure. We argue that the repetition tendency relevant for the high-level alignment of situation models is based on slow adaptation rather than short-term priming. We demonstrate that lexical and syntactic repetition are reliable and computationally exploitable predictors of task success.}
}

@misc{rosario_2001,
  author     = {Rosario, Barbara},
  title      = {Latent {Semantic} {Indexing}: {An} {Overview}},
  shorttitle = {Latent {Semantic} {Indexing}},
  note       = {INFOSYS 240 Spring 2000 Final Paper},
  year       = {2001},
  url        = {https://www.semanticscholar.org/paper/Latent-Semantic-Indexing-%3A-An-Overview-1-Latent-%3A-Rosario/95981f057cb76a24329fcf2b572f75d8c2b1613e},
  urldate    = {2024-05-07},
  abstract   = {Typically, information is retrieved by literally matching terms in documents with those of a query. However, lexical matching methods can be inaccurate when they are used to match a user's query. Since there are usually many ways to express a given concept (synonymy), the literal terms in a user's query may not match those of a relevant document. In addition, most words have multiple meanings (polysemy), so terms in a user's query will literally match terms in irrelevant documents. A better approach would allow users to retrieve information on the basis of a conceptual topic or meaning of a document. Latent Semantic Indexing (LSI) [Deerwester et al] tries to overcome the problems of lexical matching by using statistically derived conceptual indices instead of individual words for retrieval. LSI assumes that there is some underlying or latent structure in word usage that is partially obscured by variability in word choice. A truncated singular value decomposition (SVD) is used to estimate the structure in word usage across documents. Retrieval is then performed using the database of singular values and vectors obtained from the truncated SVD. Performance data shows that these statistically derived vectors are more robust indicators of meaning than individual terms. Section 2 is a review of basic concepts needed to understand LSI. In Section 3, a description of some of the advantages and disadvantages of LSI. The effectiveness of LSI has been demonstrated empirically in several text collections as increased average retrieval precision but a theoretical (and quantitative) understanding beyond empirical evidence is desirable. Section 4 describes some of the attempts that have been done in this direction. Finally, in Section 5 some applications of LSI.},
  annote     = {[TLDR] Latent Semantic Indexing (LSI) tries to overcome the problems of lexical matching by using statistically derived conceptual indices instead of individual words for retrieval, and shows that these statistically derived vectors are more robust indicators of meaning than individual terms.},
}

@article{rosenbusch_etal_2019,
  author   = {Rosenbusch, Hannes and Evans, Anthony M. and Zeelenberg, Marcel},
  title    = {Multilevel Emotion Transfer on {YouTube}: Disentangling the Effects of Emotional Contagion and Homophily on Video Audiences},
  journal  = {Social Psychological and Personality Science},
  volume   = {10},
  number   = {8},
  pages    = {1028--1035},
  year     = {2019},
  doi      = {10.1177/1948550618820309},
  url      = {https://doi.org/10.1177/1948550618820309},
  abstract = {Why do connected users in online social networks express similar emotions? Past approaches have suggested situational emotion transfers (i.e., contagion) and the phenomenon that emotionally similar users flock together (i.e., homophily). We analyze these mechanisms in unison by exploiting the hierarchical structure of YouTube through multilevel analyses, disaggregating the video- and channel-level effects of YouTuber emotions on audience comments. Dictionary analyses using the National Research Council emotion lexica were used to measure the emotions expressed in videos and user comments from 2,083 YouTube vlogs selected from 110 vloggers. We find that video- and channel-level emotions independently influence audience emotions, providing evidence for both contagion and homophily effects. Random slope models suggest that contagion strength varies between YouTube channels for some emotions. However, neither average channel-level emotions nor number of subscribers significantly moderate the strength of contagion effects. The present study highlights that multiple, independent mechanisms shape emotions in online social networks.}
}

@misc{sanh_etal_2020,
  author        = {Sanh, Victor and Debut, Lysandre and Chaumond, Julien and Wolf, Thomas},
  title         = {{DistilBERT}, a distilled version of {BERT}: smaller, faster, cheaper and lighter},
  year          = {2020},
  eprint        = {1910.01108},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@article{sap_etal_2022,
  author   = {Sap, Maarten and Jafarpour, Anna and Choi, Yejin and Smith, Noah A. and Pennebaker, James W. and Horvitz, Eric},
  title    = {Quantifying the narrative flow of imagined versus autobiographical stories},
  journal  = {Proceedings of the National Academy of Sciences},
  volume   = {119},
  number   = {45},
  pages    = {e2211715119},
  year     = {2022},
  doi      = {10.1073/pnas.2211715119},
  url      = {https://www.pnas.org/doi/abs/10.1073/pnas.2211715119},
  abstract = {Lifelong experiences and learned knowledge lead to shared expectations about how common situations tend to unfold. Such knowledge of narrative event flow enables people to weave together a story. However, comparable computational tools to evaluate the flow of events in narratives are limited. We quantify the differences between autobiographical and imagined stories by introducing sequentiality, a measure of narrative flow of events, drawing probabilistic inferences from a cutting-edge large language model (GPT-3). Sequentiality captures the flow of a narrative by comparing the probability of a sentence with and without its preceding story context. We applied our measure to study thousands of diary-like stories, collected from crowdworkers, about either a recent remembered experience or an imagined story on the same topic. The results show that imagined stories have higher sequentiality than autobiographical stories and that the sequentiality of autobiographical stories increases when the memories are retold several months later. In pursuit of deeper understandings of how sequentiality measures the flow of narratives, we explore proportions of major and minor events in story sentences, as annotated by crowdworkers. We find that lower sequentiality is associated with higher proportions of major events. The methods and results highlight opportunities to use cutting-edge computational analyses, such as sequentiality, on large corpora of matched imagined and autobiographical stories to investigate the influences of memory and reasoning on language generation processes.}
}

@inproceedings{sap_etal_2020,
  author    = {Sap, Maarten and Horvitz, Eric and Choi, Yejin and Smith, Noah A. and Pennebaker, James},
  title     = {Recollection versus Imagination: Exploring Human Memory and Cognition via Neural Language Models},
  editor    = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  pages     = {1970--1978},
  year      = {2020},
  month     = jul,
  doi       = {10.18653/v1/2020.acl-main.178},
  url       = {https://aclanthology.org/2020.acl-main.178},
  abstract  = {We investigate the use of NLP as a measure of the cognitive processes involved in storytelling, contrasting imagination and recollection of events. To facilitate this, we collect and release Hippocorpus, a dataset of 7,000 stories about imagined and recalled events. We introduce a measure of narrative flow and use this to examine the narratives for imagined and recalled events. Additionally, we measure the differential recruitment of knowledge attributed to semantic memory versus episodic memory (Tulving, 1972) for imagined and recalled storytelling by comparing the frequency of descriptions of general commonsense events with more specific realis events. Our analyses show that imagined stories have a substantially more linear narrative flow, compared to recalled stories in which adjacent sentences are more disconnected. In addition, while recalled stories rely more on autobiographical events based on episodic memory, imagined stories express more commonsense knowledge based on semantic memory. Finally, our measures reveal the effect of narrativization of memories in stories (e.g., stories about frequently recalled memories flow more linearly; Bartlett, 1932). Our findings highlight the potential of using NLP tools to study the traces of human cognition in language.},
}

@misc{schakel_wilson_2015,
  author        = {Schakel, Adriaan M. J. and Wilson, Benjamin J.},
  title         = {Measuring Word Significance using Distributed Representations of Words},
  eprint        = {1508.02297},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  year          = {2015}
}

% NOTE(review): inproceedings entries require a booktitle, and none is recorded here -- TODO add the proceedings title.
@inproceedings{schler_etal_2006,
  author = {Schler, Jonathan and Koppel, Moshe and Argamon, Shlomo and Pennebaker, James},
  title  = {Effects of Age and Gender on Blogging},
  pages  = {199--205},
  year   = {2006},
  month  = jan
}

@article{schwartz_etal_2013,
  author    = {Schwartz, H. Andrew and Eichstaedt, Johannes C. and Kern, Margaret L. and Dziurzynski, Lukasz and Ramones, Stephanie M. and Agrawal, Megha and Shah, Achal and Kosinski, Michal and Stillwell, David and Seligman, Martin E. P. and Ungar, Lyle H.},
  title     = {Personality, Gender, and Age in the Language of Social Media: The Open-Vocabulary Approach},
  journal   = {PLOS ONE},
  publisher = {Public Library of Science},
  volume    = {8},
  number    = {9},
  pages     = {1--16},
  year      = {2013},
  month     = sep,
  doi       = {10.1371/journal.pone.0073791},
  url       = {https://doi.org/10.1371/journal.pone.0073791},
  abstract  = {We analyzed 700 million words, phrases, and topic instances collected from the Facebook messages of 75,000 volunteers, who also took standard personality tests, and found striking variations in language with personality, gender, and age. In our open-vocabulary technique, the data itself drives a comprehensive exploration of language that distinguishes people, finding connections that are not captured with traditional closed-vocabulary word-category analyses. Our analyses shed new light on psychosocial processes yielding results that are face valid (e.g., subjects living in high elevations talk about the mountains), tie in with other research (e.g., neurotic people disproportionately use the phrase ‘sick of’ and the word ‘depressed’), suggest new hypotheses (e.g., an active life implies emotional stability), and give detailed insights (males use the possessive ‘my’ when mentioning their ‘wife’ or ‘girlfriend’ more often than females use ‘my’ with ‘husband’ or 'boyfriend’). To date, this represents the largest study, by an order of magnitude, of language and personality.},
}

@article{schwartz_ungar_2015,
  author   = {Schwartz, H. Andrew and Ungar, Lyle H.},
  title    = {Data-Driven Content Analysis of Social Media: A Systematic Overview of Automated Methods},
  journal  = {The ANNALS of the American Academy of Political and Social Science},
  volume   = {659},
  number   = {1},
  pages    = {78--94},
  year     = {2015},
  doi      = {10.1177/0002716215569197},
  url      = {https://doi.org/10.1177/0002716215569197},
  abstract = {Researchers have long measured people’s thoughts, feelings, and personalities using carefully designed survey questions, which are often given to a relatively small number of volunteers. The proliferation of social media, such as Twitter and Facebook, offers alternative measurement approaches: automatic content coding at unprecedented scales and the statistical power to do open-vocabulary exploratory analysis. We describe a range of automatic and partially automatic content analysis techniques and illustrate how their use on social media generates insights into subjective well-being, health, gender differences, and personality.}
}

@inproceedings{shapira_etal_2021,
  author    = {Shapira, Natalie
    and Atzil-Slonim, Dana
    and Juravski, Daniel
    and Baruch, Moran
    and Stolowicz-Melman, Dana
    and Paz, Adar
    and Alfi-Yogev, Tal
    and Azoulay, Roy
    and Singer, Adi
    and Revivo, Maayan
    and Dahbash, Chen
    and Dayan, Limor
    and Naim, Tamar
    and Gez, Lidar
    and Yanai, Boaz
    and Maman, Adva
    and Nadaf, Adam
    and Sarfati, Elinor
    and Baloum, Amna
    and Naor, Tal
    and Mosenkis, Ephraim
    and Sarsour, Badreya
    and Gelfand Morgenshteyn, Jany
    and Elias, Yarden
    and Braun, Liat
    and Rubin, Moria
    and Kenigsbuch, Matan
    and Bergwerk, Noa
    and Yosef, Noam
    and Peled, Sivan
    and Avigdor, Coral
    and Obercyger, Rahav
    and Mann, Rachel
    and Alper, Tomer
    and Beka, Inbal
    and Shapira, Ori
    and Goldberg, Yoav},
  title     = {{H}ebrew Psychological Lexicons},
  editor    = {Goharian, Nazli
    and Resnik, Philip
    and Yates, Andrew
    and Ireland, Molly
    and Niederhoffer, Kate
    and Resnik, Rebecca},
  booktitle = {Proceedings of the Seventh Workshop on Computational Linguistics and Clinical Psychology: Improving Access},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jun,
  year      = {2021},
  pages     = {55--69},
  doi       = {10.18653/v1/2021.clpsych-1.6},
  url       = {https://aclanthology.org/2021.clpsych-1.6},
  abstract  = {We introduce a large set of Hebrew lexicons pertaining to psychological aspects. These lexicons are useful for various psychology applications such as detecting emotional state, well being, relationship quality in conversation, identifying topics (e.g., family, work) and many more. We discuss the challenges in creating and validating lexicons in a new language, and highlight our methodological considerations in the data-driven lexicon construction process. Most of the lexicons are publicly available, which will facilitate further research on Hebrew clinical psychology text analysis. The lexicons were developed through data driven means, and verified by domain experts, clinical psychologists and psychology students, in a process of reconciliation with three judges. Development and verification relied on a dataset of a total of 872 psychotherapy session transcripts. We describe the construction process of each collection, the final resource and initial results of research studies employing this resource.}
}

@article{simchon_etal_2020,
  author  = {Simchon, Almog and Guntuku, Sharath Chandra and Simhon, Rotem and Ungar, Lyle H. and Hassin, Ran R. and Gilead, Michael},
  title   = {Political depression? {A} big-data, multimethod investigation of {Americans'} emotional response to the {Trump} presidency},
  journal = {Journal of Experimental Psychology: General},
  year    = {2020},
  doi     = {10.1037/xge0000767},
  url     = {https://doi.org/10.1037/xge0000767}
}

@article{simchon_etal_2021,
  author   = {Simchon, Almog and Turkin, Chaya and Svoray, Tal and Kloog, Itai and Dorman, Michael and Gilead, Michael},
  title    = {Beyond doubt in a dangerous world: The effect of existential threats on the certitude of societal discourse},
  journal  = {Journal of Experimental Social Psychology},
  volume   = {97},
  pages    = {104221},
  year     = {2021},
  issn     = {0022-1031},
  doi      = {10.1016/j.jesp.2021.104221},
  url      = {https://www.sciencedirect.com/science/article/pii/S0022103121001244},
  keywords = {Big data, Terror management theory, Emotion, Social discourse, Motivated reasoning},
  abstract = {What happens when entire populations are exposed to news of impending existential threats? In the current study, we address this question by investigating the association between existential threats and the certitude of societal discourse. According to appraisal theory, threats give rise to anxiety and perceptions of uncertainty; as such, it predicts that exposure to life-threatening events will increase expressions of uncertainty. An alternative possibility is that people will respond to threats by utilizing psychological compensation mechanisms that will give rise to greater expressions of certainty. Across two studies, we measured linguistic certainty in more than 3.2 million tweets, covering different psychological contexts: (i) the 15 major terrorist and school shooting events that took place between 2016 and 2018; (ii) the COVID-19 pandemic. Consistent with the idea of compensatory processing, the results show that levels of expressed certainty increased following intentional and natural existential threats. We discuss the implications of our findings to theories of psychological compensation and to our understanding of collective response in the age of global threats.}
}

@article{simchon_etal_2023,
  author    = {Simchon, Almog and Hadar, Britt and Gilead, Michael},
  title     = {A computational text analysis investigation of the relation between personal and linguistic agency},
  journal   = {Communications Psychology},
  publisher = {Springer Nature},
  year      = {2023},
  month     = sep,
  day       = {25},
  pages     = {1--9},
  issn      = {2731-9121},
  doi       = {10.1038/s44271-023-00020-1},
  language  = {English},
  abstract  = {Previous psycholinguistic findings showed that linguistic framing – such as the use of passive voice – influences the level of agency attributed to other people. To investigate whether passive voice use relates to people{\textquoteright}s personal sense of agency, we conducted three studies in which we analyzed existing experimental and observational data. In Study 1 (N = 835) we show that sense of personal agency, operationalized between participants as recalling instances of having more or less power over others, affects the use of agentive language. In Study 2 (N = 2.7 M) we show that increased personal agency (operationalized as one{\textquoteright}s social media followership) is associated with more agentive language. In Study 3 and its two replications (N = 43,140) we demonstrate using Reddit data that the language of individuals who post on the r/depression subreddit is less agentive. Together, these findings advance our understanding of the nuanced relationship between personal and linguistic agency.}
}

@inproceedings{song_etal_2015,
  author    = {Song, Kaisong and Feng, Shi and Gao, Wei and Wang, Daling and Chen, Ling and Zhang, Chengqi},
  title     = {Build Emotion Lexicon from Microblogs by Combining Effects of Seed Words and Emoticons in a Heterogeneous Graph},
  booktitle = {Proceedings of the 26th ACM Conference on Hypertext \& Social Media},
  series    = {HT '15},
  location  = {Guzelyurt, Northern Cyprus},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2015},
  pages     = {283--292},
  numpages  = {10},
  isbn      = {9781450333955},
  doi       = {10.1145/2700171.2791035},
  url       = {https://doi.org/10.1145/2700171.2791035},
  keywords  = {emoticon, emotion lexicon, heterogeneous graph, microblogs, seed word},
  abstract  = {As an indispensable resource for emotion analysis, emotion lexicons have attracted increasing attention in recent years. Most existing methods focus on capturing the single emotional effect of words rather than the emotion distributions which are helpful to model multiple complex emotions in a subjective text. Meanwhile, automatic lexicon building methods are overly dependent on seed words but neglect the effect of emoticons which are natural graphical labels of fine-grained emotion. In this paper, we propose a novel emotion lexicon building framework that leverages both seed words and emoticons simultaneously to capture emotion distributions of candidate words more accurately. Our method overcomes the weakness of existing methods by combining the effects of both seed words and emoticons in a unified three-layer heterogeneous graph, in which a multi-label random walk (MLRW) algorithm is performed to strengthen the emotion distribution estimation. Experimental results on real-world data reveal that our constructed emotion lexicon achieves promising results for emotion classification compared to the state-of-the-art lexicons.}
}

@article{sparckjones_1972,
  author   = {Sparck Jones, Karen},
  title    = {A statistical interpretation of term specificity and its application in retrieval},
  journal  = {Journal of Documentation},
  volume   = {28},
  number   = {1},
  pages    = {11--21},
  month    = jan,
  year     = {1972},
  issn     = {0022-0418},
  doi      = {10.1108/eb026526},
  url      = {https://www.emerald.com/insight/content/doi/10.1108/eb026526/full/html},
  urldate  = {2024-04-08},
  language = {en},
  abstract = {The exhaustivity of document descriptions and the specificity of index terms are usually regarded as independent. It is suggested that specificity should be interpreted statistically, as a function of term use rather than of term meaning. The effects on retrieval of variations in term specificity are examined, experiments with three test collections showing in particular that frequently‐occurring terms are required for good overall performance. It is argued that terms should be weighted according to collection frequency, so that matches on less frequent, more specific, terms are of greater value than matches on frequent terms. Results for the test collections show that considerable improvements in performance are obtained with this very simple procedure.}
}

@article{stadthagen_davis_2006,
  author    = {Stadthagen-Gonzalez, Hans and Davis, Colin J.},
  title     = {The {Bristol} norms for age of acquisition, imageability, and familiarity},
  journal   = {Behavior Research Methods},
  volume    = {38},
  number    = {4},
  pages     = {598--605},
  year      = {2006},
  publisher = {Springer}
}

@misc{stillwell_kosinski_2015,
  author = {Stillwell, D. J. and Kosinski, M.},
  title  = {{myPersonality} {Project} website},
  year   = {2015}
}

@inproceedings{strapparava_mihalcea_2007,
  author    = {Strapparava, Carlo and Mihalcea, Rada},
  title     = {{S}em{E}val-2007 Task 14: Affective Text},
  editor    = {Agirre, Eneko and M{\`a}rquez, Llu{\'\i}s and Wicentowski, Richard},
  booktitle = {Proceedings of the Fourth International Workshop on Semantic Evaluations ({S}em{E}val-2007)},
  publisher = {Association for Computational Linguistics},
  address   = {Prague, Czech Republic},
  month     = jun,
  year      = {2007},
  pages     = {70--74},
  url       = {https://aclanthology.org/S07-1013}
}

@inproceedings{strapparava_valitutti_2004,
  author    = {Strapparava, Carlo and Valitutti, Alessandro},
  title     = {{WordNet-Affect}: An Affective Extension of {WordNet}},
  booktitle = {Proceedings of the Fourth International Conference on Language Resources and Evaluation ({LREC} 2004)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Lisbon, Portugal},
  pages     = {1083--1086},
  year      = {2004}
}

@article{sumner_etal_2011,
  author  = {Sumner, Chris and Byers, Alison and Shearing, Matthew},
  title   = {Determining personality traits \& privacy concerns from {Facebook} activity},
  journal = {Black Hat brief},
  volume  = {11},
  month   = jan,
  year    = {2011}
}

@article{takhteyev_etal_2012,
  author   = {Takhteyev, Yuri and Gruzd, Anatoliy and Wellman, Barry},
  title    = {Geography of {Twitter} networks},
  journal  = {Social Networks},
  volume   = {34},
  number   = {1},
  pages    = {73--81},
  year     = {2012},
  note     = {Capturing Context: Integrating Spatial and Social Network Analyses},
  issn     = {0378-8733},
  doi      = {10.1016/j.socnet.2011.05.006},
  url      = {https://www.sciencedirect.com/science/article/pii/S0378873311000359},
  keywords = {Social networks, Distance, Proximity, Nation-states, Language, Air travel, Twitter},
  abstract = {The paper examines the influence of geographic distance, national boundaries, language, and frequency of air travel on the formation of social ties on Twitter, a popular micro-blogging website. Based on a large sample of publicly available Twitter data, our study shows that a substantial share of ties lies within the same metropolitan region, and that between regional clusters, distance, national borders and language differences all predict Twitter ties. We find that the frequency of airline flights between the two parties is the best predictor of Twitter ties. This highlights the importance of looking at pre-existing ties between places and people.}
}

@article{tausczik_pennebaker_2010,
  author   = {Tausczik, Yla R. and Pennebaker, James W.},
  title    = {The Psychological Meaning of Words: {LIWC} and Computerized Text Analysis Methods},
  journal  = {Journal of Language and Social Psychology},
  volume   = {29},
  number   = {1},
  pages    = {24--54},
  year     = {2010},
  doi      = {10.1177/0261927X09351676},
  url      = {https://doi.org/10.1177/0261927X09351676},
  abstract = {We are in the midst of a technological revolution whereby, for the first time, researchers can link daily word use to a broad array of real-world behaviors. This article reviews several computerized text analysis methods and describes how Linguistic Inquiry and Word Count (LIWC) was created and validated. LIWC is a transparent text analysis program that counts words in psychologically meaningful categories. Empirical results using LIWC demonstrate its ability to detect meaning in a wide variety of experimental settings, including to show attentional focus, emotionality, social relationships, thinking styles, and individual differences.}
}

@inproceedings{torabi-asr_etal_2018,
  author    = {Torabi Asr, Fatemeh and Zinkov, Robert and Jones, Michael},
  title     = {Querying Word Embeddings for Similarity and Relatedness},
  editor    = {Walker, Marilyn and Ji, Heng and Stent, Amanda},
  booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana},
  month     = jun,
  year      = {2018},
  pages     = {675--684},
  doi       = {10.18653/v1/N18-1062},
  url       = {https://aclanthology.org/N18-1062},
  abstract  = {Word embeddings obtained from neural network models such as Word2Vec Skipgram have become popular representations of word meaning and have been evaluated on a variety of word similarity and relatedness norming data. Skipgram generates a set of word and context embeddings, the latter typically discarded after training. We demonstrate the usefulness of context embeddings in predicting asymmetric association between words from a recently published dataset of production norms (Jouravlev {\&} McRae, 2016). Our findings suggest that humans respond with words closer to the cue within the context embedding space (rather than the word embedding space), when asked to generate thematically related words.}
}

@misc{trager_etal_2022,
  author        = {Jackson Trager
    and Alireza S. Ziabari
    and Aida Mostafazadeh Davani
    and Preni Golazazian
    and Farzan Karimi-Malekabadi
    and Ali Omrani
    and Zhihe Li
    and Brendan Kennedy
    and Nils Karl Reimer
    and Melissa Reyes
    and Kelsey Cheng
    and Mellow Wei
    and Christina Merrifield
    and Arta Khosravi
    and Evans Alvarez
    and Morteza Dehghani},
  title         = {The Moral Foundations Reddit Corpus},
  year          = {2022},
  eprint        = {2208.05545},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

@article{tufekci_2014,
  author  = {Tufekci, Zeynep},
  title   = {Big Questions for Social Media Big Data: Representativeness, Validity and Other Methodological Pitfalls},
  journal = {Proceedings of the 8th International Conference on Weblogs and Social Media, ICWSM 2014},
  volume  = {8},
  number  = {1},
  month   = mar,
  year    = {2014},
  doi     = {10.1609/icwsm.v8i1.14517}
}

@article{tversky_gati_1982,
  author  = {Tversky, Amos and Gati, Itamar},
  title   = {Similarity, separability, and the triangle inequality},
  journal = {Psychological Review},
  volume  = {89},
  number  = {2},
  pages   = {123--154},
  month   = mar,
  year    = {1982},
  doi     = {10.1037/0033-295X.89.2.123}
}

@article{utsumi_2020,
  author  = {Utsumi, Akira},
  title   = {Exploring What Is Encoded in Distributional Word Vectors: A Neurobiologically Motivated Analysis},
  journal = {Cognitive Science},
  volume  = {44},
  number  = {6},
  pages   = {e12844},
  year    = {2020},
  url     = {https://api.semanticscholar.org/CorpusID:218911983}
}

@article{vandervegt_etal_2021,
  author    = {van der Vegt, Isabelle and Mozes, Maximilian and Kleinberg, Bennett and Gill, Paul},
  title     = {The grievance dictionary: Understanding threatening language use},
  journal   = {Behavior Research Methods},
  pages     = {1--15},
  year      = {2021},
  publisher = {Springer}
}

@article{warriner_etal_2013,
  author    = {Warriner, Amy Beth and Kuperman, Victor and Brysbaert, Marc},
  title     = {Norms of valence, arousal, and dominance for 13,915 {English} lemmas},
  journal   = {Behavior Research Methods},
  volume    = {45},
  pages     = {1191--1207},
  year      = {2013},
  publisher = {Springer}
}

@book{wilkinson_2005,
  author    = {Wilkinson, Leland},
  title     = {The Grammar of Graphics},
  series    = {Statistics and Computing},
  publisher = {Springer-Verlag},
  address   = {Berlin, Heidelberg},
  year      = {2005},
  isbn      = {0387245448},
  doi       = {10.5555/1088896}
}

@article{wold_etal_2001,
  author  = {Wold, Svante and Sj{\"o}str{\"o}m, Michael and Eriksson, Lennart},
  title   = {{PLS}-regression: a basic tool of chemometrics},
  journal = {Chemometrics and Intelligent Laboratory Systems},
  volume  = {58},
  pages   = {109--130},
  year    = {2001},
  url     = {https://api.semanticscholar.org/CorpusID:11920190}
}

@inproceedings{xiao_mensah_2022,
  author    = {Xiao, Lu and Mensah, Humphrey},
  editor    = {Arai, Kohei},
  title     = {How Does the Thread Level of a Comment Affect its Perceived Persuasiveness? {A} {Reddit} Study},
  booktitle = {Intelligent Computing},
  publisher = {Springer International Publishing},
  year      = {2022},
  pages     = {800--813},
  isbn      = {978-3-031-10464-0},
  abstract  = {Online interactions increasingly involve complex processes of persuasion and influence. Compared to the long history and richness of persuasion studies in traditional communication settings, we have limited understanding of how people are influenced by others in online communications and how persuasion works in online environments. While it is common in online discussions that some comments are threaded under a specific thread, it is unknown whether and how the thread level affects its perceived persuasiveness. To explore this research inquiry, we collected and analyzed threaded discussions in Reddit's r/changemyview context. We found that the perceived persuasiveness of a comment fluctuates systematically from the top thread level to the most nested level. We conducted a semantic similarity analysis among adjacent comments in the threads examining how similar the comments are with respect to their content. Our results suggest that the first thread comment brings up a new idea or perspective, and the next comment matures it by adding new information to elaborate it, therefore, this comment is more likely to receive a delta point than the first comment. Additionally, this pattern continues onto the next comments. Implying that there is a common reasoning pattern in engaging in the threaded discussions in Reddit r/changemyview, our study sheds light on a comprehensive understanding of online participants' reasoning behavior in threaded discussions.}
}

@article{yadav_etal_2011,
  author  = {Yadav, Aman and Phillips, Michael and Lundeberg, Mary and Koehler, Matthew and Hilden, Katherine and Dirkin, Kathryn},
  title   = {If a picture is worth a thousand words is video worth a million? Differences in affective and cognitive processing of video and text cases},
  journal = {Journal of Computing in Higher Education},
  volume  = {23},
  pages   = {15--37},
  month   = apr,
  year    = {2011},
  doi     = {10.1007/s12528-011-9042-y}
}

@inproceedings{zamani_etal_2018,
  author    = {Zamani, Mohammadzaman and Buffone, Anneke and Schwartz, H. Andrew},
  title     = {Predicting Human Trustfulness from {F}acebook Language},
  editor    = {Loveys, Kate and Niederhoffer, Kate and Prud{'}hommeaux, Emily and Resnik, Rebecca and Resnik, Philip},
  booktitle = {Proceedings of the Fifth Workshop on Computational Linguistics and Clinical Psychology: From Keyboard to Clinic},
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, LA},
  month     = jun,
  year      = {2018},
  pages     = {174--181},
  doi       = {10.18653/v1/W18-0619},
  url       = {https://aclanthology.org/W18-0619},
  abstract  = {Trustfulness {---} one{'}s general tendency to have confidence in unknown people or situations {---} predicts many important real-world outcomes such as mental health and likelihood to cooperate with others such as clinicians. While data-driven measures of interpersonal trust have previously been introduced, here, we develop the first language-based assessment of the personality trait of trustfulness by fitting one{'}s language to an accepted questionnaire-based trust score. Further, using trustfulness as a type of case study, we explore the role of questionnaire size as well as word count in developing language-based predictive models of users{'} psychological traits. We find that leveraging a longer questionnaire can yield greater test set accuracy, while, for training, we find it beneficial to include users who took smaller questionnaires which offers more observations for training. Similarly, after noting a decrease in individual prediction error as word count increased, we found a word count-weighted training scheme was helpful when there were very few users in the first place.}
}

@misc{zhou_etal_2022,
  author        = {Kaitlyn Zhou
    and Kawin Ethayarajh
    and Dallas Card
    and Dan Jurafsky},
  title         = {Problems with Cosine as a Measure of Embedding Similarity for High Frequency Words},
  year          = {2022},
  eprint        = {2205.05092},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}