-
Notifications
You must be signed in to change notification settings - Fork 0
/
references.bib
2101 lines (1934 loc) · 145 KB
/
references.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@inproceedings{adamic_etal_2005,
  author    = {Adamic, Lada A. and Glance, Natalie},
  title     = {The Political Blogosphere and the 2004 {U.S.} Election: Divided They Blog},
  booktitle = {Proceedings of the 3rd International Workshop on Link Discovery},
  series    = {LinkKDD '05},
  pages     = {36--43},
  numpages  = {8},
  year      = {2005},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Chicago, Illinois},
  isbn      = {1595932151},
  doi       = {10.1145/1134271.1134277},
  url       = {https://doi.org/10.1145/1134271.1134277},
  keywords  = {social networks, political blogs, link analysis},
  abstract  = {In this paper, we study the linking patterns and discussion topics of political bloggers. Our aim is to measure the degree of interaction between liberal and conservative blogs, and to uncover any differences in the structure of the two communities. Specifically, we analyze the posts of 40 "A-list" blogs over the period of two months preceding the U.S. Presidential Election of 2004, to study how often they referred to one another and to quantify the overlap in the topics they discussed, both within the liberal and conservative communities, and also across communities. We also study a single day snapshot of over 1,000 political blogs. This snapshot captures blogrolls (the list of links to other blogs frequently found in sidebars), and presents a more static picture of a broader blogosphere. Most significantly, we find differences in the behavior of liberal and conservative blogs, with conservative blogs linking to each other more frequently and in a denser pattern.}
}
@misc{alammar_2019,
  author       = {Alammar, Jay},
  title        = {The Illustrated {Word2vec}},
  howpublished = {Jay Alammar -- Visualizing machine learning one concept at a time},
  year         = {2019},
  url          = {http://jalammar.github.io/illustrated-word2vec/}
}
@article{allport_1942,
  author    = {Allport, G. W.},
  title     = {The use of personal documents in psychological science},
  journal   = {Social Science Research Council Bulletin},
  volume    = {49},
  pagetotal = {xix + 210},
  year      = {1942},
  abstract  = {The psychological use of personal documents is traced from its uncritical beginnings at the turn of the century to its emergence in the last 20 years as a method in its own right. Its uses in molecular and molar research; in teaching; in suggesting new items for questionnaires; in inductive studies, often with the construction of typologies; in social psychology; etc. are examined. The place of the personal document in an idiographic rather than a nomothetic scheme is stressed: "Lawful happenings may be one-time events. Frequency is not a necessary test of validity." The forms of personal documents are presented with examples and discussion. Essentially, documents are reducible to autobiographies, questionnaire responses, verbatim recordings, diaries, letters, or expressive and projective productions. The evaluation of personal documents (65 pages) examines the case for and against their use. "It can be shown that{\ldots} critical tests of science are met by personal documents properly handled," and personal documents may be superior to actuarial methods by themselves in achieving the scientific goals of understanding, prediction, and control. Bibliography of 198 references; indices. (PsycINFO Database Record (c) 2016 APA, all rights reserved)}
}
@article{anderson_etal_1991,
  author   = {Anne H. Anderson and Miles Bader and Ellen Gurman Bard and Elizabeth Boyle and Gwyneth Doherty and Simon Garrod and Stephen Isard and Jacqueline Kowtko and Jan McAllister and Jim Miller and Catherine Sotillo and Henry S. Thompson and Regina Weinert},
  title    = {The {HCRC Map Task Corpus}},
  journal  = {Language and Speech},
  volume   = {34},
  number   = {4},
  pages    = {351--366},
  year     = {1991},
  doi      = {10.1177/002383099103400404},
  url      = {https://doi.org/10.1177/002383099103400404},
  eprint   = {https://doi.org/10.1177/002383099103400404},
  abstract = {This paper describes a corpus of unscripted, task-oriented dialogues which has been designed, digitally recorded, and transcribed to support the study of spontaneous speech on many levels. The corpus uses the Map Task (Brown, Anderson, Yule, and Shillcock, 1983) in which speakers must collaborate verbally to reproduce on one participant's map a route printed on the other's. In all, the corpus includes four conversations from each of 64 young adults and manipulates the following variables: familiarity of speakers, eye contact between speakers, matching between landmarks on the participants' maps, opportunities for contrastive stress, and phonological characteristics of landmark names. The motivations for the design are set out and basic corpus statistics are presented.}
}
@article{araque_etal_2018,
  author     = {Oscar Araque and Lorenzo Gatti and Jacopo Staiano and Marco Guerini},
  title      = {{DepecheMood++}: a Bilingual Emotion Lexicon Built Through Simple Yet Powerful Techniques},
  journal    = {CoRR},
  volume     = {abs/1810.03660},
  year       = {2018},
  url        = {http://arxiv.org/abs/1810.03660},
  eprinttype = {arXiv},
  eprint     = {1810.03660},
  timestamp  = {Tue, 30 Oct 2018 10:49:09 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1810-03660.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@article{ashokkumar_pennebaker_2022,
  author   = {Ashokkumar, Ashwini and Pennebaker, James W},
  title    = {Tracking group identity through natural language within groups},
  journal  = {PNAS Nexus},
  volume   = {1},
  number   = {2},
  pages    = {pgac022},
  year     = {2022},
  month    = jun,
  issn     = {2752-6542},
  doi      = {10.1093/pnasnexus/pgac022},
  url      = {https://doi.org/10.1093/pnasnexus/pgac022},
  eprint   = {https://academic.oup.com/pnasnexus/article-pdf/1/2/pgac022/47087259/pgac022.pdf},
  abstract = {To what degree can we determine people's connections with groups through the language they use? In recent years, large archives of behavioral data from social media communities have become available to social scientists, opening the possibility of tracking naturally occurring group identity processes. A feature of most digital groups is that they rely exclusively on the written word. Across 3 studies, we developed and validated a language-based metric of group identity strength and demonstrated its potential in tracking identity processes in online communities. In Studies 1a–1c, 873 people wrote about their connections to various groups (country, college, or religion). A total of 2 language markers of group identity strength were found: high affiliation (more words like we, togetherness) and low cognitive processing or questioning (fewer words like think, unsure). Using these markers, a language-based unquestioning affiliation index was developed and applied to in-class stream-of-consciousness essays of 2,161 college students (Study 2). Greater levels of unquestioning affiliation expressed in language predicted not only self-reported university identity but also students’ likelihood of remaining enrolled in college a year later. In Study 3, the index was applied to naturalistic Reddit conversations of 270,784 people in 2 online communities of supporters of the 2016 presidential candidates—Hillary Clinton and Donald Trump. The index predicted how long people would remain in the group (3a) and revealed temporal shifts mirroring members’ joining and leaving of groups (3b). Together, the studies highlight the promise of a language-based approach for tracking and studying group identity processes in online groups.},
}
@misc{atari_etal_2023,
  author    = {Atari, Mohammad and Omrani, Ali and Dehghani, Morteza},
  title     = {Contextualized Construct Representation: Leveraging Psychometric Scales to Advance Theory-Driven Text Analysis},
  publisher = {PsyArXiv},
  year      = {2023},
  month     = feb,
  doi       = {10.31234/osf.io/m93pd},
  url       = {https://osf.io/preprints/psyarxiv/m93pd}
}
@book{baayen_2001,
  author    = {Baayen, R. H.},
  title     = {Word Frequency Distributions},
  series    = {Text, Speech and Language Technology},
  publisher = {Springer Netherlands},
  year      = {2001},
  isbn      = {9780792370178},
  lccn      = {2001029823},
  url       = {https://link.springer.com/book/10.1007/978-94-010-0844-0}
}
@article{bandhakavi_etal_2021,
  author    = {Bandhakavi, Anil and Wiratunga, Nirmalie and Massie, Stewart and P., Deepak},
  title     = {Emotion-aware polarity lexicons for {Twitter} sentiment analysis},
  journal   = {Expert Systems},
  volume    = {38},
  number    = {7},
  year      = {2021},
  publisher = {Blackwell Publishing Ltd},
  address   = {Oxford},
  issn      = {0266-4720},
  language  = {eng},
  copyright = {2018 John Wiley \& Sons, Ltd.},
  keywords  = {Sentiment analysis ; Dirichlet problem ; Data mining ; Psychology ; Emotions ; Vocabulary ; Social media ; Grammar, Generative ; Generative grammar ; Computer science ; Natural Language Processing ; Information storage and retrieval systems ; Artificial intelligence},
}
@inproceedings{badaro_etal_2018,
  author    = {Badaro, Gilbert and Jundi, Hussein and Hajj, Hazem and El-Hajj, Wassim},
  editor    = {Nissim, Malvina and Berant, Jonathan and Lenci, Alessandro},
  title     = {{E}mo{W}ord{N}et: Automatic Expansion of Emotion Lexicon Using {E}nglish {W}ord{N}et},
  booktitle = {Proceedings of the Seventh Joint Conference on Lexical and Computational Semantics},
  pages     = {86--93},
  year      = {2018},
  month     = jun,
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana},
  doi       = {10.18653/v1/S18-2009},
  url       = {https://aclanthology.org/S18-2009},
}
@misc{baumgartner_etal_2020,
  author    = {Baumgartner, Jason and Zannettou, Savvas and Keegan, Brian and Squire, Megan and Blackburn, Jeremy},
  title     = {The {Pushshift} {Reddit} Dataset},
  publisher = {Zenodo},
  year      = {2020},
  month     = jan,
  doi       = {10.5281/zenodo.3608135},
  url       = {https://doi.org/10.5281/zenodo.3608135}
}
@article{bellezza_etal_1986,
  author    = {Bellezza, Francis S and Greenwald, Anthony G and Banaji, Mahzarin R},
  title     = {Words high and low in pleasantness as rated by male and female college students},
  journal   = {Behavior Research Methods, Instruments, \& Computers},
  volume    = {18},
  pages     = {299--303},
  year      = {1986},
  publisher = {Springer}
}
@article{biester_etal_2022,
  author    = {Biester, Laura and Pennebaker, James and Mihalcea, Rada},
  title     = {Emotional and cognitive changes surrounding online depression identity claims},
  journal   = {PLOS ONE},
  volume    = {17},
  number    = {12},
  pages     = {1--20},
  year      = {2022},
  month     = dec,
  publisher = {Public Library of Science},
  doi       = {10.1371/journal.pone.0278179},
  url       = {https://doi.org/10.1371/journal.pone.0278179},
  abstract  = {As social media has proliferated, a key aspect to making meaningful connections with people online has been revealing important parts of one’s identity. In this work, we study changes that occur in people’s language use after they share a specific piece of their identity: a depression diagnosis. To do so, we collect data from over five thousand users who have made such a statement, which we refer to as an identity claim. Prior to making a depression identity claim, the Reddit user’s language displays evidence of increasingly higher rates of anxiety, sadness, and cognitive processing language compared to matched controls. After the identity claim, these language markers decrease and more closely match the controls. Similarly, first person singular pronoun usage decreases following the identity claim, which was previously previously found to be indicative of self-focus and associated with depression. By further considering how and to whom people express their identity, we find that the observed longitudinal changes are larger for those who do so in ways that are more correlated with seeking help (sharing in a post instead of a comment; sharing in a mental health support forum). This work suggests that there may be benefits to sharing one’s depression diagnosis, especially in a semi-anonymous forum where others are likely to be empathetic.},
}
@article{blei_etal_2003,
  author     = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.},
  title      = {Latent {Dirichlet} Allocation},
  journal    = {Journal of Machine Learning Research},
  volume     = {3},
  pages      = {993--1022},
  numpages   = {30},
  year       = {2003},
  month      = mar,
  issue_date = {3/1/2003},
  publisher  = {JMLR.org},
  issn       = {1532-4435},
  doi        = {10.5555/944919.944937},
  abstract   = {We describe latent Dirichlet allocation (LDA), a generative probabilistic model for collections of discrete data such as text corpora. LDA is a three-level hierarchical Bayesian model, in which each item of a collection is modeled as a finite mixture over an underlying set of topics. Each topic is, in turn, modeled as an infinite mixture over an underlying set of topic probabilities. In the context of text modeling, the topic probabilities provide an explicit representation of a document. We present efficient approximate inference techniques based on variational methods and an EM algorithm for empirical Bayes parameter estimation. We report results in document modeling, text classification, and collaborative filtering, comparing to a mixture of unigrams model and the probabilistic LSI model.}
}
@misc{blei_mcauliffe_2010,
  author        = {David M. Blei and Jon D. McAuliffe},
  title         = {Supervised Topic Models},
  year          = {2010},
  archivePrefix = {arXiv},
  eprint        = {1003.0783},
  primaryClass  = {stat.ML}
}
@article{bojanowski_etal_2017,
  author        = {Piotr Bojanowski and Edouard Grave and Armand Joulin and Tomas Mikolov},
  title         = {Enriching Word Vectors with Subword Information},
  journal       = {Transactions of the Association for Computational Linguistics},
  volume        = {5},
  pages         = {135--146},
  year          = {2017},
  doi           = {10.1162/tacl_a_00051},
  eprint        = {1607.04606},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}
@misc{boyd_etal_2022,
  author = {Boyd, Ryan and Ashokkumar, Ashwini and Seraj, Sarah and Pennebaker, James},
  title  = {The Development and Psychometric Properties of {LIWC-22}},
  year   = {2022},
  month  = feb,
  doi    = {10.13140/RG.2.2.23890.43205}
}
@article{buechel_etal_2018,
  author     = {Sven Buechel and Anneke Buffone and Barry Slaff and Lyle H. Ungar and Jo{\~{a}}o Sedoc},
  title      = {Modeling Empathy and Distress in Reaction to News Stories},
  journal    = {CoRR},
  volume     = {abs/1808.10399},
  year       = {2018},
  eprinttype = {arXiv},
  eprint     = {1808.10399},
  url        = {http://arxiv.org/abs/1808.10399},
  timestamp  = {Mon, 03 Sep 2018 13:36:40 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-1808-10399.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{buechel_etal_2020,
  author    = {Buechel, Sven and R{\"u}cker, Susanna and Hahn, Udo},
  editor    = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel},
  title     = {Learning and Evaluating Emotion Lexicons for 91 Languages},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  pages     = {1202--1217},
  year      = {2020},
  month     = jul,
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  doi       = {10.18653/v1/2020.acl-main.112},
  url       = {https://aclanthology.org/2020.acl-main.112},
  abstract  = {Emotion lexicons describe the affective meaning of words and thus constitute a centerpiece for advanced sentiment and emotion analysis. Yet, manually curated lexicons are only available for a handful of languages, leaving most languages of the world without such a precious resource for downstream applications. Even worse, their coverage is often limited both in terms of the lexical units they contain and the emotional variables they feature. In order to break this bottleneck, we here introduce a methodology for creating almost arbitrarily large emotion lexicons for any target language. Our approach requires nothing but a source language emotion lexicon, a bilingual word translation model, and a target language embedding model. Fulfilling these requirements for 91 languages, we are able to generate representationally rich high-coverage lexicons comprising eight emotional variables with more than 100k lexical entries each. We evaluated the automatically generated lexicons against human judgment from 26 datasets, spanning 12 typologically diverse languages, and found that our approach produces results in line with state-of-the-art monolingual approaches to lexicon creation and even surpasses human reliability for some languages and variables. Code and data are available at \url{https://github.com/JULIELab/MEmoLon} archived under DOI 10.5281/zenodo.3779901.},
}
@inproceedings{burger_etal_2011,
  author    = {Burger, John D. and Henderson, John and Kim, George and Zarrella, Guido},
  title     = {Discriminating Gender on {Twitter}},
  booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP '11},
  pages     = {1301--1309},
  numpages  = {9},
  year      = {2011},
  publisher = {Association for Computational Linguistics},
  address   = {USA},
  location  = {Edinburgh, United Kingdom},
  isbn      = {9781937284114},
  abstract  = {Accurate prediction of demographic attributes from social media and other informal online content is valuable for marketing, personalization, and legal investigation. This paper describes the construction of a large, multilingual dataset labeled with gender, and investigates statistical models for determining the gender of uncharacterized Twitter users. We explore several different classifier types on this dataset. We show the degree to which classifier accuracy varies based on tweet volumes as well as when various kinds of profile metadata are included in the models. We also perform a large-scale human assessment using Amazon Mechanical Turk. Our methods significantly out-perform both baseline models and almost all humans on the same task.}
}
@article{brysbaert_etal_2014,
  author    = {Brysbaert, Marc and Warriner, Amy Beth and Kuperman, Victor},
  title     = {Concreteness ratings for 40 thousand generally known {English} word lemmas},
  journal   = {Behavior Research Methods},
  volume    = {46},
  pages     = {904--911},
  year      = {2014},
  publisher = {Springer}
}
@inproceedings{cai_etal_2021,
  author    = {Xingyu Cai and Jiaji Huang and Yuchen Bian and Kenneth Church},
  title     = {Isotropy in the Contextual Embedding Space: Clusters and Manifolds},
  booktitle = {International Conference on Learning Representations},
  year      = {2021},
  url       = {https://openreview.net/forum?id=xYGNO86OWDH}
}
@article{chatterjee_etal_2023,
  author  = {Chatterjee, Promothesh and Mishra, Himanshu and Mishra, Arul},
  title   = {Does the first letter of one's name affect life decisions? A natural language processing examination of nominative determinism},
  journal = {Journal of Personality and Social Psychology},
  volume  = {125},
  year    = {2023},
  month   = may,
  doi     = {10.1037/pspa0000347}
}
@article{chersoni_etal_2021,
  author    = {Chersoni, Emmanuele and Santus, Enrico and Huang, Chu-Ren and Lenci, Alessandro},
  title     = {Decoding Word Embeddings with Brain-Based Semantic Features},
  journal   = {Computational Linguistics},
  volume    = {47},
  number    = {3},
  pages     = {663--698},
  year      = {2021},
  month     = nov,
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/coli_a_00412},
  url       = {https://aclanthology.org/2021.cl-3.20},
  abstract  = {Word embeddings are vectorial semantic representations built with either counting or predicting techniques aimed at capturing shades of meaning from word co-occurrences. Since their introduction, these representations have been criticized for lacking interpretable dimensions. This property of word embeddings limits our understanding of the semantic features they actually encode. Moreover, it contributes to the {``}black box{''} nature of the tasks in which they are used, since the reasons for word embedding performance often remain opaque to humans. In this contribution, we explore the semantic properties encoded in word embeddings by mapping them onto interpretable vectors, consisting of explicit and neurobiologically motivated semantic features (Binder et al. 2016). Our exploration takes into account different types of embeddings, including factorized count vectors and predict models (Skip-Gram, GloVe, etc.), as well as the most recent contextualized representations (i.e., ELMo and BERT). In our analysis, we first evaluate the quality of the mapping in a retrieval task, then we shed light on the semantic features that are better encoded in each embedding type. A large number of probing tasks is finally set to assess how the original and the mapped embeddings perform in discriminating semantic categories. For each probing task, we identify the most relevant semantic features and we show that there is a correlation between the embedding performance and how they encode those features. This study sets itself as a step forward in understanding which aspects of meaning are captured by vector spaces, by proposing a new and simple method to carve human-interpretable semantic representations from distributional vectors.}
}
@article{choi_choi_2010,
  author   = {Choi, Dong-Won and Choi, Incheol},
  title    = {A Comparison of Hindsight Bias in Groups and Individuals: The Moderating Role of Plausibility},
  journal  = {Journal of Applied Social Psychology},
  volume   = {40},
  number   = {2},
  pages    = {325--343},
  year     = {2010},
  issn     = {0021-9029},
  url      = {https://search.ebscohost.com/login.aspx?direct=true&db=sxi&AN=48116256&site=ehost-live},
  keywords = {SOCIAL psychology research, HINDSIGHT bias (Psychology), PREJUDICES, MEMORY, HUMAN behavior research, SOCIAL groups, PLAUSIBILITY (Logic)},
  abstract = {We compared the magnitude of the hindsight bias in individuals and groups with the prediction that the plausibility of an outcome would affect the magnitude of the group–individual difference. We provided groups and individuals with outcomes of scientific studies, and asked them to predict the probability of those outcomes as if they did not know the given outcomes and to report their level of surprise at the outcomes. Overall, groups were more prone to hindsight bias than were individuals, but the group–individual difference was present only when the given outcomes were relatively implausible (Study 1). Moreover, this difference was not eliminated even when participants were asked to consider alternative outcomes (Study 2). Implications are discussed. [ABSTRACT FROM AUTHOR]},
}
@article{choi_nisbett_2000,
  author    = {Choi, Incheol and Nisbett, Richard E},
  title     = {Cultural Psychology of Surprise: Holistic Theories and Recognition of Contradiction},
  journal   = {Journal of Personality and Social Psychology},
  volume    = {79},
  number    = {6},
  pages     = {890--905},
  year      = {2000},
  publisher = {American Psychological Association},
  address   = {Washington, DC},
  issn      = {0022-3514},
  language  = {eng},
  copyright = {2000 American Psychological Association},
  keywords  = {Adult ; Asians ; Attitude formation ; Attitudes ; Behavior ; Biological and medical sciences ; Cognition ; Concept Formation ; Cross Cultural Differences ; Cross-Cultural Comparison ; Cultural differences ; Culture ; Emotional Responses ; Emotions ; Ethnic Groups - psychology ; Expectations ; Female ; Fundamental and applied biological sciences. Psychology ; Helping Behavior ; Holism ; Human ; Humans ; Korea ; Logic ; Male ; Probability Learning ; Psychology ; Psychology. Psychoanalysis. Psychiatry ; Psychology. Psychophysiology ; Response Bias ; Social attribution, perception and cognition ; Social Perception ; Social psychology ; U.S.A ; United States},
  abstract  = {The authors tested the hypothesis that East Asians, because of their holistic reasoning, take contradiction and inconsistency for granted and consequently are less likely than Americans to experience surprise. Studies 1 and 2 showed that Korean participants displayed less surprise and greater hindsight bias than American participants did when a target's behavior contradicted their expectations. Studies 3 and 4 further demonstrated that even when contradiction was created in highly explicit ways, Korean participants experienced little surprise, whereas American participants reported substantial surprise. We discuss the implications of these findings for various issues, including the psychology of conviction, cognitive dissonance, and the development of science.},
}
@article{chung_pennebaker_2008,
  author   = {Cindy K. Chung and James W. Pennebaker},
  title    = {Revealing dimensions of thinking in open-ended self-descriptions: An automated meaning extraction method for natural language},
  journal  = {Journal of Research in Personality},
  volume   = {42},
  number   = {1},
  pages    = {96--132},
  year     = {2008},
  issn     = {0092-6566},
  doi      = {10.1016/j.jrp.2007.04.006},
  url      = {https://www.sciencedirect.com/science/article/pii/S0092656607000451},
  keywords = {LIWC, Meaning extraction method, Natural language, Self-descriptions},
  abstract = {A new method for extracting common themes from written text is introduced and applied to 1165 open-ended self-descriptive narratives. Drawing on a lexical approach to personality, the most commonly-used adjectives within narratives written by college students were identified using computerized text analytic tools. A factor analysis on the use of these adjectives in the self-descriptions produced a 7-factor solution consisting of psychologically meaningful dimensions. Some dimensions were unipolar (e.g., Negativity factor, wherein most loaded items were negatively valenced adjectives); others were dimensional in that semantically opposite words clustered together (e.g., Sociability factor, wherein terms such as shy, outgoing, reserved, and loud all loaded in the same direction). The factors exhibited modest reliability across different types of writing samples and were correlated with self-reports and behaviors consistent with the dimensions. Similar analyses with additional content words (adjectives, adverbs, nouns, and verbs) yielded additional psychological dimensions associated with physical appearance, school, relationships, etc. in which people contextualize their self-concepts. The results suggest that the meaning extraction method is a promising strategy that determines the dimensions along which people think about themselves.}
}
@inproceedings{cohan_etal_2018,
  author    = {Cohan, Arman and Desmet, Bart and Yates, Andrew and Soldaini, Luca and MacAvaney, Sean and Goharian, Nazli},
  editor    = {Bender, Emily M. and Derczynski, Leon and Isabelle, Pierre},
  title     = {{SMHD}: a Large-Scale Resource for Exploring Online Language Usage for Multiple Mental Health Conditions},
  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
  pages     = {1485--1497},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Santa Fe, New Mexico, USA},
  url       = {https://aclanthology.org/C18-1126},
  abstract  = {Mental health is a significant and growing public health concern. As language usage can be leveraged to obtain crucial insights into mental health conditions, there is a need for large-scale, labeled, mental health-related datasets of users who have been diagnosed with one or more of such conditions. In this paper, we investigate the creation of high-precision patterns to identify self-reported diagnoses of nine different mental health conditions, and obtain high-quality labeled data without the need for manual labelling. We introduce the SMHD (Self-reported Mental Health Diagnoses) dataset and make it available. SMHD is a novel large dataset of social media posts from users with one or multiple mental health conditions along with matched control users. We examine distinctions in users{'} language, as measured by linguistic and psychological variables. We further explore text classification methods to identify individuals with mental conditions through their language.}
}
@article{creegan_1944,
  author    = {Creegan, Robert F.},
  title     = {The phenomenological analysis of personal documents},
  journal   = {The Journal of Abnormal and Social Psychology},
  volume    = {39},
  number    = {2},
  pages     = {244--266},
  year      = {1944},
  publisher = {American Psychological Association},
  address   = {US},
  issn      = {0096-851X},
  doi       = {10.1037/h0062816},
  url       = {https://doi.org/10.1037/h0062816},
  keywords  = {*Narratives; *Personality; Phenomenology},
  abstract  = {The diary of a college boy is analyzed to illustrate the method of integral phenomenology. In comparing the personal worlds revealed by such documents, topical analysis is considered superficial. The content is analyzed, rather, by categories of form (complexity of expression and evaluation); change (in personal world); ideas of causation; values; plenitude (intensity of experience); direction (origin of values and actions); and distance (from objects and topics of interest and from conventional norms). The writer also considers the content symbolic of three complexes, apparently unrelated to these categories but suggested by the case. (PsycInfo Database Record (c) 2021 APA, all rights reserved)}
}
@book{crystal_1997,
  author    = {Crystal, David},
  title     = {The Cambridge encyclopedia of language},
  edition   = {Second},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = {1997},
  isbn      = {0521550505},
  lccn      = {96003104},
  language  = {eng},
}
@article{curini_valerio_2021,
  author   = {Curini, Luigi and Vignoli, Valerio},
  title    = {Committed Moderates and Uncommitted Extremists: Ideological Leaning and Parties’ Narratives on Military Interventions in {Italy}},
  journal  = {Foreign Policy Analysis},
  volume   = {17},
  number   = {3},
  pages    = {orab016},
  year     = {2021},
  month    = may,
  issn     = {1743-8586},
  doi      = {10.1093/fpa/orab016},
  url      = {https://doi.org/10.1093/fpa/orab016},
  abstract = {Current research highlights that ideology decisively affects political contestation concerning peace and security operations in European countries. In particular, recent studies suggest that party preferences on this issue follow a curvilinear distribution along the left-right axis, delineating a conflict between moderate and extreme parties. However, the impact of this cleavage on the use of strategic narratives to either support or criticize these missions requires more attention. This article aims to fill this gap by employing seeded latent Dirichlet allocation, a semi-supervised automated text analysis method, to analyze parliamentary debates on Italy's most significant troop deployments between 1994 and 2013. We expect to find that while moderates express a supportive narrative aimed at justifying the use of force, extremists attempt to delegitimize military interventions. Accordingly, we hypothesize that moderate parties emphasize more on the multilateral and humanitarian framework of a mission, while extremist parties focus more on its military means. The empirical findings largely confirm our hypotheses. By means of its method and results, the article contributes both empirically and methodologically to the debate on the party politics of military interventions in Europe.},
}
@article{dideriksen_etal_2023,
  author  = {Dideriksen, C. and Christiansen, M. H. and Tylén, K. and Dingemanse, M. and Fusaroli, R.},
  title   = {Quantifying the interplay of conversational devices in building mutual understanding},
  journal = {Journal of Experimental Psychology: General},
  year    = {2023},
  volume  = {152},
  number  = {3},
  pages   = {864--889},
  doi     = {10.1037/xge0001301},
}
@inproceedings{dingemanse_liesenfeld_2022,
  author    = {Dingemanse, Mark and Liesenfeld, Andreas},
  title     = {From text to talk: {H}arnessing conversational corpora for humane and diversity-aware language technology},
  editor    = {Muresan, Smaranda and Nakov, Preslav and Villavicencio, Aline},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {5614--5633},
  publisher = {Association for Computational Linguistics},
  address   = {Dublin, Ireland},
  month     = may,
  year      = {2022},
  doi       = {10.18653/v1/2022.acl-long.385},
  url       = {https://aclanthology.org/2022.acl-long.385},
  abstract  = {Informal social interaction is the primordial home of human language. Linguistically diverse conversational corpora are an important and largely untapped resource for computational linguistics and language technology. Through the efforts of a worldwide language documentation movement, such corpora are increasingly becoming available. We show how interactional data from 63 languages (26 families) harbours insights about turn-taking, timing, sequential structure and social action, with implications for language technology, natural language understanding, and the design of conversational interfaces. Harnessing linguistically diverse conversational corpora will provide the empirical foundations for flexible, localizable, humane language technologies of the future.},
}
@article{davies_2009,
  author  = {Davies, Mark},
  title   = {The 385+ million word {Corpus of Contemporary American English} (1990--2008+): Design, architecture, and linguistic insights},
  journal = {International Journal of Corpus Linguistics},
  year    = {2009},
  volume  = {14},
  number  = {2},
  pages   = {159--190},
  doi     = {10.1075/ijcl.14.2.02dav},
  url     = {https://www.english-corpora.org/coca/},
}
@article{deerwester_etal_1990,
  author    = {Deerwester, Scott and Dumais, Susan T. and Furnas, George W. and Landauer, Thomas K. and Harshman, Richard A.},
  title     = {Indexing by Latent Semantic Analysis},
  journal   = {Journal of the American Society for Information Science},
  year      = {1990},
  volume    = {41},
  number    = {6},
  pages     = {391--407},
  publisher = {John Wiley \& Sons, Ltd},
  doi       = {10.1002/(SICI)1097-4571(199009)41:6<391::AID-ASI1>3.0.CO;2-9},
}
@misc{devlin_etal_2019,
  author        = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title         = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  year          = {2019},
  eprint        = {1810.04805},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
@article{diveica_etal_2023,
  author    = {Diveica, Veronica and Pexman, Penny M and Binney, Richard J},
  title     = {Quantifying social semantics: An inclusive definition of socialness and ratings for 8388 English words},
  journal   = {Behavior Research Methods},
  year      = {2023},
  volume    = {55},
  number    = {2},
  pages     = {461--473},
  publisher = {Springer},
}
@article{downs_etal_2017,
  author  = {Downs, Johnny M. and Velupillai, Sumithra and Gkotsis, George and Holden, Rachel and Kikoler, Maxim and Dean, Harry and Fernandes, Andrea C. and Dutta, Rina},
  title   = {Detection of Suicidality in Adolescents with Autism Spectrum Disorders: Developing a Natural Language Processing Approach for Use in Electronic Health Records},
  journal = {AMIA Annual Symposium Proceedings},
  year    = {2017},
  volume  = {2017},
  pages   = {641--649},
  url     = {https://api.semanticscholar.org/CorpusID:7388358},
}
@article{duran_etal_2019,
  author    = {Duran, Nicholas D. and Paxton, Alexandra and Fusaroli, Riccardo},
  title     = {{ALIGN}: Analyzing Linguistic Interactions With Generalizable {techNiques}---A {Python} Library},
  journal   = {Psychological Methods},
  year      = {2019},
  volume    = {24},
  number    = {4},
  pages     = {419--438},
  publisher = {American Psychological Association},
  address   = {United States},
  issn      = {1082-989X},
  copyright = {2019 American Psychological Association},
  abstract  = {Linguistic alignment (LA) is the tendency during a conversation to reuse each other's linguistic expressions, including lexical, conceptual, or syntactic structures. LA is often argued to be a crucial driver in reciprocal understanding and interpersonal rapport, though its precise dynamics and effects are still controversial. One barrier to more systematic investigation of these effects lies in the diversity in the methods employed to analyze LA, which makes it difficult to integrate and compare results of individual studies. To overcome this issue, we have developed ALIGN (Analyzing Linguistic Interactions with Generalizable techNiques), an open-source Python package to measure LA in conversation (https://pypi.python.org/pypi/align) along with in-depth open-source tutorials hosted on ALIGN's GitHub repository (https://github.com/nickduran/align-linguistic-alignment). Here, we first describe the challenges in the study of LA and outline how ALIGN can address them. We then demonstrate how our analytical protocol can be applied to theory-driven questions using a complex corpus of dialogue (the Devil's Advocate corpus; Duran \& Fusaroli, 2017). We close by identifying further challenges and point to future developments of the field.},
}
@article{eichstaedt_etal_2015,
  author   = {Johannes C. Eichstaedt and H. Andrew Schwartz and Margaret L. Kern and Gregory Park and Darwin R. Labarthe and Raina M. Merchant and Sneha Jha and Megha Agrawal and Lukasz A. Dziurzynski and Maarten Sap and Christopher Weeg and Emily E. Larson and Lyle H. Ungar and Martin E. P. Seligman},
  title    = {Psychological Language on {Twitter} Predicts County-Level Heart Disease Mortality},
  journal  = {Psychological Science},
  volume   = {26},
  number   = {2},
  pages    = {159--169},
  year     = {2015},
  doi      = {10.1177/0956797614557867},
  url      = {https://osf.io/rt6w2/},
  note     = {PMID: 25605707},
  abstract = {Hostility and chronic stress are known risk factors for heart disease, but they are costly to assess on a large scale. We used language expressed on Twitter to characterize community-level psychological correlates of age-adjusted mortality from atherosclerotic heart disease (AHD). Language patterns reflecting negative social relationships, disengagement, and negative emotions—especially anger—emerged as risk factors; positive emotions and psychological engagement emerged as protective factors. Most correlations remained significant after controlling for income and education. A cross-sectional regression model based only on Twitter language predicted AHD mortality significantly better than did a model that combined 10 common demographic, socioeconomic, and health risk factors, including smoking, diabetes, hypertension, and obesity. Capturing community psychological characteristics through social media is feasible, and these characteristics are strong markers of cardiovascular mortality at the community level.},
}
@article{engelthaler_hills_2018,
  author    = {Engelthaler, Tomas and Hills, Thomas T},
  title     = {Humor norms for 4,997 {English} words},
  journal   = {Behavior Research Methods},
  year      = {2018},
  volume    = {50},
  pages     = {1116--1124},
  publisher = {Springer},
}
@misc{ethayarajh_2019,
  author        = {Ethayarajh, Kawin},
  title         = {How Contextual are Contextualized Word Representations? Comparing the Geometry of {BERT}, {ELMo}, and {GPT-2} Embeddings},
  year          = {2019},
  eprint        = {1909.00512},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
@misc{ethayarajh_etal_2019,
  author        = {Kawin Ethayarajh and David Duvenaud and Graeme Hirst},
  title         = {Towards Understanding Linear Word Analogies},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1810.04882},
  primaryClass  = {cs.CL},
}
@article{fedorenko_varley_2016,
  author  = {Fedorenko, Evelina and Varley, Rosemary},
  title   = {Language and thought are not the same thing: Evidence from neuroimaging and neurological patients},
  journal = {Annals of the New York Academy of Sciences},
  year    = {2016},
  month   = apr,
  volume  = {1369},
  number  = {1},
  pages   = {132--153},
  doi     = {10.1111/nyas.13046},
}
@article{feng_etal_2015,
  author    = {Feng, Shi and Song, Kaisong and Wang, Daling and Yu, Ge},
  title     = {A word-emoticon mutual reinforcement ranking model for building sentiment lexicon from massive collection of microblogs},
  journal   = {World Wide Web},
  year      = {2015},
  volume    = {18},
  number    = {4},
  pages     = {949--967},
  publisher = {Springer US},
  address   = {New York},
  issn      = {1386-145X},
  copyright = {Springer Science+Business Media New York 2014},
  language  = {eng},
  keywords  = {Computer science ; Database management ; Digital media ; Collection ; Social networks ; Thesauri ; Data mining ; Microblogs ; Sentiment analysis ; Social media ; Data Science ; Natural Language Processing ; Artificial intelligence},
}
@unpublished{frimer_etal_2019,
  author = {Frimer, Jeremy A and Boghrati, Reihane and Haidt, Jonathan and Graham, Jesse and Dehghani, Morteza},
  title  = {Moral foundations dictionary for linguistic analyses 2.0},
  year   = {2019},
  note   = {Unpublished manuscript},
}
@article{gagne_etal_2005,
  author  = {Gagné, Christina and Spalding, Thomas and Ji, Hongbo},
  title   = {Re-examining evidence for the use of independent relational representations during conceptual combination},
  journal = {Journal of Memory and Language},
  year    = {2005},
  month   = sep,
  volume  = {53},
  pages   = {445--455},
  doi     = {10.1016/j.jml.2005.03.006},
}
@article{gale_sampson_1995,
  author    = {Gale, William A. and Sampson, Geoffrey},
  title     = {{Good-Turing} frequency estimation without tears},
  journal   = {Journal of Quantitative Linguistics},
  year      = {1995},
  month     = jan,
  volume    = {2},
  number    = {3},
  pages     = {217--237},
  publisher = {Routledge},
  issn      = {0929-6174},
  doi       = {10.1080/09296179508590051},
  url       = {https://doi.org/10.1080/09296179508590051},
  urldate   = {2024-04-08},
  abstract  = {Linguists and speech researchers who use statistical methods often need to estimate the frequency of some type of item in a population containing items of various types. A common approach is to divide the number of cases observed in a sample by the size of the sample; sometimes small positive quantities are added to divisor and dividend in order to avoid zero estimates for types missing from the sample. These approaches are obvious and simple, but they lack principled justification, and yield estimates that can be wildly inaccurate. I.J. Good and Alan Turing developed a family of theoretically well‐founded techniques appropriate to this domain. Some versions of the Good‐Turing approach are very demanding computationally, but we define a version, the Simple Good‐Turing estimator, which is straightforward to use. Tested on a variety of natural‐language‐related data sets, the Simple Good‐Turing estimator performs well, absolutely and relative both to the approaches just discussed and to other, more sophisticated techniques.},
}
@inproceedings{ganesan_etal_2021,
  author    = {Ganesan, Adithya and Matero, Matthew and Ravula, Aravind Reddy and Vu, Huy and Schwartz, H. Andrew},
  title     = {Empirical Evaluation of Pre-trained Transformers for Human-Level NLP: The Role of Sample Size and Dimensionality},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher = {Association for Computational Linguistics},
  year      = {2021},
  doi       = {10.18653/v1/2021.naacl-main.357},
  url       = {http://dx.doi.org/10.18653/v1/2021.naacl-main.357},
}
@misc{gao_etal_2019,
  author        = {Jun Gao and Di He and Xu Tan and Tao Qin and Liwei Wang and Tie-Yan Liu},
  title         = {Representation Degeneration Problem in Training Natural Language Generation Models},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1907.12009},
  primaryClass  = {cs.CL},
}
@article{garcia_sikstrom_2013,
  author   = {Garcia, Danilo and Sikström, Sverker},
  title    = {Quantifying the Semantic Representations of Adolescents’ Memories of Positive and Negative Life Events},
  journal  = {Journal of Happiness Studies},
  year     = {2013},
  month    = aug,
  volume   = {14},
  number   = {4},
  pages    = {1309--1323},
  doi      = {10.1007/s10902-012-9385-8},
  url      = {https://ideas.repec.org/a/spr/jhappi/v14y2013i4p1309-1323.html},
  keywords = {Quantitative semantic; Adolescence; Affect; Autobiographical memory; Happiness; Latent semantic analysis},
  abstract = {We quantified the semantic content in adolescents’ descriptions of positive and negative life events and studied how these descriptions are related to the assessment subjective well-being (SWB) at two points in time. The semantic content of the descriptions was quantified by latent semantic analysis (LSA). LSA is a computational method based on algorithms stemming from computational linguistics, where a high dimensional semantic representation of words can be generated from co-occurrence of words in huge text corpora. We investigated if the semantic content of written autobiographical memories of positive and negative life events predicted traditionally ranked measures of SWB, i.e., self-reports of Positive and Negative Affect, and thus created semantic measures of SWB. Such measures can be used to investigate the relationship between semantic content and SWB, which could only indirectly be accomplished by the ranked data. Pupils wrote down positive or negative life events during the last 3 months and self-reported SWB. Four weeks later, participants were presented with their own description and asked to report current SWB. The results showed that the semantic representation predicted SWB and experimental conditions. The agreement between semantic and ranked measures supports the validity of the semantic scores. We argue that our proposed method for studying SWB provides new and essential information about well-being by the quantification of a richer set of information from adolescents’ own memories. Copyright Springer Science+Business Media B.V. 2013},
}
@article{garten_etal_2018,
  author    = {Garten, Justin and Hoover, Joe and Johnson, Kate M and Boghrati, Reihane and Iskiwitch, Carol and Dehghani, Morteza},
  title     = {Dictionaries and distributions: Combining expert knowledge and large scale textual data content analysis: Distributed dictionary representation},
  journal   = {Behavior Research Methods},
  year      = {2018},
  volume    = {50},
  pages     = {344--361},
  publisher = {Springer},
}
@article{giuntini_etal_2020,
  author    = {Giuntini, Felipe T. and Cazzolato, Mirela T. and dos Reis, Maria de Jesus Dutra and Campbell, Andrew T. and Traina, Agma J. M. and Ueyama, Jó},
  title     = {A review on recognizing depression in social networks: challenges and opportunities},
  journal   = {Journal of Ambient Intelligence and Humanized Computing},
  year      = {2020},
  volume    = {11},
  number    = {11},
  pages     = {4713--4729},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin/Heidelberg},
  issn      = {1868-5137},
  copyright = {Springer-Verlag GmbH Germany, part of Springer Nature 2020},
  language  = {eng},
  keywords  = {Engineering ; Computational intelligence ; Artificial intelligence ; Digital media ; Mental illness ; Psychology, Pathological ; Mental health ; Affective disorders ; Social networks ; Depression, Mental ; Emotions ; Appetite ; Data Science ; Data mining ; Sleep disorders ; Suicidal behavior ; Insomnia ; Self-perception},
  abstract  = {Social networks have become another resource for supporting mental health specialists in making inferences and finding indications of mental disorders, such as depression. This paper addresses the state-of-the-art regarding studies on recognition of depressive mood disorders in social networks through approaches and techniques of sentiment and emotion analysis. The systematic research conducted focused on social networks, social media, and the most employed techniques, feelings, and emotions were analyzed to find predecessors of a depressive disorder. Discussions on the research gaps identified aimed at improving the effectiveness of the analysis process, bringing the analysis close to the user’s reality. Twitter, Facebook, Blogs and Forums, Reddit, Live Journal, and Instagram are the most employed social networks regarding the identification of depressive mood disorders, and the most used information was text, followed by emoticons, user log information, and images. The selected studies usually employ classic off-the-shelf classifiers for the analysis of the available information, combined with lexicons such as NRC Word-Emoticon Association Lexicon, WordNet-Affect, Anew, and LIWC tool. The challenges include the analysis of temporal information and a combination of different types of information.},
}
@misc{goldberg_levy_2014,
  author        = {Goldberg, Yoav and Levy, Omer},
  title         = {{word2vec} Explained: deriving {Mikolov} et al.'s negative-sampling word-embedding method},
  year          = {2014},
  eprint        = {1402.3722},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
@article{golder_macy_2011,
  author   = {Golder, Scott A. and Macy, Michael W.},
  title    = {Diurnal and Seasonal Mood Vary with Work, Sleep, and Daylength Across Diverse Cultures},
  journal  = {Science},
  volume   = {333},
  number   = {6051},
  pages    = {1878--1881},
  year     = {2011},
  doi      = {10.1126/science.1202775},
  url      = {https://www.science.org/doi/abs/10.1126/science.1202775},
  abstract = {Across the world the collective mood heightens at breakfast time and during the weekend. We identified individual-level diurnal and seasonal mood rhythms in cultures across the globe, using data from millions of public Twitter messages. We found that individuals awaken in a good mood that deteriorates as the day progresses—which is consistent with the effects of sleep and circadian rhythm—and that seasonal change in baseline positive affect varies with change in daylength. People are happier on weekends, but the morning peak in positive affect is delayed by 2 hours, which suggests that people awaken later on weekends.},
}
@article{good_1953,
  author    = {Good, I. J.},
  title     = {The Population Frequencies of Species and the Estimation of Population Parameters},
  journal   = {Biometrika},
  year      = {1953},
  volume    = {40},
  number    = {3/4},
  pages     = {237--264},
  publisher = {Oxford University Press, Biometrika Trust},
  issn      = {0006-3444},
  doi       = {10.2307/2333344},
  url       = {https://www.jstor.org/stable/2333344},
  urldate   = {2024-04-08},
  abstract  = {A random sample is drawn from a population of animals of various species. (The theory may also be applied to studies of literary vocabulary, for example.) If a particular species is represented r times in the sample of size N, then r/N is not a good estimate of the population frequency, p, when r is small. Methods are given for estimating p, assuming virtually nothing about the underlying population. The estimates are expressed in terms of smoothed values of the numbers nr (r = 1, 2, 3...), where nr is the number of distinct species that are each represented r times in the sample. (nr may be described as `the frequency of the frequency r'.) Turing is acknowledged for the most interesting formula in this part of the work. An estimate of the proportion of the population represented by the species occurring in the sample is an immediate corollary. Estimates are made of measures of heterogeneity of the population, including Yule's `characteristic' and Shannon's `entropy'. Methods are then discussed that do depend on assumptions about the underlying population. It is here that most work has been done by other writers. It is pointed out that a hypothesis can give a good fit to the numbers nr but can give quite the wrong value for Yule's characteristic. An example of this is Fisher's fit to some data of Williams's on Macrolepidoptera.},
  file      = {JSTOR Full Text PDF:/Users/louisteitelbaum/Zotero/storage/E5Q73MJK/Good - 1953 - The Population Frequencies of Species and the Esti.pdf:application/pdf},
}
@article{good_2000,
  author    = {Good, I. J.},
  title     = {{Turing’s} anticipation of empirical {Bayes} in connection with the cryptanalysis of the naval {Enigma}},
  journal   = {Journal of Statistical Computation and Simulation},
  year      = {2000},
  month     = may,
  volume    = {66},
  number    = {2},
  pages     = {101--111},
  publisher = {Taylor \& Francis},
  issn      = {0094-9655},
  doi       = {10.1080/00949650008812016},
  url       = {https://doi.org/10.1080/00949650008812016},
  urldate   = {2024-04-09},
  keywords  = {bletchley park, coverage of a sample, cryptology, empirical bayes, enigma, language statistics, probabilities of unobserved events, species sampling, Turing, word frequencies},
  abstract  = {The Enigma was a cryptographic (enciphering) machine used by the German military during WWII. The German navy changed part of the Enigma keys every other day. One of the important cryptanalytic attacks against the naval usage was called Banburismus, a sequential Bayesian procedure (anticipating sequential analysis) which was used from the spring of 1941 until the middle of 1943. It was invented mainly by A. M. Turing and was perhaps the first important sequential Bayesian procedure. It is unnecessary to describe it here. Before Banburismus could be started on a given day it was necessary to identify which of nine ‘bigram’ (or ‘digraph’) tables was in use on that day. In Turing’s approach to this identification he had to estimate the probabilities of certain ‘trigraphs’. (These trigraphs were used, as described below, for determining the initial wheel settings of messages). For estimating the probabilities, Turing invented an important special case of the nonparametric (nonhypermetric) Empirical Bayes method independently of Herbert Robbins. The technique is the surprising form of Empirical Bayes in which a physical prior is assumed to exist but no approximate functional form is assumed for it.},
}
@book{gottschalk_etal_1969,
  author    = {Gottschalk, Louis A. and Gleser, Goldine C. and Levine, Maurice},
  title     = {The Measurement of Psychological States Through the Content Analysis of Verbal Behavior},
  publisher = {University of California Press},
  address   = {Berkeley, CA},
  year      = {1969},
  isbn      = {0-520-37676-5},
  note      = {Reprinted 2020},
}
@article{grand_etal_2022,
  author    = {Grand, Gabriel and Blank, Idan Asher and Pereira, Francisco and Fedorenko, Evelina},
  title     = {Semantic projection recovers rich human knowledge of multiple object features from word embeddings},
  journal   = {Nature Human Behaviour},
  year      = {2022},
  month     = jul,
  volume    = {6},
  number    = {7},
  pages     = {975--987},
  publisher = {Nature Publishing Group},
  issn      = {2397-3374},
  doi       = {10.1038/s41562-022-01316-8},
  url       = {https://www.nature.com/articles/s41562-022-01316-8},
  urldate   = {2024-04-07},
  copyright = {2022 The Author(s), under exclusive licence to Springer Nature Limited},
  language  = {en},
  keywords  = {Human behaviour, Language and linguistics, Psychology},
  abstract  = {How is knowledge about word meaning represented in the mental lexicon? Current computational models infer word meanings from lexical co-occurrence patterns. They learn to represent words as vectors in a multidimensional space, wherein words that are used in more similar linguistic contexts—that is, are more semantically related—are located closer together. However, whereas inter-word proximity captures only overall relatedness, human judgements are highly context dependent. For example, dolphins and alligators are similar in size but differ in dangerousness. Here, we use a domain-general method to extract context-dependent relationships from word embeddings: ‘semantic projection’ of word-vectors onto lines that represent features such as size (the line connecting the words ‘small’ and ‘big’) or danger (‘safe’ to ‘dangerous’), analogous to ‘mental scales’. This method recovers human judgements across various object categories and properties. Thus, the geometry of word embeddings explicitly represents a wealth of context-dependent world knowledge.},
}
@inproceedings{grave_etal_2018,
  author    = {Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
  title     = {Learning Word Vectors for 157 Languages},
  booktitle = {Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
  year      = {2018},
}
@misc{grootendorst_2022,
  author        = {Grootendorst, Maarten},
  title         = {{BERTopic}: Neural topic modeling with a class-based {TF-IDF} procedure},
  year          = {2022},
  eprint        = {2203.05794},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
@article{gunther_etal_2019,
  author   = {Günther, Fritz and Rinaldi, Luca and Marelli, Marco},
  title    = {Vector-Space Models of Semantic Representation From a Cognitive Perspective: A Discussion of Common Misconceptions},
  journal  = {Perspectives on Psychological Science},
  volume   = {14},
  number   = {6},
  pages    = {1006--1033},
  year     = {2019},
  doi      = {10.1177/1745691619861372},
  url      = {https://doi.org/10.1177/1745691619861372},
  note     = {PMID: 31505121},
  abstract = {Models that represent meaning as high-dimensional numerical vectors—such as latent semantic analysis (LSA), hyperspace analogue to language (HAL), bound encoding of the aggregate language environment (BEAGLE), topic models, global vectors (GloVe), and word2vec—have been introduced as extremely powerful machine-learning proxies for human semantic representations and have seen an explosive rise in popularity over the past 2 decades. However, despite their considerable advancements and spread in the cognitive sciences, one can observe problems associated with the adequate presentation and understanding of some of their features. Indeed, when these models are examined from a cognitive perspective, a number of unfounded arguments tend to appear in the psychological literature. In this article, we review the most common of these arguments and discuss (a) what exactly these models represent at the implementational level and their plausibility as a cognitive theory, (b) how they deal with various aspects of meaning such as polysemy or compositionality, and (c) how they relate to the debate on embodied and grounded cognition. We identify common misconceptions that arise as a result of incomplete descriptions, outdated arguments, and unclear distinctions between theory and implementation of the models. We clarify and amend these points to provide a theoretical basis for future research and discussions on vector models of semantic representation.},
}
@article{harris_1954,
  author    = {Harris, Zellig S.},
  title     = {Distributional {Structure}},
  journal   = {WORD},
  year      = {1954},
  month     = aug,
  volume    = {10},
  number    = {2-3},
  pages     = {146--162},
  publisher = {Routledge},
  issn      = {0043-7956},
  doi       = {10.1080/00437956.1954.11659520},
  url       = {https://doi.org/10.1080/00437956.1954.11659520},
  urldate   = {2024-05-01},
  file      = {Full Text PDF:/Users/louisteitelbaum/Zotero/storage/XGZVWYEZ/Harris - 1954 - Distributional Structure.pdf:application/pdf},
}
@article{hellman_2011,
  author    = {Hellman, Andrea B.},
  title     = {Vocabulary size and depth of word knowledge in adult-onset second language acquisition},
  journal   = {International Journal of Applied Linguistics},
  year      = {2011},
  volume    = {21},
  number    = {2},
  pages     = {162--182},
  publisher = {Blackwell Publishing Ltd},
  address   = {Oxford, UK},
  issn      = {0802-6106},
  copyright = {2010 Blackwell Publishing Ltd},
  language  = {eng},
  abstract  = {This study investigated whether adult‐onset second language (L2) learners achieve native level vocabulary after decades of immersion. Vocabulary tests were given to three groups of participants: highly successful adult‐onset learners of English, monolingual English speakers, and bilingual native speakers of English. Overall, the native speakers outperformed the non‐native speakers; however, the rate of native like achievement was remarkably high among the successful adult‐onset learners, which indicated that native level L2 vocabulary size and depth of word knowledge were attainable in adulthood. Factors that correlated with native level L2 vocabulary were: childhood caregivers' education, verbal ability and literacy in the native language, and interest in word learning and daily reading. The findings suggest that the lexicon may be the potentially most successful area of adult‐onset L2 learning.},
}
@article{holtgraves_2011,
  author   = {Holtgraves, Thomas},
  title    = {Text messaging, personality, and the social context},
  journal  = {Journal of Research in Personality},
  volume   = {45},
  number   = {1},
  pages    = {92--99},
  year     = {2011},
  issn     = {0092-6566},
  doi      = {10.1016/j.jrp.2010.11.015},
  url      = {https://www.sciencedirect.com/science/article/pii/S0092656610001698},
  keywords = {Texting, Language, Personality, Language use},
  abstract = {The purpose of this research was to undertake some analyses of how the language used in text messaging varies as a function of personality traits and the interpersonal context. After completing personality questionnaires, participants provided their most recent text messages and indicated their relationship with the message recipient on several dimensions. Correlations between Linguistic Inquiry and Word Count (LIWC) categories and personality traits and relationship status were examined. There were significant correlations between certain LIWC categories and extraversion (e.g., personal pronouns), neuroticism (e.g., negative emotion words) and agreeableness (e.g., positive emotion words), suggesting that personality traits are displayed in how one texts. One of the defining features of texting – linguistic alterations (e.g., abbreviations) – varied as a function of both personality traits and relationship status. Overall, the results provide a snapshot of what text messages look like, and how they reflect the texter’s personality and the interpersonal context.},
}
% Hovy and Spruit (2016): short paper in the proceedings of the 54th ACL annual meeting
% on the social impact of natural language processing.
@inproceedings{hovy_spruit_2016,
  author    = {Hovy, Dirk and Spruit, Shannon L.},
  editor    = {Erk, Katrin and Smith, Noah A.},
  title     = {The Social Impact of Natural Language Processing},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Berlin, Germany},
  year      = {2016},
  month     = aug,
  pages     = {591--598},
  doi       = {10.18653/v1/P16-2096},
  url       = {https://aclanthology.org/P16-2096},
}
% Ireland and Pennebaker (2010): journal article on language style matching (LSM)
% in essays, correspondence, and poetry. The journal name is stored in its full,
% properly capitalised form because styles do not recase the journal field.
@article{ireland_pennebaker_2010,
  author    = {Ireland, Molly E. and Pennebaker, James W.},
  title     = {Language Style Matching in Writing: Synchrony in Essays, Correspondence, and Poetry},
  journal   = {Journal of Personality and Social Psychology},
  volume    = {99},
  number    = {3},
  pages     = {549--571},
  year      = {2010},
  publisher = {American Psychological Association},
  address   = {Washington, DC},
  issn      = {0022-3514},
  language  = {eng},
  copyright = {2010 American Psychological Association},
  keywords  = {Social psychology ; Language ; Literature ; Human beings ; Letter writing ; Male ; Educational attainment ; Socioeconomic Factors ; Texas ; Writing ; Written communication ; Female ; Psychological aspects ; Language and languages ; Composition (Language arts) ; Social aspects ; Individual differences ; Linguistics ; Interpersonal relations ; Social interaction ; Correspondence ; Letters ; Personality ; College students ; Cognitive styles ; Essays ; Nonverbal communication ; Psychology ; Poetry},
  abstract  = {Each relationship has its own personality. Almost immediately after a social interaction begins, verbal and nonverbal behaviors become synchronized. Even in asocial contexts, individuals tend to produce utterances that match the grammatical structure of sentences they have recently heard or read. Three projects explore language style matching (LSM) in everyday writing tasks and professional writing. LSM is the relative use of 9 function word categories (e.g., articles, personal pronouns) between any 2 texts. In the first project, 2 samples totaling 1,744 college students answered 4 essay questions written in very different styles. Students automatically matched the language style of the target questions. Overall, the LSM metric was internally consistent and reliable across writing tasks. Women, participants of higher socioeconomic status, and students who earned higher test grades matched with targets more than others did. In the second project, 74 participants completed cliffhanger excerpts from popular fiction. Judges' ratings of excerpt-response similarity were related to content matching but not function word matching, as indexed by LSM. Further, participants were not able to intentionally increase style or content matching. In the final project, an archival study tracked the professional writing and personal correspondence of 3 pairs of famous writers across their relationships. Language matching in poetry and letters reflected fluctuations in the relationships of 3 couples: Sigmund Freud and Carl Jung, Elizabeth Barrett and Robert Browning, and Sylvia Plath and Ted Hughes. Implications for using LSM as an implicit marker of social engagement and influence are discussed.},
}
% Ji et al. (2021): arXiv preprint (cs.CL) introducing the MentalBERT language models.
% The model name is brace-protected so sentence-casing styles keep its capitalisation.
@misc{ji_etal_2021,
  author        = {Ji, Shaoxiong and Zhang, Tianlin and Ansari, Luna and Fu, Jie and Tiwari, Prayag and Cambria, Erik},
  title         = {{MentalBERT}: Publicly Available Pretrained Language Models for Mental Healthcare},
  year          = {2021},
  eprint        = {2110.15621},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Kauf et al. (2024): journal article on which stimulus properties drive the similarity
% between ANN language-model representations and fMRI responses in the language network.
% Only the acronyms in the title are brace-protected, so styles can still recase the rest.
% url and eprint are kept exactly as exported by the publisher (underscores escaped).
@article{kauf_etal_2024,
  author   = {Kauf, Carina and Tuckute, Greta and Levy, Roger and Andreas, Jacob and Fedorenko, Evelina},
  title    = {Lexical-Semantic Content, Not Syntactic Structure, Is the Main Contributor to {ANN}-Brain Similarity of {fMRI} Responses in the Language Network},
  journal  = {Neurobiology of Language},
  volume   = {5},
  number   = {1},
  pages    = {7--42},
  year     = {2024},
  month    = apr,
  issn     = {2641-4368},
  doi      = {10.1162/nol_a_00116},
  url      = {https://doi.org/10.1162/nol\_a\_00116},
  eprint   = {https://direct.mit.edu/nol/article-pdf/5/1/7/2361108/nol\_a\_00116.pdf},
  abstract = {Representations from artificial neural network (ANN) language models have been shown to predict human brain activity in the language network. To understand what aspects of linguistic stimuli contribute to ANN-to-brain similarity, we used an fMRI data set of responses to n = 627 naturalistic English sentences (Pereira et al., 2018) and systematically manipulated the stimuli for which ANN representations were extracted. In particular, we (i) perturbed sentences’ word order, (ii) removed different subsets of words, or (iii) replaced sentences with other sentences of varying semantic similarity. We found that the lexical-semantic content of the sentence (largely carried by content words) rather than the sentence’s syntactic form (conveyed via word order or function words) is primarily responsible for the ANN-to-brain similarity. In follow-up analyses, we found that perturbation manipulations that adversely affect brain predictivity also lead to more divergent representations in the ANN’s embedding space and decrease the ANN’s ability to predict upcoming tokens in those stimuli. Further, results are robust as to whether the mapping model is trained on intact or perturbed stimuli and whether the ANN sentence representations are conditioned on the same linguistic context that humans saw. The critical result—that lexical-semantic content is the main contributor to the similarity between ANN representations and neural ones—aligns with the idea that the goal of the human language system is to extract meaning from linguistic strings. Finally, this work highlights the strength of systematic experimental manipulations for evaluating how close we are to accurate and generalizable models of the human language network.},
}
% Kennedy et al. (2022): chapter in an edited handbook. The chapter has its own authors
% and title, distinct from the book's editors and title, so the entry is typed as
% incollection; both classic BibTeX and biblatex then print the booktitle and editors.
@incollection{kennedy_etal_2022,
  author    = {Kennedy, Brendan and Ashokkumar, Ashwini and Boyd, Ryan L. and Dehghani, Morteza},
  title     = {Text Analysis for Psychology: Methods, Principles, and Practices},
  editor    = {Dehghani, Morteza and Boyd, Ryan L.},
  booktitle = {Handbook of Language Analysis in Psychology},
  publisher = {Guilford Press},
  year      = {2022},
  month     = jan,
  day       = {7},
  pages     = {3--64},
  isbn      = {9781462548439},
  language  = {English},
}
% Kenny, Kashy, and Cook (2006): book on methods for analysing dyadic data.
% Title and series carry no trailing period (the style supplies punctuation); pages holds
% the front-matter and body page counts once; isbn holds the single ISBN-13.
% NOTE(review): the original series field also named David A. Kenny as series editor;
% that annotation was dropped here because it is not part of the series title.
@book{kenny_etal_2006,
  author    = {Kenny, David A. and Kashy, Deborah A. and Cook, William L.},
  title     = {Dyadic Data Analysis},
  series    = {Methodology in the Social Sciences},
  publisher = {Guilford Press},
  address   = {New York, NY, US},
  year      = {2006},
  pages     = {xix, 458},
  isbn      = {978-1-57230-986-9},
  keywords  = {*Analysis; *Data Collection; *Dyads; Interpersonal Interaction},
  abstract  = {Interpersonal phenomena such as attachment, conflict, person perception, learning, and influence have traditionally been studied by examining individuals in isolation, which falls short of capturing their truly interpersonal nature. This book offers state-of-the-art solutions to this age-old problem by presenting methodological and data-analytic approaches useful in investigating processes that take place among dyads: couples, coworkers, parent and child, teacher and student, or doctor and patient, to name just a few. Rich examples from psychology, sociology, family studies, and communication help build the researchers ability to conceptualize relationship processes; model and test for actor effects, partner effects, and relationship effects; and model and control for the statistical interdependence that can exist between partners. (PsycINFO Database Record (c) 2016 APA, all rights reserved)},
}
% Kessler (2017): system-demonstration paper describing the Scattertext tool.
% Typed as inproceedings because the venue is given in booktitle and there is no journal.
% The tool name and the ACL acronym are brace-protected against style recasing.
@inproceedings{kessler_2017,
  author    = {Kessler, Jason S.},
  title     = {{Scattertext}: a Browser-Based Tool for Visualizing how Corpora Differ},
  booktitle = {Proceedings of {ACL}-2017 System Demonstrations},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://github.com/JasonKessler/scattertext},
}
% Kjell, Giorgi, and Schwartz (2021): PsyArXiv preprint describing the text package for R.
% The url carries an explicit https scheme so it resolves as a link; month uses the
% predefined macro; the package name and the letter R are protected against recasing.
% NOTE(review): the third author's given name was expanded from the initials H. A.
% to H. Andrew; confirm against the preprint.
@misc{kjell_etal_2021,
  author    = {Kjell, Oscar and Giorgi, Salvatore and Schwartz, H. Andrew},
  title     = {The {text}-package: An {R}-package for Analyzing and Visualizing Human Language Using Natural Language Processing and Deep Learning},
  publisher = {PsyArXiv},
  year      = {2021},
  month     = apr,
  doi       = {10.31234/osf.io/293kt},
  url       = {https://osf.io/preprints/psyarxiv/293kt},
}
% Kjell et al. (2022): Scientific Reports article on predicting subjective well-being
% measures from natural language with transformer models. The pages field holds the
% value 3918 as given in the original entry; the AI acronym is brace-protected.
% NOTE(review): the last author's given name was expanded from the truncated initial H.
% to H. Andrew; confirm against the article.
@article{kjell_etal_2022,
  author  = {Kjell, Oscar and Sikström, Sverker and Kjell, Katarina and Schwartz, H. Andrew},
  title   = {Natural language analyzed with {AI}-based transformers predict traditional subjective well-being measures approaching the theoretical upper limits in accuracy},
  journal = {Scientific Reports},
  volume  = {12},
  pages   = {3918},
  year    = {2022},
  month   = mar,
  doi     = {10.1038/s41598-022-07520-w},
}
% Knief and Forstmeier (2021): Behavior Research Methods article on violating the
% normality assumption.
% NOTE(review): the original entry had an empty pages field and no issue number; the
% number and pages below were filled in from the published record and should be
% confirmed against the DOI landing page. The month is the one given in the original entry.
@article{knief_forstmeier_2021,
  author  = {Knief, Ulrich and Forstmeier, Wolfgang},
  title   = {Violating the normality assumption may be the lesser of two evils},
  journal = {Behavior Research Methods},
  volume  = {53},
  number  = {6},
  pages   = {2576--2590},
  year    = {2021},
  month   = may,
  doi     = {10.3758/s13428-021-01587-5},
}
% Kosinski, Stillwell, and Graepel (2013): PNAS article showing that personal attributes
% can be predicted from Facebook Likes. Authors are in the unambiguous "Last, First" form;
% url and eprint keep the publisher-exported links unchanged.
@article{kosinski_etal_2013,
  author   = {Kosinski, Michal and Stillwell, David and Graepel, Thore},
  title    = {Private traits and attributes are predictable from digital records of human behavior},
  journal  = {Proceedings of the National Academy of Sciences},
  volume   = {110},
  number   = {15},
  pages    = {5802--5805},
  year     = {2013},
  doi      = {10.1073/pnas.1218772110},
  url      = {https://www.pnas.org/doi/abs/10.1073/pnas.1218772110},
  eprint   = {https://www.pnas.org/doi/pdf/10.1073/pnas.1218772110},
  abstract = {We show that easily accessible digital records of behavior, Facebook Likes, can be used to automatically and accurately predict a range of highly sensitive personal attributes including: sexual orientation, ethnicity, religious and political views, personality traits, intelligence, happiness, use of addictive substances, parental separation, age, and gender. The analysis presented is based on a dataset of over 58,000 volunteers who provided their Facebook Likes, detailed demographic profiles, and the results of several psychometric tests. The proposed model uses dimensionality reduction for preprocessing the Likes data, which are then entered into logistic/linear regression to predict individual psychodemographic profiles from Likes. The model correctly discriminates between homosexual and heterosexual men in 88\% of cases, African Americans and Caucasian Americans in 95\% of cases, and between Democrat and Republican in 85\% of cases. For the personality trait “Openness,” prediction accuracy is close to the test–retest accuracy of a standard personality test. We give examples of associations between attributes and Likes and discuss implications for online personalization and privacy.},
}
@article{kozlowski_etal_2019,
author = {Austin C. Kozlowski and Matt Taddy and James A. Evans},
title ={The Geometry of Culture: Analyzing the Meanings of Class through Word Embeddings},
journal = {American Sociological Review},
volume = {84},
number = {5},
pages = {905-949},
year = {2019},
doi = {10.1177/0003122419877135},
URL = {https://doi.org/10.1177/0003122419877135},