-
Notifications
You must be signed in to change notification settings - Fork 0
/
Mémoire-TNAH-2022-Reignier.bib
888 lines (824 loc) · 84.3 KB
/
Mémoire-TNAH-2022-Reignier.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
@thesis{scheithauer_reconnaissanc_2021,
  title = {La reconnaissance d'entités nommées appliquée à des données issues de la transcription automatique de documents manuscrits patrimoniaux. Expérimentations et préconisations à partir du projet {LECTAUREP}},
  url = {https://raw.githubusercontent.com/HugoSchtr/memoire_TNAH_M2_HugoScheithauer/main/memoire_Hugo_Scheithauer_TNAH.pdf},
  institution = {Ecole nationale des chartes},
  type = {Mémoire de master "Technologies numériques appliquées à l'histoire"},
  author = {Scheithauer, Hugo},
  urldate = {2022-05-29},
  date = {2021},
}
@article{stutzmann_recherche_2017,
  author = {Stutzmann, Dominique and Moufflet, Jean-François and Hamel, Sébastien},
  title = {La recherche en plein texte dans les sources manuscrites médiévales : enjeux et perspectives du projet {HIMANIS} pour l’édition électronique},
  shorttitle = {La recherche en plein texte dans les sources manuscrites médiévales},
  abstract = {{HIMANIS} (Historical {MANuscript} Indexing for user-controlled Search) est un projet de recherche européen, associant, sous le pilotage de l’{IRHT} ({CNRS}, France), la société innovante A2iA (France), la Rijksuniversiteit Groningen (Pays-Bas) et l’Universitat Politècnica de València (Espagne). Il vise à l’indexation du texte des registres de la chancellerie royale française des années 1302-1483, conservés aux Archives nationales sous les cotes {JJ}35 à {JJ}211, à partir des images produites par leur numérisation. Les enjeux de recherche d’information (données massives et bruitées) permettent de conjoindre les enjeux technologiques (reconnaissance de l’écriture manuscrite) et historiques (analyses paléographiques et diplomatiques, recherche sur les institutions, le fonctionnement de la monarchie, la naissance de l’État-nation). La présente contribution propose un modèle d’accès à l’information dans un corpus de données massives, d’un point de vue tant ergonomique qu’herméneutique. À cette fin, après une présentation du corpus, des outils actuels pour accéder à l’information qu’ils contiennent et de leur formalisation en {TEI}, elle problématise l’édition électronique comme « vérité terrain » et « terrain d’apprentissage », en renversant l’approche classique de l’édition critique comme finalité. Enfin, elle décrit le modèle d’accès proposé, à la fois pour une approche par « indexation » (et non par transcription) et pour une granularité par acte.},
  journaltitle = {Médiévales. Langues, Textes, Histoire},
  volume = {73},
  number = {73},
  pages = {67--96},
  rights = {Tous droits réservés},
  issn = {0751-2708},
  doi = {10.4000/medievales.8198},
  url = {https://journals.openedition.org/medievales/8198},
  urldate = {2022-04-20},
  date = {2017-12-15},
  langid = {french},
  keywords = {Archives nationales, chancellerie royale, édition électronique, {HIMANIS}, humanités numériques, recherche en plein texte},
}
@thesis{janes_du_2021,
  location = {Paris},
  title = {Du catalogue papier au numérique},
  subtitle = {Une chaîne de traitement ouverte pour l’extraction d’informations issues de documents structurés},
  url = {https://raw.githubusercontent.com/Juliettejns/Memoire_TNAH/main/Jjanes_Memoire.pdf},
  institution = {Ecole nationale des chartes},
  type = {Mémoire de master "Technologies numériques appliquées à l'histoire"},
  author = {Janès, Juliette},
  urldate = {2022-05-29},
  date = {2021},
}
@thesis{ehrmann_les_2008,
  author = {Ehrmann, Maud},
  editora = {Victorri, Bernard},
  editoratype = {collaborator},
  title = {Les entités nommées, de la linguistique au {TAL} : statut théorique et méthodes de désambiguïsation},
  shorttitle = {Les entités nommées, de la linguistique au {TAL}},
  abstract = {Le traitement des entités nommées fait aujourd'hui figure d'incontournable en Traitement Automatique des Langues. Apparue au milieu des années 1990, la tâche de reconnaissance et de catégorisation des noms de personnes, de lieux, d'organisations, etc. Apparaît en effet comme fondamentale pour diverses applications participant de l'analyse de contenu et nombreux sont les travaux se consacrant à sa mise en oeuvre, obtenant des résultats plus qu'honorables. Fort de ce succès, le traitement des entités nommées s'oriente désormais vers de nouvelles perspectives, avec la désambiguïsation et une annotation enrichie de ces unités. Ces nouveaux défis rendent cependant d'autant plus cruciale la question du statut théorique des entités nommées, lequel n'a guère été discuté jusqu'à aujourd'hui. Deux axes de recherche ont été investis durant ce travail de thèse avec, d'une part, la proposition d'une définition des entités nommées et, d'autre part, des méthodes de désambiguïsation. A la suite d'un état des lieux de la tâche de reconnaissance de ces unités, il fut nécessaire d'examiner, d'un point de vue méthodologique, comment aborder la question de la définition les entités nommées. La démarche adoptée invita à se tourner du côté de la linguistique (noms propres et descriptions définies) puis du côté du traitement automatique, ce parcours visant au final à proposer une définition tenant compte tant des aspects du langage que des exigences des systèmes informatiques. La suite du mémoire rend compte d'un travail davantage expérimental, avec l'exposé d'une méthode d'annotation fine tout d'abord, de résolution de métonymie enfin.},
  institution = {Paris 7},
  type = {Thèse de doctorat},
  rights = {Licence Etalab},
  url = {https://hal.archives-ouvertes.fr/tel-01639190/document},
  urldate = {2022-04-14},
  date = {2008-01-01},
  keywords = {Traitement automatique du langage naturel, Catégorisation (linguistique), Linguistique, Métonymie, Référence (linguistique)},
}
@thesis{stern_identification_2013,
  author = {Stern, Rosa},
  title = {Identification automatique d'entités pour l'enrichissement de contenus textuels},
  abstract = {Cette thèse propose une méthode et un système d'identification d'entités (personnes, lieux, organisations) mentionnées au sein des contenus textuels produits par l'Agence France Presse dans la perspective de l'enrichissement automatique de ces contenus. Les différents domaines concernés par cette tâche ainsi que par l'objectif poursuivi par les acteurs de la publication numérique de contenus textuels sont abordés et mis en relation : Web Sémantique, Extraction d'Information et en particulier Reconnaissance d'Entités Nommées ({\textbackslash}ren), Annotation Sémantique, Liage d'Entités. À l'issue de cette étude, le besoin industriel formulé par l'Agence France Presse fait l'objet des spécifications utiles au développement d'une réponse reposant sur des outils de Traitement Automatique du Langage. L'approche adoptée pour l'identification des entités visées est ensuite décrite : nous proposons la conception d'un système prenant en charge l'étape de {\textbackslash}ren à l'aide de n'importe quel module existant, dont les résultats, éventuellement combinés à ceux d'autres modules, sont évalués par un module de Liage capable à la fois (i) d'aligner une mention donnée sur l'entité qu'elle dénote parmi un inventaire constitué au préalable, (ii) de repérer une dénotation ne présentant pas d'alignement dans cet inventaire et (iii) de remettre en cause la lecture dénotationnelle d'une mention (repérage des faux positifs). Le système {\textbackslash}nomos est développé à cette fin pour le traitement de données en français. Sa conception donne également lieu à la construction et à l'utilisation de ressources ancrées dans le réseau des {\textbackslash}ld ainsi que d'une base de connaissances riche sur les entités concernées.},
  institution = {Université Paris-Diderot - Paris {VII}},
  type = {Thèse de doctorat},
  url = {https://tel.archives-ouvertes.fr/tel-00939420},
  urldate = {2022-03-28},
  date = {2013},
  langid = {french},
  file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\GFFYDQED\\Stern - 2013 - Identification automatique d'entités pour l'enrich.pdf:application/pdf;Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\PRN375SP\\tel-00939420.html:text/html;Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\QMQI4VIM\\tel-00939420.html:text/html},
}
@article{holtz_les_2000,
  author = {Holtz, Louis},
  title = {Les premières années de l’Institut de recherche et d’histoire des textes},
  abstract = {The First Years of The Institute for the Research of Texts and their History The idea for the founding of the Institute for the Research of Texts and their History ({IRHT}) originated with the historian, Félix Grat, archivist paleographer and former member of the Ecole Francaise in Rome, later elected to the National Assembly. As early as 1937, two years before the creation of {CNRS}, he succeeded in convincing Jean Perrin, Nobel Prize winner in physics and then Under Secretary of State for Science and Research under Prime Minister Léon Blum, of the importance of a project whose goal was nothing less than to assure the conservation of the written memory of human thought. This resulted in the foundation of the first laboratory for research in a domain other than the exact sciences. To be sure, for this classical scholar, the first priority to be carried out was the transmission of those works that had first seen the light of day in manuscript form, in particular the great writers of Ancient Rome. Right from the beginning however F. Grat laid out an ambitious program with his plan for an Arabic section (a goal that he took particularly to heart for reasons that were as scientific as they were political), as well as Greek, French, Celtic sections and so on. Being himself particularly keen on all the progress accomplished in photography, F. Grat wanted a huge library that would collect photographs of all written manuscripts spread throughout the world in diverse languages in order to make them accessible to researchers and to facilitate research. With the help of Jeanne Vieillard, who was first place in the class of 1924 at the Ecole des chartes, Félix Grat opened the new institute located at first in the Bibliothèque Nationale. He sent out his assistants throughout Europe to photograph the manuscripts. However, war was threatening and once it broke out, the patriotic F. 
Grat enlisted in an auxiliary corps, while the {IRHT} withdrew to Laval He was one of the first officers to fail at the head of his troops at the very beginning of the German offensive. Jeanne Vieillard took over the direction of the {IRHT}. Specialized sections were founded one after another, going even beyond the limits of the program planned by Félix Grat. At the end of 1940, the {IRHT} relocated to the National Archives and, in 1960, was transferred to a building constructed by {CNRS} on the Quai Anatole-France in Paris. Then followed twenty years of accumulating first rate documentation on each author, each text, each manuscript and in all the disciplines bordering on textual history. This was an institute ahead of its time due to its organization, specialization, technical nature and feeling for multidisciplinary research. {IRHT} was admired by users from all countries. The staff, well supervised by the archivists paleographers, was constantly growing and so were the programs. Already collections were established that would consolidate the international status of this laboratory which under Jean Glénisson, successor to Jeanne Vielliard, was to enjoy renewed momentum.},
  journaltitle = {La revue pour l’histoire du {CNRS}},
  number = {2},
  rights = {All rights reserved},
  issn = {1298-9800},
  doi = {10.4000/histoire-cnrs.2742},
  url = {https://journals.openedition.org/histoire-cnrs/2742?&id=2742},
  urldate = {2022-07-16},
  date = {2000-05-05},
  langid = {french},
  note = {{ISBN}: 9782271057082
Number: 2
Publisher: {CNRS} Éditions},
  keywords = {{IRHT}},
  file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\ZX8NNULE\\Holtz - 2000 - Les premières années de l’Institut de recherche et.pdf:application/pdf},
}
@online{stutzmann_compte-rendu_nodate,
  author = {Stutzmann, Dominique},
  title = {Compte-rendu final du projet {ORIFLAMMS} / {ORIFLAMMS} Final report},
  titleaddon = {Écriture médiévale \& numérique},
  type = {Billet},
  abstract = {Lien vers le rapport complet / Link to the full report: D. Stutzmann, Projet {ANR}-12-{CORP}-0010 Oriflamms: Compte-rendu de fin de projet, octobre 2016, 31 p. [en ligne] http://oriflamms.hypotheses.org/files/2017/04/Oriflamms-Compte-rendu-final.pdf English version below L’écriture du Moyen Âge : un objet sous le regard croisé des Humanités et des Sciences de l’ingénieur Comprendre les écritures dans une approche … Continuer la lecture de Compte-rendu final du projet {ORIFLAMMS} / {ORIFLAMMS} Final report →},
  url = {https://oriflamms.hypotheses.org/1592},
  urldate = {2022-07-16},
  langid = {french},
  file = {Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\GIAE2CFD\\1592.html:text/html},
}
@thesis{potin_mise_2007,
  author = {Potin, Yann},
  title = {La mise en archives du Trésor des chartes ({XIIIe}-{XIXe} siècle)},
  institution = {Ecole nationale des chartes},
  type = {Positions de thèse pour le diplôme d'archiviste-paléographe},
  location = {Paris},
  url = {http://theses.enc.sorbonne.fr/2007/potin},
  urldate = {2022-07-16},
  date = {2007},
  file = {theses.enc.sorbonne.fr/2007/potin:C\:\\Users\\virgi\\Zotero\\storage\\JJGI3XSD\\potin.html:text/html},
}
@inproceedings{bluche_automatic_2016,
  location = {Santorini, France},
  title = {Automatic Handwritten Character Segmentation for Paleographical Character Shape Analysis},
  url = {https://www.researchgate.net/profile/Christopher-Kermorvant/publication/303950834_Automatic_Handwritten_Character_Segmentation_for_Paleographical_Character_Shape_Analysis/links/5a1bd9d54585155c26ae0850/Automatic-Handwritten-Character-Segmentation-for-Paleographical-Character-Shape-Analysis.pdf?origin=publication_detail},
  doi = {10.1109/DAS.2016.74},
  eventtitle = {2016 12th {IAPR} Workshop on Document Analysis Systems ({DAS})},
  abstract = {Written texts are both abstract and physical objects: ideas, signs and shapes, whose meanings, graphical systems and social connotations evolve through time. To study this dual nature of texts, paleographers need to analyse large scale corpora at the finest granularity, such as character shape. This goal can only be reached through an automatic segmentation process. In this paper, we present a method, based on Handwritten Text Recognition, to automatically align images of digitized manuscripts with texts from scholarly editions, at the levels of page, column, line, word, and character. It has been successfully
applied to two datasets of medieval manuscripts, which are now almost fully segmented at character level. The quality of the word and character segmentations are evaluated and further paleographical analysis are presented.},
  pages = {42--47},
  booktitle = {2016 12th {IAPR} Workshop on Document Analysis Systems ({DAS})},
  publisher = {{IEEE}},
  author = {Bluche, Théodore and Stutzmann, Dominique and Kermorvant, Christopher},
  urldate = {2022-07-16},
  date = {2016-04},
  keywords = {Text recognition, automatic text recognition, Character recognition, Error analysis, Hidden Markov models, Image segmentation, Paleography, Shape, Training, word and character segmentation},
  file = {HAL Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\GGBD8YH2\\hal-02425715.html:text/html},
}
@inproceedings{bluche_preparatory_2017,
  title = {Preparatory {KWS} Experiments for Large-Scale Indexing of a Vast Medieval Manuscript Collection in the {HIMANIS} Project},
  volume = {1},
  url = {http://www.jpuigcerver.net/pubs/bluche_icdar2017.pdf},
  doi = {10.1109/ICDAR.2017.59},
  abstract = {Making large-scale collections of digitized historical documents searchable is being earnestly demanded by many archives and libraries. Probabilistically indexing the text images of these collections by means of keyword spotting techniques is currently seen as perhaps the only feasible approach to meet this demand. A vast medieval manuscript collection, written in both Latin and French, called "Chancery", is currently being considered for indexing at large. In addition to its bilingual nature, one of the major difficulties of this collection is the very high rate of abbreviated words which, on the other hand, are completely expanded in the ground truth transcripts available. In preparation to undertake full indexing of Chancery, experiments have been carried out on a relatively small but fully representative subset of this collection. To this end, a keyword spotting approach has been adopted which computes word relevance probabilities using character lattices produced by a recurrent neural network and a N-gram character language model. Results confirm the viability of the chosen approach for the large-scale indexing aimed at and show the ability of the proposed modeling and training approaches to properly deal with the abbreviation difficulties mentioned.},
  eventtitle = {2017 14th {IAPR} International Conference on Document Analysis and Recognition ({ICDAR})},
  pages = {311--316},
  booktitle = {2017 14th {IAPR} International Conference on Document Analysis and Recognition ({ICDAR})},
  author = {Bluche, Théodore and Hamel, Sebastien and Kermorvant, Christopher and Puigcerver, Joan and Stutzmann, Dominique and Toselli, Alejandro H. and Vidal, Enrique},
  urldate = {2022-07-11},
  date = {2017-11},
  note = {{ISSN}: 2379-2140},
  keywords = {Hidden Markov models, character lattice, Decoding, Electronic mail, Indexing, indexing historical manuscript, keyword spotting, Lattices, Probabilistic logic, recurrent neural network, Recurrent neural networks},
  file = {IEEE Xplore Abstract Record:C\:\\Users\\virgi\\Zotero\\storage\\RQXXECTB\\8269990.html:text/html},
}
@inproceedings{torres_aguilar_named_2021,
  location = {Helsinki, Finland},
  title = {Named Entity Recognition for French medieval charters},
  url = {https://hal.archives-ouvertes.fr/hal-03503055},
  eventtitle = {Workshop on Natural Language Processing for Digital Humanities},
  abstract = {This paper presents the process of annotating and modelling a corpus to automatically detect named entities in medieval charters in French. It introduces a new annotated corpus and a new system which outperforms state-of-the art libraries. Charters are legal documents and among the most important historical sources for medieval studies as they reflect economic and social dynamics as well as the evolution of literacy and writing practices. Automatic detection of named entities greatly improves the access to these unstructured texts and facilitates historical research. The experiments described here are based on a corpus encompassing about 500k words (1200 charters) coming from three charter collections of
the 13th and 14th centuries. We annotated the corpus and then trained two state-of-the art {NLP} libraries for Named Entity Recognition (Spacy and Flair) and a custom neural model (Bi-{LSTM}-{CRF}). The evaluation shows that all three models achieve a high performance rate on the test set and a high generalization capacity against two external corpora unseen during training. This paper describes the corpus and the annotation model, and discusses the issues related to the linguistic processing of medieval French and formulaic discourse, so as to interpret the results within a larger historical perspective.},
  booktitle = {Workshop on Natural Language Processing for Digital Humanities},
  author = {Torres Aguilar, Sergio and Stutzmann, Dominique},
  urldate = {2022-07-17},
  date = {2021-12},
  keywords = {reconnaissance des entités nommées, ancien et moyen français, cultural heritage, named entity recognition, natural language processing, Old and Middle French, patrimoine culturel, traitement automatique du langage naturel},
  file = {HAL PDF Full Text:C\:\\Users\\virgi\\Zotero\\storage\\6CVSH42U\\Torres Aguilar et Stutzmann - 2021 - Named Entity Recognition for French medieval chart.pdf:application/pdf},
}
@inproceedings{boros_comparison_2020,
  author = {Boroş, Emanuela and Romero, Verónica and Maarand, Martin and Zenklová, Kateřina and Křečková, Jitka and Vidal, Enrique and Stutzmann, Dominique and Kermorvant, Christopher},
  title = {A comparison of sequential and combined approaches for named entity recognition in a corpus of handwritten medieval charters},
  abstract = {This paper introduces a new corpus of multilingual medieval handwritten charter images, annotated with full transcription and named entities. The corpus is used to compare two approaches for named entity recognition in historical document images in several languages: on the one hand, a sequential approach, more commonly used, that sequentially applies handwritten text recognition ({HTR}) and named entity recognition ({NER}), on the other hand, a combined approach that simultaneously transcribes the image text line and extracts the entities. Experiments conducted on the charter corpus in Latin, early new high German and old Czech for name, date and location recognition demonstrate a superior performance of the combined approach.},
  booktitle = {2020 17th International Conference on Frontiers in Handwriting Recognition ({ICFHR})},
  eventtitle = {2020 17th International Conference on Frontiers in Handwriting Recognition ({ICFHR})},
  pages = {79--84},
  doi = {10.1109/ICFHR2020.2020.00025},
  url = {https://teklia.com/publications/ICFHR2020_NER_Comparison_final_updated.pdf},
  date = {2020-09},
  keywords = {Text recognition, Adaptive optics, Handwriting recognition, Handwritten Text Recognition, historical document processing, Mathematical model, multilingualism, Named entity recognition, Neural networks, Optical character recognition software, Optical imaging},
  file = {IEEE Xplore Abstract Record:C\:\\Users\\virgi\\Zotero\\storage\\4CQINEHL\\9257761.html:text/html},
}
@article{chastang_named_nodate,
  title = {A Named Entity Recognition Model for Medieval Latin Charters},
  volume = {15},
  issn = {1938-4122},
  number = {4},
  journaltitle = {Digital Humanities Quarterly},
  shortjournal = {{DHQ}},
  author = {Chastang, Pierre and Aguilar, Sergio Torres and Tannier, Xavier},
  file = {DHQ\: Digital Humanities Quarterly\: A Named Entity Recognition Model for Medieval Latin Charters:C\:\\Users\\virgi\\Zotero\\storage\\J5ZXFLTV\\000574.html:text/html},
}
@misc{stutzmann_home-alcar_2021,
  title = {{HOME}-Alcar: Aligned and Annotated Cartularies},
  url = {https://zenodo.org/record/5600884},
  shorttitle = {{HOME}-Alcar},
  abstract = {The {HOME}-Alcar (Aligned and Annotated Cartularies) corpus was produced as part of the European research project {HOME} History of Medieval Europe (https://www.heritageresearch-hub.eu/project/home/), led under the coordination of Institut de Recherche et d'Histoire des Textes ({PI}: D. Stutzmann), with the Universitat Politecnica de Valencia ({PI}: E. Vidal), the National Archives of the Czech Republic in Prague ({PI}: J. Kreckova), and Teklia {SAS} ({PI}: C. Kermorvant) The {HOME}-Alcar (Aligned and Annotated Cartularies) corpus is a resource created to train Handwritten Text Recognition ({HTR}) and Named Entity Recognition ({NER}), and presents a collection of (i) digital images of 17 medieval manuscripts; (ii) scholarly editions thereof; (iii) coordinates linking images and text at line level; (iv) annotations of Named Entities (place and person names). The 17 medieval manuscripts in this corpus are cartularies, i.e. books copying charters and legal acts, produced between the 12th and 14th centuries.},
  publisher = {Zenodo},
  author = {Stutzmann, Dominique and Torres Aguilar, Sergio and Chaffenet, Paul},
  urldate = {2022-07-17},
  date = {2021-10-26},
  doi = {10.5281/zenodo.5600884},
  note = {Type: dataset},
  keywords = {named entity recognition, handwritten text recognition, Latin palaeography},
  file = {Zenodo Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\KA9LWSAW\\Stutzmann et al. - 2021 - HOME-Alcar Aligned and Annotated Cartularies.pdf:application/pdf},
}
@inproceedings{kolitsas_end--end_2018,
  author = {Kolitsas, Nikolaos and Ganea, Octavian-Eugen and Hofmann, Thomas},
  title = {End-to-End Neural Entity Linking},
  abstract = {Entity Linking ({EL}) is an essential task for semantic text understanding and information extraction. Popular methods separately address the Mention Detection ({MD}) and Entity Disambiguation ({ED}) stages of {EL}, without leveraging their mutual dependency. We here propose the first neural end-to-end {EL} system that jointly discovers and links entities in a text document. The main idea is to consider all possible spans as potential mentions and learn contextual similarity scores over their entity candidates that are useful for both {MD} and {ED} decisions. Key components are context-aware mention embeddings, entity embeddings and a probabilistic mention - entity map, without demanding other engineered features. Empirically, we show that our end-to-end method significantly outperforms popular systems on the Gerbil platform when enough training data is available. Conversely, if testing datasets follow different annotation conventions compared to the training set (e.g. queries/ tweets vs news documents), our {ED} model coupled with a traditional {NER} system offers the best or second best {EL} accuracy.},
  booktitle = {Proceedings of the 22nd Conference on Computational Natural Language Learning},
  eventtitle = {{CoNLL} 2018},
  pages = {519--529},
  publisher = {Association for Computational Linguistics},
  location = {Brussels, Belgium},
  doi = {10.18653/v1/K18-1050},
  url = {https://aclanthology.org/K18-1050},
  urldate = {2022-07-17},
  date = {2018-10},
  file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\PQ9K8FPQ\\Kolitsas et al. - 2018 - End-to-End Neural Entity Linking.pdf:application/pdf},
}
@book{poibeau_extraction_2003,
  author = {Poibeau, Thierry},
  title = {Extraction automatique d'information : Du texte brut au web sémantique},
  shorttitle = {Extraction automatique d'information},
  abstract = {Les entreprises et les particuliers sont confrontés à une masse d'information sans cesse croissante. Partant de ce constat, de nombreux systèmes ont été conçus pour filtrer, trier et catégoriser l'information. L'offre est en revanche beaucoup plus faible en ce qui concerne l'analyse du contenu. Extraction automatique d'information - du texte brut au web sémantique présente les progrès récents en extraction d'information et en compréhension de textes. Les recherches effectuées ces dernières années dans le domaine du traitement automatique des langues rendent en effet possible l'annotation sémantique de documents, l'extraction d'information pertinente et la création de bases de connaissances structurées à partir de textes en langage naturel. L'ouvrage rappelle les grands courants de recherche qui ont marqué le domaine de la compréhension automatique de textes par ordinateur. Il se poursuit par la présentation détaillée d'un système appelé {SEMTEX}, qui est appliqué à une grande variété de textes et de situations différentes. Les applications détaillées donnent des perspectives sur le web sémantique et l'ingénierie des connaissances.},
  publisher = {Lavoisier},
  pagetotal = {238},
  url = {https://hal.archives-ouvertes.fr/hal-00005506},
  urldate = {2022-07-18},
  date = {2003},
  keywords = {linguistique, extraction d'information, informatique, recherche d'information, traitement des langues},
}
@inproceedings{agirre_matching_2012,
  author = {Agirre, Eneko and Barrena, Ander and de Lacalle, Oier Lopez and Soroa, Aitor and Fernando, Samuel and Stevenson, Mark},
  title = {Matching Cultural Heritage items to Wikipedia},
  abstract = {Digitised Cultural Heritage ({CH}) items usually have short descriptions and lack rich contextual information. Wikipedia articles, on the contrary, include in-depth descriptions and links to related articles, which motivate the enrichment of {CH} items with information from Wikipedia. In this paper we explore the feasibility of finding matching articles in Wikipedia for a given Cultural Heritage item. We manually annotated a random sample of items from Europeana, and performed a qualitative and quantitative study of the issues and problems that arise, showing that each kind of {CH} item is different and needs a nuanced definition of what “matching article” means. In addition, we test a well-known wikification (aka entity linking) algorithm on the task. Our results indicate that a substantial number of items can be effectively linked to their corresponding Wikipedia article.},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)},
  eventtitle = {{LREC} 2012},
  pages = {1729--1735},
  publisher = {European Language Resources Association ({ELRA})},
  location = {Istanbul, Turkey},
  url = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/1021_Paper.pdf},
  urldate = {2022-07-21},
  date = {2012-05},
  file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\EHY7TKBN\\Agirre et al. - 2012 - Matching Cultural Heritage items to Wikipedia.pdf:application/pdf},
}
@inproceedings{munnelly_investigating_2018,
location = {New York, {NY}, {USA}},
title = {Investigating Entity Linking in Early English Legal Documents},
isbn = {978-1-4503-5178-2},
url = {https://dl.acm.org/doi/10.1145/3197026.3197055},
doi = {10.1145/3197026.3197055},
series = {{JCDL} '18},
abstract = {In this paper we investigate the accuracy and overall suitability of a variety of Entity Linking systems for the task of disambiguating entities in 17th century depositions obtained during the 1641 Irish Rebellion. The depositions are extremely difficult for modern {NLP} tools to work with due to inconsistent spelling, use of language and archaic references. In order to assess the severity of difficulty faced by Entity Linking systems when working with the depositions we use them to create an evaluation corpus. This corpus is used as an input to the General Entity Annotator Benchmarking Framework a standard benchmarking platform for entity annotation systems. Based on this corpus and the results obtained from General Entity Annotator Benchmarking Framework we observe that the accuracy of existing Entity Linking systems is lacking when applied to content like these depositions. This is due to a number of issues ranging from problems with existing state-of-the-art systems to poor representation of historic entities in modern knowledge bases. We discuss some interesting questions raised by this evaluation and put forward a plan for future work in order to learn more.},
pages = {59--68},
booktitle = {Proceedings of the 18th {ACM}/{IEEE} on Joint Conference on Digital Libraries},
publisher = {Association for Computing Machinery},
author = {Munnelly, Gary and Lawless, Seamus},
urldate = {2022-07-21},
date = {2018-05-23},
keywords = {cultural heritage, digital humanities, named entity disambiguation},
file = {Texte intégral:C\:\\Users\\virgi\\Zotero\\storage\\XEUC734I\\Munnelly et Lawless - 2018 - Investigating Entity Linking in Early English Lega.pdf:application/pdf},
}
@inproceedings{hosseini_deezymatch_2020,
title = {{DeezyMatch}: A Flexible Deep Learning Approach to Fuzzy String Matching},
shorttitle = {{DeezyMatch}},
doi = {10.18653/v1/2020.emnlp-demos.9},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
publisher = {Association for Computational Linguistics},
abstract = {We present {DeezyMatch}, a free, open-source software library written in Python for fuzzy string matching and candidate ranking. Its pair classifier supports various deep neural network architectures for training new classifiers and for fine-tuning a pretrained model, which paves the way for transfer learning in fuzzy string matching. This approach is especially useful where only limited training examples are available. The learned {DeezyMatch} models can be used to generate rich vector representations from string inputs. The candidate ranker component in {DeezyMatch} uses these vector representations to find, for a given query, the best matching candidates in a knowledge base. It uses an adaptive searching algorithm applicable to large knowledge bases and query sets. We describe {DeezyMatch}’s functionality, design and implementation, accompanied by a use case in toponym matching and candidate ranking in realistic noisy datasets.},
author = {Hosseini, Kasra and Nanni, Federico and Coll Ardanuy, Mariona},
date = {2020-10-01},
file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\YQ42J8QD\\Hosseini et al. - 2020 - DeezyMatch A Flexible Deep Learning Approach to F.pdf:application/pdf},
}
@article{soudani_adapting_nodate,
title = {Adapting a system for Named Entity Recognition and Linking for 19th century French Novels},
pagetotal = {2},
author = {Soudani, Aicha},
langid = {english},
internal-note = {{NOTE}(review): journal/venue and date are missing -- presumably a short workshop paper related to soudani\_adaptation\_2018; verify and complete},
file = {Soudani - Adapting a system for Named Entity Recognition and.pdf:C\:\\Users\\virgi\\Zotero\\storage\\TETYM8A8\\Soudani - Adapting a system for Named Entity Recognition and.pdf:application/pdf},
}
@article{frontini_annotation_2016,
title = {Annotation of Toponyms in {TEI} Digital Literary Editions and Linking to the Web of Data},
url = {https://hal.archives-ouvertes.fr/hal-01363709},
doi = {10.14195/2182-8830_4-2_3},
volume = {4},
number = {2},
journaltitle = {{MATLIT}: Materialities of Literature},
author = {Frontini, Francesca and Brando, Carmen and Riguet, Marine and Jacquot, Clémence and Jolivet, Vincent},
urldate = {2022-07-21},
date = {2016-07},
keywords = {digital literary studies, geographic databases, maps and visualizations, semantic web, toponyms},
file = {HAL PDF Full Text:C\:\\Users\\virgi\\Zotero\\storage\\5C8PTHNF\\Frontini et al. - 2016 - Annotation of Toponyms in TEI Digital Literary Edi.pdf:application/pdf},
}
@inproceedings{suarez_establishing_2020,
title = {Establishing a New State-of-the-Art for French Named Entity Recognition},
url = {https://hal.inria.fr/hal-02617950},
abstract = {The French {TreeBank} developed at the University Paris 7 is the main source of morphosyntactic and syntactic annotations for French. However, it does not include explicit information related to named entities, which are among the most useful information for several natural language processing tasks and applications. Moreover, no large-scale French corpus with named entity annotations contain referential information, which complement the type and the span of each mention with an indication of the entity it refers to. We have manually annotated the French {TreeBank} with such information, after an automatic pre-annotation step. We sketch the underlying annotation guidelines and we provide a few figures about the resulting annotations.},
eventtitle = {{LREC} 2020 - 12th Language Resources and Evaluation Conference},
booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference ({LREC} 2020)},
author = {Ortiz Suárez, Pedro Javier and Dupont, Yoann and Muller, Benjamin and Romary, Laurent and Sagot, Benoît},
urldate = {2022-07-21},
date = {2020-05-11},
langid = {english},
file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\YHPFNP3J\\Suárez et al. - 2020 - Establishing a New State-of-the-Art for French Nam.pdf:application/pdf},
}
@inproceedings{soudani_adaptation_2018,
author = {Soudani, Aicha and Meherzi, Yosra and Bouhafs, Asma and Frontini, Francesca and Brando, Carmen and Dupont, Yoann and Mélanie-Becquet, Frédérique},
title = {Adaptation et évaluation de systèmes de reconnaissance et de résolution des entités nommées pour le cas de textes littéraires français du 19ème siècle},
booktitle = {Atelier Humanités Numériques Spatialisées ({HumaNS}'2018)},
location = {Montpellier, France},
date = {2018-11},
url = {https://hal.archives-ouvertes.fr/hal-01925816},
urldate = {2022-07-21},
keywords = {reconnaissance des entités nommées, cartographie, cartography, entity recognition, named entity linking, résolution des entités nommées},
file = {HAL PDF Full Text:C\:\\Users\\virgi\\Zotero\\storage\\BE6XGQ32\\Soudani et al. - 2018 - Adaptation et évaluation de systèmes de reconnaiss.pdf:application/pdf},
}
@inproceedings{koudoro-parfait_reconnaissance_2022,
author = {Koudoro-Parfait, Caroline and Lejeune, Gaël and Buth, Richy},
title = {Reconnaissance d'entités nommées sur des sorties {OCR} bruitées: des pistes pour la désambiguïsation morphologique automatique},
booktitle = {Traitement Automatique des Langues Naturelles},
pages = {45--55},
date = {2022},
langid = {french},
abstract = {Resolution of entity linking issues on noisy {OCR} output : automatic disambiguation tracks.},
file = {Koudoro-Parfait et al. - Reconnaissance d'entités nommées sur des sorties O.pdf:C\:\\Users\\virgi\\Zotero\\storage\\KP9JLWF8\\Koudoro-Parfait et al. - Reconnaissance d'entités nommées sur des sorties O.pdf:application/pdf},
}
@inproceedings{koudoro-parfait_spatial_2021,
author = {Koudoro-Parfait, Caroline and Lejeune, Gaël and Roe, Glenn},
title = {Spatial Named Entity Recognition in Literary Texts: What is the Influence of {OCR} Noise?},
shorttitle = {Spatial Named Entity Recognition in Literary Texts},
booktitle = {Proceedings of the 5th {ACM} {SIGSPATIAL} International Workshop on Geospatial Humanities},
series = {{GeoHumanities}'21},
pages = {13--21},
publisher = {Association for Computing Machinery},
location = {New York, {NY}, {USA}},
isbn = {978-1-4503-9102-3},
doi = {10.1145/3486187.3490206},
url = {https://doi.org/10.1145/3486187.3490206},
urldate = {2022-07-21},
date = {2021-11-02},
keywords = {Named Entity Recognition, Digital Humanities, Evaluation, Natural language Processing, Noise, Optical Character Recognition, Users},
abstract = {Exploring text collections through named entities remains a very common need for scholars. Despite the recent advances of Named Entity Recognition ({NER}) systems, more efficient and easier to use, the task remains problematic when the data is not born digital and thus more prone to Optical Character Recognition errors. In this paper, we investigate the real influence of noise on the extraction of locations in a collection of ten books in French exhibiting different levels of difficulties for {NER} systems (digitization quality, complexity of layout, variation in language). We compare the results of various systems on the "clean" version of the documents and on different {OCRed} versions. We show that {NER} systems do not yield many more errors in noisy documents, most of the errors being already there on a reference version, and in some cases {NER} performs better on noisy versions. According to our results, the main problem is rare entities (especially hapax) which are more likely to disappear from of the output.},
}
@article{rijhwani_zero-shot_2019,
author = {Rijhwani, Shruti and Xie, Jiateng and Neubig, Graham and Carbonell, Jaime},
title = {Zero-Shot Neural Transfer for Cross-Lingual Entity Linking},
journaltitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
volume = {33},
number = {1},
pages = {6924--6931},
issn = {2374-3468},
doi = {10.1609/aaai.v33i01.33016924},
url = {https://ojs.aaai.org/index.php/AAAI/article/view/4670},
rights = {Copyright (c) 2019 Association for the Advancement of Artificial Intelligence},
urldate = {2022-07-23},
date = {2019-07-17},
langid = {english},
note = {Number: 01},
abstract = {Cross-lingual entity linking maps an entity mention in a source language to its corresponding entry in a structured knowledge base that is in a different (target) language. While previous work relies heavily on bilingual lexical resources to bridge the gap between the source and the target languages, these resources are scarce or unavailable for many low-resource languages. To address this problem, we investigate zero-shot cross-lingual entity linking, in which we assume no bilingual lexical resources are available in the source low-resource language. Specifically, we propose pivot-basedentity linking, which leverages information from a highresource “pivot” language to train character-level neural entity linking models that are transferred to the source lowresource language in a zero-shot manner. With experiments on 9 low-resource languages and transfer through a total of54 languages, we show that our proposed pivot-based framework improves entity linking accuracy 17\% (absolute) on average over the baseline systems, for the zero-shot scenario.1 Further, we also investigate the use of language-universal phonological representations which improves average accuracy (absolute) by 36\% when transferring between languages that use different scripts.},
file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\AFAJ37VP\\Rijhwani et al. - 2019 - Zero-Shot Neural Transfer for Cross-Lingual Entity.pdf:application/pdf},
}
@inproceedings{linhares_pontes_linking_2020,
location = {New York, {NY}, {USA}},
title = {Linking Named Entities across Languages using Multilingual Word Embeddings},
isbn = {978-1-4503-7585-6},
url = {https://doi.org/10.1145/3383583.3398597},
abstract = {Digital libraries are online collections of digital objects that can include text, images, audio, or videos in several languages. It has long been observed that named entities ({NEs}) are key to the access to digital library portals as they are contained in most user queries. However, {NEs} can have different spellings for each language which reduces the performance of user queries to retrieve documents across languages. Cross-lingual named entity linking ({XEL}) connects {NEs} from documents in a source language to external knowledge bases in another (target) language. The {XEL} task is especially challenging due to the diversity of {NEs} across languages and contexts. This paper describes an {XEL} system applied and evaluated with several languages pairs including English and various low-resourced languages of different linguistic families such as Croatian, Finnish, Estonian, and Slovenian. We tested this approach to analyze documents and {NEs} in low-resourced languages and link them to the English version of Wikipedia. We present the resulting study of this analysis and the challenges involved in the case of degraded documents from digital libraries. Further works will make an extensive analysis of the impact of our approach on the {XEL} task with {OCRed} documents.},
pages = {329--332},
booktitle = {Proceedings of the {ACM}/{IEEE} Joint Conference on Digital Libraries in 2020},
publisher = {Association for Computing Machinery},
author = {Linhares Pontes, Elvys and Moreno, Jose G. and Doucet, Antoine},
urldate = {2022-07-23},
date = {2020-08-01},
keywords = {cross-lingual named entity linking, digital library, indexing, multilingual word embeddings},
file = {Linhares Pontes et al. - 2020 - Linking Named Entities across Languages using Mult.pdf:C\:\\Users\\virgi\\Zotero\\storage\\K379U7SM\\Linhares Pontes et al. - 2020 - Linking Named Entities across Languages using Mult.pdf:application/pdf},
}
@inproceedings{zhou_towards_2019,
author = {Zhou, Shuyan and Rijhwani, Shruti and Neubig, Graham},
title = {Towards Zero-resource Cross-lingual Entity Linking},
booktitle = {Proceedings of the 2nd Workshop on Deep Learning Approaches for Low-Resource {NLP} ({DeepLo} 2019)},
publisher = {Association for Computational Linguistics},
location = {Hong Kong, China},
pages = {243--252},
doi = {10.18653/v1/D19-6127},
url = {https://aclanthology.org/D19-6127},
urldate = {2022-07-23},
date = {2019-11},
abstract = {Cross-lingual entity linking ({XEL}) grounds named entities in a source language to an English Knowledge Base ({KB}), such as Wikipedia. {XEL} is challenging for most languages because of limited availability of requisite resources. However, many works on {XEL} have been on simulated settings that actually use significant resources (e.g. source language Wikipedia, bilingual entity maps, multilingual embeddings) that are not available in truly low-resource languages. In this work, we first examine the effect of these resource assumptions and quantify how much the availability of these resource affects overall quality of existing {XEL} systems. We next propose three improvements to both entity candidate generation and disambiguation that make better use of the limited resources we do have in resource-scarce scenarios. With experiments on four extremely low-resource languages, we show that our model results in gains of 6-20\% end-to-end linking accuracy.},
file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\UU8A2UD4\\Zhou et al. - 2019 - Towards Zero-resource Cross-lingual Entity Linking.pdf:application/pdf},
}
@inproceedings{goos_disambiguating_2001,
location = {Berlin, Heidelberg},
title = {Disambiguating Geographic Names in a Historical Digital Library},
volume = {2163},
isbn = {978-3-540-42537-3 978-3-540-44796-2},
url = {https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.7.8475&rep=rep1&type=pdf},
abstract = {Geographic interfaces provide natural, scalable visualizations for many digital library collections, but the wide range of data in digital libraries presents some particular problems for identifying and disambiguating place names. We describe the toponym-disambiguation system in the Perseus digital library and evaluate its performance. Name categorization varies significantly among different types of documents, but toponym disambiguation performs at a high level of precision and recall with a gazetteer an order of magnitude larger than most other applications.},
pages = {127--136},
booktitle = {Research and Advanced Technology for Digital Libraries},
publisher = {Springer Berlin Heidelberg},
author = {Smith, David A. and Crane, Gregory},
editor = {Constantopoulos, Panos and Sølvberg, Ingeborg T.},
editorb = {Goos, Gerhard and Hartmanis, Juris and van Leeuwen, Jan},
editorbtype = {redactor},
urldate = {2022-07-23},
date = {2001},
langid = {english},
doi = {10.1007/3-540-44796-2_12},
note = {Series Title: Lecture Notes in Computer Science},
file = {Smith et Crane - 2001 - Disambiguating Geographic Names in a Historical Di.pdf:C\:\\Users\\virgi\\Zotero\\storage\\H9SQTEI3\\Smith et Crane - 2001 - Disambiguating Geographic Names in a Historical Di.pdf:application/pdf},
}
@article{ruiz_mapping_2019,
title = {Mapping the Bentham Corpus: Concept-based Navigation},
issuetitle = {Atelier Digit\_Hum},
url = {https://hal.archives-ouvertes.fr/hal-01915730},
doi = {10.46298/jdmdh.5044},
shorttitle = {Mapping the Bentham Corpus},
abstract = {British philosopher and reformer Jeremy Bentham (1748-1832) left over 60,000 folios of unpublished manuscripts. The Bentham Project, at University College London, is creating a {TEI} version of the manuscripts, via crowdsourced transcription verified by experts. We present here an interface to navigate these largely unedited manuscripts, and the language technologies the corpus was enriched with to facilitate navigation, i.e Entity Linking against the {DBpedia} knowledge base and keyphrase extraction. The challenges of tagging a historical domain-specific corpus with a contemporary knowledge base are discussed. The concepts extracted were used to create interactive co-occurrence networks, that serve as a map for the corpus and help navigate it, along with a search index. These corpus representations were integrated in a user interface. The interface was evaluated by domain experts with satisfactory results , e.g. they found the distributional semantics methods exploited here applicable in order to assist in retrieving related passages for scholarly editing of the corpus.},
journaltitle = {Journal of Data Mining and Digital Humanities},
author = {Ruiz, Pablo and Poibeau, Thierry},
urldate = {2022-07-23},
date = {2019-03},
note = {Publisher: Episciences.org},
keywords = {entity linking, corpus navigation, Jeremy Bentham, keyphrase extraction, manuscripts},
file = {HAL PDF Full Text:C\:\\Users\\virgi\\Zotero\\storage\\QMEWYMJJ\\Ruiz et Poibeau - 2019 - Mapping the Bentham Corpus Concept-based Navigati.pdf:application/pdf},
}
@inproceedings{linhares_pontes_impact_2019,
author = {Linhares Pontes, Elvys and Hamdi, Ahmed and Sidère, Nicolas and Doucet, Antoine},
title = {Impact of {OCR} Quality on Named Entity Linking},
booktitle = {International Conference on Asia-Pacific Digital Libraries 2019},
location = {Kuala Lumpur, Malaysia},
doi = {10.1007/978-3-030-34058-2_11},
url = {https://hal.archives-ouvertes.fr/hal-02557116},
urldate = {2022-07-23},
date = {2019-11},
abstract = {Digital libraries are online collections of digital objects that can include text, images, audio, or videos. It has long been observed that named entities ({NEs}) are key to the access to digital library portals as they are contained in most user queries. Combined or subsequent to the recognition of {NEs}, named entity linking ({NEL}) connects {NEs} to external knowledge bases. This allows to differentiate ambiguous geographical locations or names (John Smith), and implies that the descriptions from the knowledge bases can be used for semantic enrichment. However, the {NEL} task is especially challenging for large quantities of documents as the diversity of {NEs} is increasing with the size of the collections. Additionally digitized documents are indexed through their {OCRed} version which may contains numerous {OCR} errors. This paper aims to evaluate the performance of named entity linking over digitized documents with different levels of {OCR} quality. It is the first investigation that we know of to analyze and correlate the impact of document degradation on the performance of {NEL}. We tested state-of-the-art {NEL} techniques over several evaluation benchmarks, and experimented with various types of {OCR} noise. We present the resulting study and subsequent recommendations on the adequate documents and {OCR} quality levels required to perform reliable named entity linking.},
file = {HAL PDF Full Text:C\:\\Users\\virgi\\Zotero\\storage\\4QZWXB5I\\Linhares Pontes et al. - 2019 - Impact of OCR Quality on Named Entity Linking.pdf:application/pdf},
}
@inproceedings{haslhofer_augmenting_2010,
title = {Augmenting Europeana content with linked data resources},
doi = {10.1145/1839707.1839757},
abstract = {Annotations allow end users to augment digital items with information, which can then be exploited for search and retrieval. We are currently extending Europeana, a platform which links to millions of digital items in European institutions, with an annotation mechanism that exposes annotations as linked data and enriches newly created annotations with links to contextually relevant resources on the Web. In two demos we showcase how we integrated that kind of content augmentation into two clients that allow users to annotate videos and historic maps.},
author = {Haslhofer, Bernhard and Momeni, Elaheh and Gay, Manuel and Simon, Rainer},
date = {2010-01-01},
internal-note = {{NOTE}(review): required booktitle and publisher are missing for this @inproceedings; the {ACM} {DOI} points to a 2010 proceedings volume -- resolve it and complete. The date 2010-01-01 looks like an export artifact; confirm the actual publication date},
file = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\7STVV4XL\\Haslhofer et al. - 2010 - Augmenting Europeana content with linked data reso.pdf:application/pdf},
}
@inproceedings{boeglin_pour_2016,
author = {Boeglin, Noémie and Depeyre, Michel and Joliveau, Thierry and Le Lay, Yves-François},
title = {Pour une cartographie romanesque de Paris au {XIXe} siècle. Proposition méthodologique},
booktitle = {Conférence Spatial Analysis and {GEOmatics}},
series = {Actes de la conférence {SAGEO}'2016 - Spatial Analysis and {GEOmatics}},
location = {Nice, France},
date = {2016-12},
url = {https://hal.archives-ouvertes.fr/hal-01619600},
urldate = {2022-07-23},
keywords = {Analyse textuelle assistée par ordinateur, Cartographie, Géomatique, Histoire urbaine, Littérature, Paris, Recherche Information Géographique, {SIG}, {XIXe} Siècle},
file = {HAL PDF Full Text:C\:\\Users\\virgi\\Zotero\\storage\\IBNUGBJA\\Boeglin et al. - 2016 - Pour une cartographie romanesque de Paris au XIXe .pdf:application/pdf},
}
@inproceedings{mika_agdistis_2014,
location = {Cham},
title = {{AGDISTIS} - Graph-Based Disambiguation of Named Entities Using Linked Data},
volume = {8796},
isbn = {978-3-319-11963-2 978-3-319-11964-9},
url = {http://link.springer.com/10.1007/978-3-319-11964-9_29},
abstract = {Over the last decades, several billion Web pages have been made available on the Web. The ongoing transition from the current Web of unstructured data to the Web of Data yet requires scalable and accurate approaches for the extraction of structured data in {RDF} (Resource Description Framework) from these websites. One of the key steps towards extracting {RDF} from text is the disambiguation of named entities. While several approaches aim to tackle this problem, they still achieve poor accuracy. We address this drawback by presenting {AGDISTIS}, a novel knowledge-base-agnostic approach for named entity disambiguation. Our approach combines the Hypertext-Induced Topic Search ({HITS}) algorithm with label expansion strategies and string similarity measures. Based on this combination, {AGDISTIS} can efficiently detect the correct {URIs} for a given set of named entities within an input text. We evaluate our approach on eight different datasets against state-of-theart named entity disambiguation frameworks. Our results indicate that we outperform the state-of-the-art approach by up to 29\% F-measure.},
pages = {457--471},
booktitle = {The Semantic Web – {ISWC} 2014},
publisher = {Springer International Publishing},
author = {Usbeck, Ricardo and Ngonga Ngomo, Axel-Cyrille and Röder, Michael and Gerber, Daniel and Coelho, Sandro Athaide and Auer, Sören and Both, Andreas},
editor = {Mika, Peter and Tudorache, Tania and Bernstein, Abraham and Welty, Chris and Knoblock, Craig and Vrandečić, Denny and Groth, Paul and Noy, Natasha and Janowicz, Krzysztof and Goble, Carole},
urldate = {2022-07-24},
date = {2014},
langid = {english},
doi = {10.1007/978-3-319-11964-9_29},
note = {Series Title: Lecture Notes in Computer Science},
file = {Usbeck et al. - 2014 - AGDISTIS - Graph-Based Disambiguation of Named Ent.pdf:C\:\\Users\\virgi\\Zotero\\storage\\N274W7GZ\\Usbeck et al. - 2014 - AGDISTIS - Graph-Based Disambiguation of Named Ent.pdf:application/pdf},
}
@inproceedings{heino_named_2017,
author = {Heino, Erkki and Tamper, Minna and Mäkelä, Eetu and Leskinen, Petri and Ikkala, Esko and Tuominen, Jouni and Koho, Mikko and Hyvönen, Eero},
editor = {Gracia, Jorge and Bond, Francis and {McCrae}, John P. and Buitelaar, Paul and Chiarcos, Christian and Hellmann, Sebastian},
title = {Named Entity Linking in a Complex Domain: Case Second World War History},
shorttitle = {Named Entity Linking in a Complex Domain},
booktitle = {Language, Data, and Knowledge},
series = {Lecture Notes in Computer Science},
pages = {120--133},
publisher = {Springer International Publishing},
location = {Cham},
isbn = {978-3-319-59888-8},
doi = {10.1007/978-3-319-59888-8_10},
url = {https://helda.helsinki.fi/bitstream/handle/10138/310657/heino_et_al_nel_2017.pdf?sequence=1},
date = {2017},
langid = {english},
keywords = {Optical Character Recognition, Magazine Article, Military Unit, Name Entity Recognition, National Archive},
abstract = {This paper discusses the challenges of applying named entity linking in a rich, complex domain – specifically, the linking of (1) military units, (2) places and (3) people in the context of interlinked Second World War data. Multiple sub-scenarios are discussed in detail through concrete evaluations, analyzing the problems faced, and the solutions developed. A key contribution of this work is to highlight the heterogeneity of problems and approaches needed even inside a single domain, depending on both the source data as well as the target authority.},
file = {Version acceptée:C\:\\Users\\virgi\\Zotero\\storage\\QZZ73GXZ\\Heino et al. - 2017 - Named Entity Linking in a Complex Domain Case Sec.pdf:application/pdf},
}
@inproceedings{mendes_dbpedia_2011,
author = {Mendes, Pablo N. and Jakob, Max and García-Silva, Andrés and Bizer, Christian},
title = {{DBpedia} spotlight: shedding light on the web of documents},
shorttitle = {{DBpedia} spotlight},
booktitle = {Proceedings of the 7th International Conference on Semantic Systems},
series = {I-Semantics '11},
pages = {1--8},
publisher = {Association for Computing Machinery},
location = {New York, {NY}, {USA}},
isbn = {978-1-4503-0621-8},
doi = {10.1145/2063518.2063519},
url = {https://www.dbpedia-spotlight.org/docs/spotlight.pdf},
urldate = {2022-07-24},
date = {2011-09-07},
keywords = {named entity disambiguation, {DBpedia}, linked data, text annotation},
abstract = {Interlinking text documents with Linked Open Data enables the Web of Data to be used as background knowledge within document-oriented applications such as search and faceted browsing. As a step towards interconnecting the Web of Documents with the Web of Data, we developed {DBpedia} Spotlight, a system for automatically annotating text documents with {DBpedia} {URIs}. {DBpedia} Spotlight allows users to configure the annotations to their specific needs through the {DBpedia} Ontology and quality measures such as prominence, topical pertinence, contextual ambiguity and disambiguation confidence. We compare our approach with the state of the art in disambiguation, and evaluate our results in light of three baselines and six publicly available annotation systems, demonstrating the competitiveness of our system. {DBpedia} Spotlight is shared as open source and deployed as a Web Service freely available for public use.},
}
@article{abadie_evaluation_2017,
author = {Abadie, Nathalie and Escobar, Carmen Brando and Frontini, Francesca},
title = {Evaluation de la qualité des sources du Web de Données pour la résolution d’entités nommées},
journaltitle = {Revue des Sciences et Technologies de l'Information - Série {ISI} : Ingénierie des Systèmes d'Information},
volume = {21},
number = {5},
url = {https://iieta.org/download/file/fid/27476},
urldate = {2022-07-27},
date = {2017-02-08},
langid = {french},
file = {Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\UMEQ2KBM\\hal-01462256.html:text/html},
}
@inproceedings{frontini_domain-adapted_2015,
title = {Domain-adapted named-entity linker using Linked Data},
url = {https://hal.archives-ouvertes.fr/hal-01203356},
abstract = {We present {REDEN}, a tool for graph-based Named Entity Linking that allows for the disambiguation of entities using domainspecific Linked Data sources and different configurations (e.g. context size). It takes {TEI}-annotated texts as input and outputs them enriched with external references ({URIs}). The possibility of customizing indexes built from various knowledge sources by defining temporal and spatial extents makes {REDEN} particularly suited to handle domain-specific corpora such as enriched digital editions in the Digital Humanities.},
eventtitle = {Workshop on {NLP} Applications: Completing the Puzzle co-located with the 20th International Conference on Applications of Natural Language to Information Systems ({NLDB} 2015)},
author = {Frontini, Francesca and Brando, Carmen and Ganascia, Jean-Gabriel},
urldate = {2022-07-27},
date = {2015-06-17},
langid = {english},
internal-note = {{NOTE}(review): required booktitle (and publisher) missing for this @inproceedings; only eventtitle is present -- locate the workshop proceedings and complete},
file = {Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\REVZ2HQR\\hal-01203356.html:text/html},
}
@article{van_hooland_exploring_2015,
author = {van Hooland, Seth and De Wilde, Max and Verborgh, Ruben and Steiner, Thomas and Van de Walle, Rik},
title = {Exploring entity recognition and disambiguation for cultural heritage collections},
journaltitle = {Digital Scholarship in the Humanities},
shortjournal = {Digital Scholarship in the Humanities},
volume = {30},
number = {2},
pages = {262--279},
issn = {2055-7671},
doi = {10.1093/llc/fqt067},
url = {https://doi.org/10.1093/llc/fqt067},
urldate = {2022-07-24},
date = {2015-06-01},
abstract = {Unstructured metadata fields such as ‘description’ offer tremendous value for users to understand cultural heritage objects. However, this type of narrative information is of little direct use within a machine-readable context due to its unstructured nature. This article explores the possibilities and limitations of named-entity recognition ({NER}) and term extraction ({TE}) to mine such unstructured metadata for meaningful concepts. These concepts can be used to leverage otherwise limited searching and browsing operations, but they can also play an important role to foster Digital Humanities research. To catalyze experimentation with {NER} and {TE}, the article proposes an evaluation of the performance of three third-party entity extraction services through a comprehensive case study, based on the descriptive fields of the Smithsonian Cooper–Hewitt National Design Museum in New York. To cover both {NER} and {TE}, we first offer a quantitative analysis of named entities retrieved by the services in terms of precision and recall compared with a manually annotated gold-standard corpus, and then complement this approach with a more qualitative assessment of relevant terms extracted. Based on the outcomes of this double analysis, the conclusions present the added value of entity extraction services, but also indicate the dangers of uncritically using {NER} and/or {TE}, and by extension Linked Data principles, within the Digital Humanities. All metadata and tools used within the article are freely available, making it possible for researchers and practitioners to repeat the methodology. By doing so, the article offers a significant contribution towards understanding the value of entity recognition and disambiguation for the Digital Humanities.},
file = {Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\PFJGFUUT\\391093.html:text/html;Version soumise:C\:\\Users\\virgi\\Zotero\\storage\\SRXNGMAG\\van Hooland et al. - 2015 - Exploring entity recognition and disambiguation fo.pdf:application/pdf},
}
@inproceedings{huet_mining_2013,
  author    = {Huet, Thomas and Biega, Joanna and Suchanek, Fabian M.},
  title     = {Mining history with Le Monde},
  booktitle = {Proceedings of the 2013 workshop on Automated knowledge base construction},
  series    = {{AKBC} '13},
  pages     = {49--54},
  location  = {New York, {NY}, {USA}},
  publisher = {Association for Computing Machinery},
  date      = {2013-10-27},
  isbn      = {978-1-4503-2411-3},
  url       = {https://asiabiega.github.io/papers/lemonde_akbc2013.pdf},
  doi       = {10.1145/2509558.2509567},
  urldate   = {2022-07-29},
  abstract  = {The last decade has seen the rise of large knowledge bases, such as {YAGO}, {DBpedia}, Freebase, or {NELL}. In this paper, we show how this structured knowledge can help understand and mine trends in unstructured data. By combining {YAGO} with the archive of the French newspaper Le Monde, we can conduct analyses that would not be possible with word frequency statistics alone. We find indications about the increasing role that women play in politics, about the impact that the city of birth can have on a person's career, or about the average age of famous people in different professions.},
  keywords  = {culturomics, knowledge base, le monde, yago},
}
@inproceedings{linhares_pontes_entity_2020,
  author     = {Linhares Pontes, Elvys and Cabrera-Diego, Luis Adrián and Moreno, José G. and Boros, Emanuela and Hamdi, Ahmed and Sidère, Nicolas and Coustaty, Mickaël and Doucet, Antoine},
  title      = {Entity Linking for Historical Documents: Challenges and Solutions},
  shorttitle = {Entity Linking for Historical Documents},
  booktitle  = {22nd International Conference on Asia-Pacific Digital Libraries, {ICADL} 2020},
  series     = {Lecture Notes in Computer Science},
  volume     = {12504},
  pages      = {215--231},
  publisher  = {Springer},
  date       = {2020},
  url        = {https://hal.archives-ouvertes.fr/hal-03034492},
  doi        = {10.1007/978-3-030-64452-9_19},
  urldate    = {2022-04-21},
  abstract   = {Named entities ({NEs}) are among the most relevant type of information that can be used to efficiently index and retrieve digital documents. Furthermore, the use of Entity Linking ({EL}) to disambiguate and relate {NEs} to knowledge bases, provides supplementary information which can be useful to differentiate ambiguous elements such as geographical locations and peoples' names. In historical documents, the detection and disambiguation of {NEs} is a challenge. Most historical documents are converted into plain text using an optical character recognition ({OCR}) system at the expense of some noise. Documents in digital libraries will, therefore, be indexed with errors that may hinder their accessibility. {OCR} errors affect not only document indexing but the detection, disambiguation, and linking of {NEs}. This paper aims at analysing the performance of different {EL} approaches on two multilingual historical corpora, {CLEF} {HIPE} 2020 (English, French, German) and {NewsEye} (Finnish, French, German, Swedish), while proposes several techniques for alleviating the impact of historical data problems on the {EL} task. Our findings indicate that the proposed approaches not only outperform the baseline in both corpora but additionally they considerably reduce the impact of historical document issues on different subjects and languages.},
  keywords   = {Deep learning, Digital libraries, Entity linking, Historical data},
  file       = {HAL PDF Full Text:C\:\\Users\\virgi\\Zotero\\storage\\WWGIKX63\\Pontes et al. - 2020 - Entity Linking for Historical Documents Challenge.pdf:application/pdf},
}
@incollection{morzy_disambiguation_2015,
  author    = {Brando, Carmen and Frontini, Francesca and Ganascia, Jean-Gabriel},
  editor    = {Morzy, Tadeusz and Valduriez, Patrick and Bellatreche, Ladjel},
  title     = {Disambiguation of Named Entities in Cultural Heritage Texts Using Linked Data Sets},
  booktitle = {New Trends in Databases and Information Systems},
  series    = {Communications in Computer and Information Science},
  volume    = {539},
  pages     = {505--514},
  location  = {Cham},
  publisher = {Springer International Publishing},
  date      = {2015},
  isbn      = {978-3-319-23200-3 978-3-319-23201-0},
  url       = {http://link.springer.com/10.1007/978-3-319-23201-0_51},
  doi       = {10.1007/978-3-319-23201-0_51},
  urldate   = {2022-08-01},
  langid    = {english},
  abstract  = {This paper proposes a graph-based algorithm baptized {REDEN} for the disambiguation of authors’ names in French literary criticism texts and scientific essays from the 19th century. It leverages knowledge from different Linked Data sources in order to select candidates for each author mention, then performs fusion of {DBpedia} and {BnF} individuals into a single graph, and finally decides the best referent using the notion of graph centrality. Some experiments are conducted in order to identify the best size of disambiguation context and to assess the influence on centrality of specific relations represented as edges. This work will help scholars to trace the impact of authors’ ideas across different works and time periods.},
  file      = {Brando et al. - 2015 - Disambiguation of Named Entities in Cultural Herit.pdf:C\:\\Users\\virgi\\Zotero\\storage\\MYTAPH4H\\Brando et al. - 2015 - Disambiguation of Named Entities in Cultural Herit.pdf:application/pdf},
}
@article{santos_toponym_2018,
  author       = {Santos, Rui and Murrieta-Flores, Patricia and Calado, Pável and Martins, Bruno},
  title        = {Toponym matching through deep neural networks},
  journaltitle = {International Journal of Geographical Information Science},
  volume       = {32},
  number       = {2},
  pages        = {324--348},
  publisher    = {Taylor \& Francis},
  date         = {2018-02-01},
  issn         = {1365-8816},
  url          = {https://doi.org/10.1080/13658816.2017.1390119},
  doi          = {10.1080/13658816.2017.1390119},
  urldate      = {2022-07-26},
  abstract     = {Toponym matching, i.e. pairing strings that represent the same real-world location, is a fundamental problemfor several practical applications. The current state-of-the-art relies on string similarity metrics, either specifically developed for matching place names or integrated within methods that combine multiple metrics. However, these methods all rely on common sub-strings in order to establish similarity, and they do not effectively capture the character replacements involved in toponym changes due to transliterations or to changes in language and culture over time. In this article, we present a novel matching approach, leveraging a deep neural network to classify pairs of toponyms as either matching or nonmatching. The proposed network architecture uses recurrent nodes to build representations from the sequences of bytes that correspond to the strings that are to be matched. These representations are then combined and passed to feed-forward nodes, finally leading to a classification decision. We present the results of a wide-ranging evaluation on the performance of the proposed method, using a large dataset collected from the {GeoNames} gazetteer. These results show that the proposed method can significantly outperform individual similarity metrics from previous studies, as well as previous methods based on supervised machine learning for combining multiple metrics.},
  keywords     = {approximate string matching, deep neural networks, duplicate detection, geographic information retrieval, recurrent neural networks, Toponym matching},
  file         = {Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\SXK6KTCX\\13658816.2017.html:text/html;Version acceptée:C\:\\Users\\virgi\\Zotero\\storage\\AX69RARS\\Santos et al. - 2018 - Toponym matching through deep neural networks.pdf:application/pdf},
}
@online{alrahabi_tanagra_2022,
  author      = {Alrahabi, Motasem},
  editora     = {Allaire, Angélique and {OSM}},
  editoratype = {collaborator},
  title       = {Tanagra Mapping Tool},
  date        = {2022},
  url         = {https://obtic.sorbonne-universite.fr/tanagra/map},
  urldate     = {2022-08-01},
  file        = {Location:C\:\\Users\\virgi\\Zotero\\storage\\CIRRRKGP\\map.html:text/html},
}
@article{brando_reden_2016,
  author       = {Brando, Carmen and Frontini, Francesca and Ganascia, Jean-Gabriel},
  title        = {{REDEN}: Named Entity Linking in Digital Literary Editions Using Linked Data Sets},
  shorttitle   = {{REDEN}},
  journaltitle = {Complex Systems Informatics and Modeling Quarterly},
  number       = {7},
  pages        = {60},
  date         = {2016-07-29},
  url          = {https://hal.sorbonne-universite.fr/hal-01396037},
  doi          = {10.7250/csimq.2016-7.04},
  urldate      = {2022-07-23},
  langid       = {english},
  abstract     = {This paper proposes a graph-based Named Entity Linking ({NEL}) algorithm named {REDEN} for the disambiguation of authors' names in French literary criticism texts and scientific essays from the 19th and early 20th centuries. The algorithm is described and evaluated according to the two phases of {NEL} as reported in current state of the art, namely, candidate retrieval and candidate selection. {REDEN} leverages knowledge from different Linked Data sources in order to select candidates for each author mention, subsequently crawls data from other Linked Data sets using equivalence links (e.g., owl:{sameAs}), and, finally, fuses graphs of homologous individuals into a non-redundant graph well-suited for graph centrality calculation; the resulting graph is used for choosing the best referent. The {REDEN} algorithm is distributed in open-source and follows current standards in digital editions ({TEI}) and semantic Web ({RDF}). Its integration into an editorial workflow of digital editions in Digital humanities and cultural heritage projects is entirely plausible. Experiments are conducted along with the corresponding error analysis in order to test our approach and to help us to study the weaknesses and strengths of our algorithm, thereby to further improvements of {REDEN}.},
  file         = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\MXHD57VS\\Brando et al. - 2016 - REDEN Named Entity Linking in Digital Literary Ed.pdf:application/pdf;Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\6TQAEGKW\\hal-01396037.html:text/html},
}
@unpublished{clavaud_ner4archives_nodate,
  author    = {Clavaud, Florence and Romary, Laurent and Charbonnier, Pauline and Terriel, Lucas and Piraino, Gaetano and Verdese, Vincent},
  title     = {{NER}4Archives (named entity recognition for archives): Conception et réalisation d'un outil de détection, de classification et de résolution des entités nommées dans les instruments de recherche archivistiques encodés en {XML}/{EAD}},
  pagetotal = {23},
  url       = {https://hal.archives-ouvertes.fr/hal-03625734/document},
  note      = {Unpublished working paper, deposited on {HAL} (hal-03625734)},
  langid    = {french},
  file      = {Clavaud et al. - NER4Archives (named entity recognition for archive.pdf:C\:\\Users\\virgi\\Zotero\\storage\\LTZDVQUB\\Clavaud et al. - NER4Archives (named entity recognition for archive.pdf:application/pdf},
}
@online{guerin_actes_1881,
  author      = {Guérin, Paul},
  editora     = {Celier, Léonce and Glorieux, Frédéric and Jolivet, Vincent},
  editoratype = {collaborator},
  title       = {Actes Royaux du Poitou (1302-1464)},
  date        = {1881},
  url         = {http://corpus.enc.sorbonne.fr/actesroyauxdupoitou/},
  urldate     = {2022-08-04},
  file        = {corpus.enc.sorbonne.fr/actesroyauxdupoitou/:C\:\\Users\\virgi\\Zotero\\storage\\HZA582TV\\actesroyauxdupoitou.html:text/html},
}
@inproceedings{monroc_comprehensive_2022,
  author    = {Monroc, Claire Bizon and Miret, Blanche and Bonhomme, Marie-Laurence and Kermorvant, Christopher},
  editor    = {Uchida, Seiichi and Barney, Elisa and Eglin, Véronique},
  title     = {A Comprehensive Study of Open-Source Libraries for Named Entity Recognition on Handwritten Historical Documents},
  booktitle = {Document Analysis Systems},
  series    = {Lecture Notes in Computer Science},
  pages     = {429--444},
  location  = {Cham},
  publisher = {Springer International Publishing},
  date      = {2022},
  isbn      = {978-3-031-06555-2},
  url       = {https://teklia.com/publications/DAS2022_NER.pdf},
  doi       = {10.1007/978-3-031-06555-2_29},
  langid    = {english},
  abstract  = {In this paper, we propose an evaluation of several state-of-the-art open-source natural language processing ({NLP}) libraries for named entity recognition ({NER}) on handwritten historical documents: {spaCy}, Stanza and Flair. The comparison is carried out on three low-resource multilingual datasets of handwritten historical documents: {HOME} (a multilingual corpus of medieval charters), Balsac (a corpus of parish records from Quebec), and Esposalles (a corpus of marriage records in Catalan). We study the impact of the document recognition processes (text line detection and handwriting recognition) on the performance of the {NER}. We show that current off-the-shelf {NER} libraries yield state-of-the-art results, even on low-resource languages or multilingual documents using multilingual models. We show, in an end-to-end evaluation, that text line detection errors have a greater impact than handwriting recognition errors. Finally, we also report state-of-the-art results on the public Esposalles dataset.},
  keywords  = {Named entity recognition, Handwritten historical documents, Text line detection},
}
@online{noauthor_arkindex_nodate,
  title   = {Arkindex {API} 1.3.1},
  url     = {https://arkindex.teklia.com/api-docs/},
  urldate = {2022-08-05},
  file    = {Arkindex API 1.3.1:C\:\\Users\\virgi\\Zotero\\storage\\45YLJRMF\\api-docs.html:text/html},
}
@mvbook{glenisson_registres_1958,
  author     = {Glénisson, Jean and Guerout, Jean and Viard, Jules and Vallée-Karcher, Aline and Jassemin, Henri-Frédéric},
  editor     = {Fawtier, Robert},
  title      = {Registres du Trésor des chartes: inventaire analytique},
  shorttitle = {Registres du trésor des chartes},
  volumes    = {6},
  location   = {Paris, France},
  publisher  = {Impr. nationale},
  date       = {1958},
  isbn       = {978-2-86000-027-7},
  keywords   = {Archives nationales (France), Archives nationales (France) -- Série {JJ}, France -- Histoire -- Sources},
  file       = {Library Catalog Entry Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\MVVXR3L8\\SRCH.html:text/html},
}
@book{samaran_gascogne_1966,
  author    = {Samaran, Charles and Rouleau, Pierre},
  title     = {La Gascogne dans les registres du Trésor des chartes},
  series    = {Collection de documents inédits sur l'histoire de France},
  number    = {4},
  location  = {Paris},
  publisher = {Bibliothèque nationale},
  date      = {1966},
}
@book{dossat_languedoc_1983,
  author    = {Dossat, Yves and Lemasson, Anne-Marie and Wolff, Philippe},
  title     = {Le Languedoc et le Rouergue dans le Trésor des chartes},
  series    = {Collection de documents inédits sur l'histoire de France},
  number    = {16},
  location  = {Paris},
  publisher = {{CTHS}},
  date      = {1983},
}
@book{chevalier_les_1993,
  author      = {Chevalier, Bernard},
  editora     = {{Archives nationales}},
  editoratype = {collaborator},
  title       = {Les pays de la Loire moyenne dans le Trésor des chartes: Berry, Blésois, Chartrain, Orléanais, Touraine, 1350-1502 Archives nationales, {JJ} 80-235},
  shorttitle  = {Les pays de la Loire moyenne dans le Trésor des chartes},
  series      = {Collection de documents inédits sur l'histoire de France},
  number      = {22},
  location    = {Paris},
  publisher   = {{CTHS}},
  date        = {1993},
}
@book{viard_documents_1899,
  author      = {Viard, Jules},
  editora     = {{Société de l'histoire de Paris et de l'Ile-de-France}},
  editoratype = {collaborator},
  title       = {Documents parisiens du règne de Philippe {VI} de {Valois}: 1328-1350},
  shorttitle  = {Documents parisiens du règne de Philippe {VI} de {Valois}},
  series      = {Société de l'histoire de Paris et de l'Ile-de-France},
  location    = {Paris},
  publisher   = {Champion},
  date        = {1899},
}
@book{longnon_paris_1878,
  author    = {Longnon, Auguste},
  title     = {Paris pendant la domination anglaise (1420-1436), documents extraits des registres de la chancellerie de France, par Auguste Longnon},
  location  = {Paris},
  publisher = {H. Champion},
  date      = {1878},
}
@book{le_cacheux_actes_1907,
  author   = {Le Cacheux, Paul},
  title    = {Actes de la chancellerie d'Henri {VI} concernant la Normandie sous la domination anglaise (1422-1435), extraits des registres du Trésor des chartes aux Archives nationales, publiés avec introductions et notes},
  location = {Rouen},
  date     = {1907},
}
@book{maugis_documents_1908,
  author     = {Maugis, Edouard},
  title      = {Documents inédits concernant la ville et le siège du bailliage d'Amiens extraits des registres du Parlement de Paris et du Trésor des chartes: {XIVe}-{XVe} siècle (1296-1471)},
  shorttitle = {Documents inédits concernant la ville et le siège du bailliage d'Amiens extraits des registres du Parlement de Paris et du trésor des chartes},
  series     = {Mémoires de la Société des antiquaires de Picardie. Documents inédits concernant la province},
  number     = {t. 17, 19 et 20},
  location   = {Amiens Paris},
  publisher  = {Yvert et Tellier, A. Picard},
  date       = {1908},
}
@thesis{dupont_structuration_2017,
  author      = {Dupont, Yoann},
  title       = {La structuration dans les entités nommées},
  type        = {Thèse de doctorat},
  institution = {Université Sorbonne Paris Cité},
  date        = {2017},
  url         = {https://tel.archives-ouvertes.fr/tel-01772268},
  urldate     = {2022-03-28},
  langid      = {french},
  abstract    = {La reconnaissance des entités nommées et une discipline cruciale du domaine du {TAL}. Elle sert à l'extraction de relations entre entités nommées, ce qui permet la construction d'une base de connaissance (Surdeanu and Ji, 2014), le résumé automatique (Nobata et al., 2002), etc... Nous nous intéressons ici aux phénomènes de structurations qui les entourent.Nous distinguons ici deux types d'éléments structurels dans une entité nommée. Les premiers sont des sous-chaînes récurrentes, que nous appelerons les affixes caractéristiques d'une entité nommée. Le second type d'éléments est les tokens ayant un fort pouvoir discriminant, appelés des tokens déclencheurs. Nous détaillerons l'algorithme que nous avons mis en place pour extraire les affixes caractéristiques, que nous comparerons à Morfessor (Creutz and Lagus, 2005b). Nous appliquerons ensuite notre méthode pour extraire les tokens déclencheurs, utilisés pour l'extraction d'entités nommées du Français et d'adresses postales.Une autre forme de structuration pour les entités nommées est de nature syntaxique, qui suit généralement une structure d'imbrications ou arborée. Nous proposons un type de cascade d'étiqueteurs linéaires qui n'avait jusqu'à présent jamais été utilisé pour la reconnaissance d'entités nommées, généralisant les approches précédentes qui ne sont capables de reconnaître des entités de profondeur finie ou ne pouvant modéliser certaines particularités des entités nommées structurées.Tout au long de cette thèse, nous comparons deux méthodes par apprentissage automatique, à savoir les {CRF} et les réseaux de neurones, dont nous présenterons les avantages et inconvénients de chacune des méthodes.},
  file        = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\VU3VXDXK\\Dupont - 2017 - La structuration dans les entités nommées.pdf:application/pdf;Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\38NVP4LD\\tel-01772268.html:text/html},
}
@online{reignier_lindex_2022,
  author     = {Reignier, Virgile},
  title      = {De l’index papier à l’indexation automatique},
  titleaddon = {Himanis},
  type       = {Billet},
  rights     = {All rights reserved},
  date       = {2022-08-26},
  url        = {https://himanis.hypotheses.org/1106},
  urldate    = {2022-08-26},
  langid     = {french},
  abstract   = {Dans le cadre des travaux sur la Reconnaissances d’Entités Nommées ({REN}) réalisés à partir du projet Himanis (registres du Trésor des chartes, Paris, Archives nationale, {JJ}35-{JJ}211), nous travaillons actuellement à l’amélioration des modèles pour associer les entités nommées reconnues à une identification précise et désambiguïsée. Nous avons donc commencé notre travail à partir des référentiels … Continue reading De l’index papier à l’indexation automatique →},
  file       = {Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\T4EAGS6P\\1106.html:text/html},
}
@online{reignier_carte_2022,
  author     = {Reignier, Virgile},
  title      = {Carte des lieux mentionnés dans les registres {JJ} 37 à {JJ} 50 du Trésor des chartes d'après l'index de "Robert Fawtier (dir.), Registres du Trésor des chartes: inventaire analytique, tome I : Règne de Philippe le Bel, Paris, 1958" (géoréférencé et structuré par Virgile Reignier)},
  shorttitle = {Carte des lieux mentionnés dans les registres {JJ} 37 à {JJ} 50 du Trésor des chartes},
  titleaddon = {Carte {JJ} 37-50},
  type       = {Carte by leaflet},
  rights     = {All rights reserved},
  date       = {2022},
  url        = {https://virgile-reignier.github.io/Carte-JJ37-50/},
  urldate    = {2022-08-26},
  abstract   = {Lieux mentionnés dans les registres {JJ} 37 à {JJ} 50 du Trésor des chartes},
  file       = {Carte JJ 37-50:C\:\\Users\\virgi\\Zotero\\storage\\2UN5RB2W\\Carte-JJ37-50.html:text/html},
}
@article{mcdonough_named_2019,
  author       = {{McDonough}, Katherine and Moncla, Ludovic and van de Camp, Matje},
  title        = {Named entity recognition goes to old regime France: geographic text analysis for early modern French corpora},
  shorttitle   = {Named entity recognition goes to old regime France},
  journaltitle = {International Journal of Geographical Information Science},
  shortjournal = {International Journal of Geographical Information Science},
  volume       = {33},
  date         = {2019-05-27},
  doi          = {10.1080/13658816.2019.1620235},
  abstract     = {Geographic text analysis ({GTA}) research in the digital humanities has focused on projects analyzing modern English-language corpora. These projects depend on temporally specific lexicons and gazetteers that enable place name identification and georesolution. Scholars working on the early modern period (1400-1800) lack temporally appropriate geoparsers and gazetteers and have been reliant on general purpose linked open data services like Geonames. These anachronistic resources introduce significant information retrieval and ethical challenges for early modernists. Using the geography entries of the canonical eighteenth-century Encyclopédie, we evaluate rule-based named entity recognition ({NER}) systems to pinpoint areas where they would benefit from adjustments for processing historical corpora. As we demonstrate, annotating nested and extended place information is one way to improve early modern {GTA}. Working with Enlightenment sources also motivates a critique of the landscape of digital geospatial data.},
  file         = {Full Text PDF:C\:\\Users\\virgi\\Zotero\\storage\\KQBXVG82\\McDonough et al. - 2019 - Named entity recognition goes to old regime France.pdf:application/pdf},
}
@online{hugi_vous_2013,
  author     = {Hügi, Jasmin and Prongué, Nicolas},
  title      = {Vous avez dit Linked Open Data?},
  titleaddon = {Recherche d'{ID}},
  date       = {2013-11-11},
  url        = {https://recherchemid.wordpress.com/2013/11/11/vous-avez-dit-linked-open-data/},
  urldate    = {2022-09-06},
  langid     = {french},
  abstract   = {Il y a quelque temps, nous avons été priés d’expliquer les avantages que représentent les Linked Open Data en bibliothèque. Mais avant de se lancer dans une telle démarche, nous souhaitons to…},
  file       = {Snapshot:C\:\\Users\\virgi\\Zotero\\storage\\5IMNYWD9\\vous-avez-dit-linked-open-data.html:text/html},
}
@online{ccsd_principes_nodate,
  author  = {{CCSD}},
  title   = {Principes {FAIR}},
  url     = {https://www.ccsd.cnrs.fr/principes-fair/},
  urldate = {2022-11-08},
  file    = {Principes FAIR | CCSD:C\:\\Users\\virgi\\Zotero\\storage\\SQE74NTR\\principes-fair.html:text/html},
}
@online{stutzmann_himanis_2022,
  author      = {Stutzmann, Dominique},
  editora     = {Hamel, Sebastien and Torres Aguilar, Sergio and Reignier, Virgile and Chaffenet, Paul},
  editoratype = {collaborator},
  title       = {Himanis / Home},
  date        = {2022},
  url         = {https://heurist.huma-num.fr/heurist/?db=stutzmann_himanis&website},
  urldate     = {2022-11-09},
  file        = {Himanis / Home:C\:\\Users\\virgi\\Zotero\\storage\\22W4SQAJ\\heurist.html:text/html},
}