Skip to content

Commit

Permalink
Merge branch 'main' of github.com:clarin-eric/clarin-resource-families
Browse files Browse the repository at this point in the history
  • Loading branch information
kreetrapper committed Oct 28, 2024
2 parents a6836c9 + 8c6ac8e commit 2500bbc
Show file tree
Hide file tree
Showing 16 changed files with 249 additions and 0 deletions.
16 changes: 16 additions & 0 deletions corpora/manually-annotated-corpora/affect-in-tweets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"Name": "Affect in Tweets PT",
"URL": "https://hdl.handle.net/21.11129/0000-000E-75BA-D",
"Family": "Manually annotated corpora",
"Description": "This is a data set of Portuguese tweets labelled with the emotion conveyed in the tweet.\nEach tweet is labelled with an emotion (i.e., anger, fear, joy, sadness).\nThe corpus is available from PORTULAN.",
"Language": ["por"],
"Licence": "CC BY",
"Size": ["11,219 tweets"],
"Annotation": ["sentiment analysis"],
"Infrastructure": "CLARIN",
"Group": ["Sentiment analysis"],
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-000E-75BA-D"
},
"Publication": ""
}
16 changes: 16 additions & 0 deletions corpora/manually-annotated-corpora/deepbankpt.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"Name": "DeepBankPT",
"URL": "https://hdl.handle.net/21.11129/0000-000B-D350-C",
"Family": "Manually annotated corpora",
"Description": "This is a corpus of grammatical analyses conforming to the <a href=\"https://en.wikipedia.org/wiki/Head-driven_phrase_structure_grammar\">Head Driven Phrase Structure Grammar</a> framework.\nThe sentences are translations from the Wall Street Journal.\nThe corpus is available from PORTULAN.",
"Language": ["por"],
"Licence": "CC BY",
"Size": ["3,406 sentences", "44,598 tokens"],
"Annotation": ["grammatical structure"],
"Infrastructure": "CLARIN",
"Group": ["Syntactic parsing"],
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-000B-D350-C"
},
"Publication": ""
}
16 changes: 16 additions & 0 deletions corpora/manually-annotated-corpora/dependency-bank-pt.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"Name": "DependencyBankPT",
"URL": "https://hdl.handle.net/21.11129/0000-000B-D34C-2",
"Family": "Manually annotated corpora",
"Description": "This is a corpus of syntactic dependencies.\nThe sentences are translations from the Wall Street Journal.\nThe corpus is available from PORTULAN.",
"Language": ["por"],
"Licence": "CC BY",
"Size": ["3,406 sentences", "44,598 tokens"],
"Annotation": ["grammatical structure"],
"Infrastructure": "CLARIN",
"Group": ["Syntactic parsing"],
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-000B-D34C-2"
},
"Publication": ""
}
16 changes: 16 additions & 0 deletions corpora/manually-annotated-corpora/logical-form-bank-pt.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"Name": "LogicalFormBankPT",
"URL": "https://hdl.handle.net/21.11129/0000-000B-D34E-0",
"Family": "Manually annotated corpora",
"Description": "This is a corpus of sentences annotated with logical forms. The sentences are translations from the Wall Street Journal.\nThe corpus is available from PORTULAN.",
"Language": ["por"],
"Licence": "CC BY",
"Size": ["3,406 sentences", "44,598 tokens"],
"Annotation": ["Semantic tags"],
"Infrastructure": "CLARIN",
"Group": ["Other annotation layers"],
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-000B-D34E-0"
},
"Publication": ""
}
16 changes: 16 additions & 0 deletions corpora/manually-annotated-corpora/manual-for-teaching.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"Name": "Manually annotated corpora for teaching and learning purposes of Brazilian Portuguese, Dutch, Estonian, and Slovene",
"URL": "https://hdl.handle.net/21.11129/0000-0010-05DA-3",
"Family": "Manually annotated corpora",
"Description": "These are manually annotated corpora for teaching and learning purposes of Brazilian Portuguese, Dutch, Estonian, and Slovene.\nSentences are annotated with “problematic” or “non-problematic” labels, from the point of usage for pedagogical purposes.\nThe corpus is available from PORTULAN.",
"Language": ["est", "nld", "slv", "por"],
"Licence": "CC BY",
"Size": ["10,000 sentences"],
"Annotation": ["error tagging"],
"Infrastructure": "CLARIN",
"Group": ["Other annotation layers"],
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-0010-05DA-3"
},
"Publication": ""
}
16 changes: 16 additions & 0 deletions corpora/manually-annotated-corpora/propbankpt.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"Name": "PropBankPT",
"URL": "https://hdl.handle.net/21.11129/0000-000B-D34B-3",
"Family": "Manually annotated corpora",
"Description": "This is a corpus of sentences annotated with their constituency structure and semantic role tags. The sentences are translations from the Wall Street Journal.\nThe corpus is available from PORTULAN.",
"Language": ["por"],
"Licence": "CC BY",
"Size": ["3,406 sentences", "44,598 tokens"],
"Annotation": ["Syntactic parsing", "Semantic role tags"],
"Infrastructure": "CLARIN",
"Group": ["Syntactic parsing", "Other annotation layers"],
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-000B-D34B-3"
},
"Publication":""
}
16 changes: 16 additions & 0 deletions corpora/manually-annotated-corpora/treebank-pt.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"Name": "TreeBankPT",
"URL": "https://hdl.handle.net/21.11129/0000-000B-D34B-3",
"Family": "Manually annotated corpora",
"Description": "This is a corpus of syntactic constituency trees. The sentences are translations from the Wall Street Journal.\nThe corpus is available from PORTULAN.",
"Language": ["por"],
"Licence": "CC BY",
"Size": ["3,406 sentences", "44,598 tokens"],
"Annotation": ["Syntactic parsing"],
"Infrastructure": "CLARIN",
"Group": ["Syntactic parsing"],
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-000B-D34B-3"
},
"Publication": ""
}
15 changes: 15 additions & 0 deletions corpora/newspaper-corpora/corpus-vu-dnc.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"Name": "Corpus VU-DNC",
"URL": "http://hdl.handle.net/10032/tm-a2-g4",
"Family": "Newspaper corpora",
"Description": "This corpus consists of data from five newspapers, covering 3 separate years (1950, 1951, and 2002).\nThe corpus is available from the Dutch Language Institute.",
"Language": ["nld"],
"Licence": "",
"Size": [""],
"Annotation": [""],
"Infrastructure": "CLARIN",
"Access": {
"Concordancer": "https://ivdnt.org/wp-content/apps/vu-dnc/index.html"
},
"Publication":""
}
15 changes: 15 additions & 0 deletions corpora/newspaper-corpora/couranten-corpus.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"Name": "Couranten Corpus",
"URL": "http://hdl.handle.net/10032/tm-a2-u9",
"Family": "Newspaper corpora",
"Description": "This corpus contains thirteen seventeenth-century Dutch newspapers (altogether 109,532 articles) published between 1619 and 1700.\nThe corpus is available from the Dutch Language Institute.",
"Language": ["nld"],
"Licence": "",
"Size": ["18.9 million words"],
"Annotation": ["", ""],
"Infrastructure": "CLARIN",
"Access": {
"Concordancer": "https://couranten.ivdnt.org/corpus-frontend/couranten/search/"
},
"Publication":""
}
15 changes: 15 additions & 0 deletions corpora/newspaper-corpora/wablieft-corpus.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"Name": "Wablieft Corpus",
"URL": "https://hdl.handle.net/10032/tm-a2-q6",
"Family": "Newspaper corpora",
"Description": "This corpus contains the digital archive of the <a href=\"https://www.wablieft.be/nl\">Wablieft newspaper</a> from 2011 to 2017.\nThe corpus is available from the Dutch Language Institute.",
"Language": ["nld"],
"Licence": "CC BY",
"Size": ["2 million words"],
"Annotation": ["PoS-tagged", "lemmatised", "named entities", "syntactic dependencies"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "https://hdl.handle.net/10032/tm-a2-q6"
},
"Publication":""
}
15 changes: 15 additions & 0 deletions corpora/newspaper-corpora/wai-not-corpus.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"Name": "WAI-NOT Corpus",
"URL": "http://hdl.handle.net/10032/tm-a2-t9",
"Family": "Newspaper corpora",
"Description": "This corpus contains the digital archive of the <a href=\"https://www.wai-not.be/\">WAI-NOT newspaper</a> between 2009 and 2021.\nThe corpus is available from the Dutch Language Institute.",
"Language": ["nld"],
"Licence": "",
"Size": [""],
"Annotation": ["", ""],
"Infrastructure": "CLARIN",
"Access": {
"Download": "http://hdl.handle.net/10032/tm-a2-t9"
},
"Publication":""
}
17 changes: 17 additions & 0 deletions corpora/parallel-corpora/proiel-collection.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"Name": "PROIEL collection",
"URL": "http://hdl.handle.net/11509/114",
"Family": "Parallel corpora",
"Description": "This is a collection of dependency treebanks for early attestations of Indo-European, including a set of parallel treebanks of the New Testament.\nThe languages that are included are Gothic, Ancient Greek, Church Slavic, Latin, and Classical Armenian.\nThe corpus is available from the CLARINO repository.",
"Language": ["5 languages"],
"Licence": "CC BY-NC-SA 4.0",
"Size": ["46,406 sentences", "530,666 words"],
"Annotation": ["MSD-tagged", "information structure"],
"Infrastructure": "CLARIN",
"Group": "Multilingual corpora",
"Access": {
"Browse": "http://hdl.handle.net/11495/DA68-56C8-439C-0",
"Download": "http://hdl.handle.net/11509/114"
},
"Publication":""
}
15 changes: 15 additions & 0 deletions corpora/reference-corpora/corpus-hedendaags-nederlands.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"Name": "Corpus Hedendaags Nederlands",
"URL": "http://hdl.handle.net/10032/tm-a2-s8",
"Family": "Reference corpora",
"Description": "This is a corpus of books, blogs, newspapers, magazines and news broadcasts from the Netherlands, Flanders, Suriname and the Netherlands Antilles.\nThe corpus is available from the Dutch Language Institute.",
"Language": ["nld"],
"Licence": "",
"Size": ["3 billion words"],
"Annotation": ["PoS-tagged", "lemmatised"],
"Infrastructure": "CLARIN",
"Access": {
"Concordancer": "https://chn.ivdnt.org/"
},
"Publication":""
}
15 changes: 15 additions & 0 deletions corpora/spoken-corpora/c-oral.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"Name": "C-ORAL-ROM_EXM",
"URL": "https://hdl.handle.net/21.11129/0000-000B-D4FF-7",
"Family": "Spoken corpora",
"Description": "This is a corpus of formal and informal speech.\nThe corpus is available from PORTULAN.",
"Language": ["por"],
"Licence": "The MIT licence",
"Size": ["300,000 words"],
"Annotation": ["Orthographically aligned", "Phonemically aligned", "PoS tagged"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-000B-D4FF-7"
},
"Publication":""
}
15 changes: 15 additions & 0 deletions corpora/spoken-corpora/perfil.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"Name": "Perfil Sociolinguístico da Fala Bracarense",
"URL": "https://hdl.handle.net/21.11129/0000-000D-F928-E",
"Family": "Spoken corpora",
"Description": "The corpus is composed of 1-hour interviews with speakers of the same area (around Braga, Portugal).\nThe interviews are stratified according to gender, age and level of education; the transcriptions are aligned with <a href=\"https://exmaralda.org/en/\">EXMARaLDA</a>.\nThe corpus is available from PORTULAN.",
"Language": ["por"],
"Licence": "CC BY-NC-ND",
"Size": ["90 hours"],
"Annotation": ["transcriptions aligned"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "https://hdl.handle.net/21.11129/0000-000D-F928-E"
},
"Publication":""
}
15 changes: 15 additions & 0 deletions corpora/spoken-corpora/spoken-dutch-corpus.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"Name": "Spoken Dutch Corpus",
"URL": "https://hdl.handle.net/10032/tm-a2-k6",
"Family": "Spoken corpora",
"Description": "This is a corpus of standard Dutch spoken in Flanders and the Netherlands.",
"Language": ["nld"],
"Licence": "",
"Size": ["900 hours"],
"Annotation": ["PoS-tagged", "syntactically parsed", "phonetically transcribed", "phonemically transcribed"],
"Infrastructure": "CLARIN",
"Access": {
"Download": "https://hdl.handle.net/10032/tm-a2-k6"
},
"Publication":""
}

0 comments on commit 2500bbc

Please sign in to comment.