diff --git a/data-export-meta/README.rst b/data-export-meta/README.rst index c50b54b0..7ef82681 100644 --- a/data-export-meta/README.rst +++ b/data-export-meta/README.rst @@ -2,7 +2,27 @@ Data exports ============ This directory contains [Frictionless data](https://frictionlessdata.io/) [data package](https://specs.frictionlessdata.io/data-package/) -files to describe and validate Project data exports. +files to describe and validate Project data exports, along with utility scripts for auto-generating portions of dataset readmes, data dictionaries, and lists of member and book changes from previously published versions of the datasets. -They are currently generated and maintained manually; they should be updated -for deposit with revised data exports as needed. \ No newline at end of file +Datapackage files are currently generated and maintained manually; they should be updated +for deposit with revised data exports as needed. + +Validation +^^^^^^^^^^ + +To validate datapackage files and associated data files, use frictionless: + +1. `pip install frictionless` +2. `frictionless validate vX.X/datapackage.json` + +This will report any errors in the datapackage file as well as any validation errors where the types or pattern constraints specified in the data package file do not match the data in the associated CSV files. + + +Scripts +^^^^^^^ + +All scripts require pandas (`pip install pandas`). + +- `readme_info.py` - use to generate dataset summary information for inclusion in plain-text readme (number of fields, number of rows, optional list of fields with descriptions); can also be used to generate a CSV data dictionary. Takes a path to the datapackage file; resource paths referenced in the datapackage must resolve. +- `member_changes.py` - for members in an old version not in the new version, creates a csv of changes with new ids for member ids that changed; requires pandas. Must be updated for new versions and should be added to changes from previous versions. 
+- `book_changes.py` - same as above, but for book ids \ No newline at end of file diff --git a/data-export-meta/readme_info.py b/data-export-meta/readme_info.py index 8f5223ba..a1e2f80a 100755 --- a/data-export-meta/readme_info.py +++ b/data-export-meta/readme_info.py @@ -8,11 +8,14 @@ import json import sys +import argparse +import pathlib +import csv import pandas as pd -def readme_info(df, dp_resource): +def readme_info(df, dp_resource, field_list=True): print("1. Number of fields: %d\n" % len(df.columns)) print("2. Number of rows: {:,}\n".format(len(df))) schema_fields = dp_resource["schema"]["fields"] @@ -20,21 +23,72 @@ def readme_info(df, dp_resource): assert len(schema_fields) == len(df.columns) field_info = {field["name"]: field for field in schema_fields} - print("3. Field List:") - for col in df.columns: - print("%s : %s" % (col, field_info[col]["description"])) + if field_list: + print("3. Field List:") + for col in df.columns: + print("%s : %s" % (col, field_info[col]["description"])) if __name__ == "__main__": - if len(sys.argv) < 2: - print("Please provide path to frictionless datapackage file") - exit(0) + parser = argparse.ArgumentParser( + "Generate dataset info readme from datapackage and data files" + ) + parser.add_argument("datapackage", type=pathlib.Path) + # flag to determine whether fields be listed + parser.add_argument( + "--field-list", + help="Generate field list in readme.txt format", + action=argparse.BooleanOptionalAction, + default=True, + ) + parser.add_argument( + "-dd", + "--data-dictionary", + help="Create a data dictionary in the specified file", + type=pathlib.Path, + ) - with open(sys.argv[1]) as packagejson: + args = parser.parse_args() + + if args.data_dictionary: + if args.data_dictionary.exists(): + print( + f"Requested data dictionary file {args.data_dictionary} already exists" + ) + raise SystemExit(1) + with args.datapackage.open() as packagejson: datapackage = json.load(packagejson) - csvfile = 
datapackage["resources"][0]["path"] - print("Inspecting %s...\n\n" % csvfile) + for resource in datapackage["resources"]: + # resource path should be relative to the datapackage file + datafile = args.datapackage.parent / resource["path"] + print("\n\nInspecting %s...\n\n" % datafile) + with datafile.open() as csvfile: + df = pd.read_csv(csvfile) + readme_info(df, resource, field_list=args.field_list) - df = pd.read_csv(csvfile) - readme_info(df, datapackage["resources"][0]) + if args.data_dictionary: + print(f"\n\nWriting data dictionary to {args.data_dictionary}") + with args.data_dictionary.open("w", encoding="utf-8") as csv_datadict: + fieldnames = [ + "Filename", + "Variable", + "Variable name", + "Description", + "Type", + "Format", + ] + csvwriter = csv.DictWriter(csv_datadict, fieldnames=fieldnames) + csvwriter.writeheader() + for resource in datapackage["resources"]: + for field in resource["schema"]["fields"]: + csvwriter.writerow( + { + "Filename": resource["path"], + "Variable": field["title"], + "Variable name": field["name"], + "Description": field["description"], + "Type": field["type"], + "Format": field.get("format"), + } + ) diff --git a/data-export-meta/SCoData_books_v1.2_2022-12_datapackage.json b/data-export-meta/v1.2/SCoData_books_v1.2_2022-12_datapackage.json similarity index 100% rename from data-export-meta/SCoData_books_v1.2_2022-12_datapackage.json rename to data-export-meta/v1.2/SCoData_books_v1.2_2022-12_datapackage.json diff --git a/data-export-meta/SCoData_combined_v1.2_2022-01_datapackage.json b/data-export-meta/v1.2/SCoData_combined_v1.2_2022-01_datapackage.json similarity index 100% rename from data-export-meta/SCoData_combined_v1.2_2022-01_datapackage.json rename to data-export-meta/v1.2/SCoData_combined_v1.2_2022-01_datapackage.json diff --git a/data-export-meta/SCoData_events_v1.2_2022-01_datapackage.json b/data-export-meta/v1.2/SCoData_events_v1.2_2022-01_datapackage.json similarity index 100% rename from 
data-export-meta/SCoData_events_v1.2_2022-01_datapackage.json rename to data-export-meta/v1.2/SCoData_events_v1.2_2022-01_datapackage.json diff --git a/data-export-meta/SCoData_members_v1.2_2022-01_datapackage.json b/data-export-meta/v1.2/SCoData_members_v1.2_2022-01_datapackage.json similarity index 100% rename from data-export-meta/SCoData_members_v1.2_2022-01_datapackage.json rename to data-export-meta/v1.2/SCoData_members_v1.2_2022-01_datapackage.json diff --git a/data-export-meta/v2.0/SCoData_v2.0_2024_datapackage.json b/data-export-meta/v2.0/SCoData_v2.0_2024_datapackage.json new file mode 100644 index 00000000..13d822a8 --- /dev/null +++ b/data-export-meta/v2.0/SCoData_v2.0_2024_datapackage.json @@ -0,0 +1,758 @@ +{ + "profile": "tabular-data-package", + "title": "Shakespeare and Company Project Dataset: Lending Library Members, Events, and Books", + "homepage": "https://shakespeareandco.princeton.edu/about/data/", + "version": "2.0", + "image": "https://shakespeareandco.princeton.edu/static/img/social.png", + "resources": [ + { + "name": "members", + "path": "SCoData_members_v2.0_2024-10.csv", + "profile": "tabular-data-resource", + "schema": { + "fields": [ + { + "name": "id", + "type": "string", + "format": "default", + "title": "Member identifier", + "description": "unique identifier for member" + }, + { + "name": "uri", + "type": "string", + "format": "uri", + "title": "Member URI", + "description": "full URI for member; member detail page on https://shakespeareandco.princeton.edu" + }, + { + "name": "name", + "type": "string", + "format": "default", + "title": "Name ", + "description": "full name; may include variant names, name as written on lending library card; for more, see https://shakespeareandco.princeton.edu/about/faq/#names" + }, + { + "name": "sort_name", + "type": "string", + "format": "default", + "title": "Sort name ", + "description": "authorized name" + }, + { + "name": "title", + "type": "any", + "format": "default", + "title": "Title 
", + "description": "honorific address if known, e.g. Mr., Mrs. etc." + }, + { + "name": "gender", + "type": "string", + "format": "default", + "title": "Gender ", + "description": "male, female, nonbinary, unknown; for more, see https://shakespeareandco.princeton.edu/about/faq/#gender" + }, + { + "name": "is_organization", + "type": "boolean", + "format": "default", + "title": "Is an organization?", + "description": "member is an organization instead of a person (boolean)" + }, + { + "name": "has_card", + "type": "boolean", + "format": "default", + "title": "Has a member card? ", + "description": "member has an extant lending library card (boolean)" + }, + { + "name": "birth_year", + "type": "date", + "format": "%Y", + "title": "Birth year ", + "description": "birth year, if known" + }, + { + "name": "death_year", + "type": "date", + "format": "%Y", + "title": "Death year", + "description": "death year, if known" + }, + { + "name": "membership_years", + "type": "string", + "format": "default", + "title": "Years of membership", + "description": "list of known active membership years (multiple, separated by semicolons)" + }, + { + "name": "viaf_url", + "type": "string", + "format": "uri", + "title": "VIAF URL", + "description": "URL for Virtual Internet Authority File (VIAF, https://viaf.org/) identifier, if available" + }, + { + "name": "wikipedia_url", + "type": "string", + "format": "uri", + "title": "Wikipedia URL", + "description": "URL for Wikipedia page, if available" + }, + { + "name": "nationalities", + "type": "string", + "format": "default", + "title": "Nationalities", + "description": "countries for known nationality (if multiple, separated by semicolons)" + }, + { + "name": "addresses", + "type": "string", + "format": "default", + "title": "Address(es)", + "description": "list of known addresses (if multiple, separated by semicolons)" + }, + { + "name": "postal_codes", + "type": "string", + "format": "default", + "title": "Postal code(s)", + 
"description": "list of postal addresses from addresses (if multiple, separated by semicolons; order matches addresses)" + }, + { + "name": "arrondissements", + "type": "string", + "format": "default", + "title": "Arrondissement(s)", + "description": "list of Paris arrondissements (integer; if multiple, separated by semicolons; order matches addresses)" + }, + { + "name": "coordinates", + "type": "string", + "format": "default", + "title": "Coordinates", + "description": "list of geographical coordinates for known addresses (pairs of latitude, longitude; if multiple, separated by semicolons; order matches addresses)" + }, + { + "name": "notes", + "type": "string", + "format": "default", + "title": "Notes", + "description": "more information (text with markdown formatting)" + }, + { + "name": "updated", + "type": "datetime", + "format": "default", + "title": "Date updated", + "description": "timestamp record was last modified in the Shakespeare and Company Project database before export" + } + ] + } + }, + { + "name": "events", + "path": "SCoData_events_v2.0_2024-10.csv", + "profile": "tabular-data-resource", + "schema": { + "fields": [ + { + "name": "event_type", + "type": "string", + "format": "default", + "title": "Event type", + "description": "type of event" + }, + { + "name": "start_date", + "type": "string", + "rdfType": "https://schema.org/Date", + "constraints": { + "pattern": "(\\d{4}|-)?(?:-([01]\\d))?(?:-([0-3]\\d))?" 
+ }, + "title": "Start date", + "description": "start date, if known (ISO 8601 format; YYYY, YYYY-MM, YYYY-MM-DD, or --MM-DD)" + }, + { + "name": "end_date", + "type": "any", + "format": "default", + "title": "End date", + "description": "end date, if known (ISO 8601 format; YYYY, YYYY-MM, YYYY-MM-DD, or --MM-DD)" + }, + { + "name": "member_ids", + "type": "string", + "format": "default", + "title": "Member identifier", + "description": "unique identifier for members associated with this event (if multiple, separated by semicolons)" + }, + { + "name": "member_uris", + "type": "string", + "format": "default", + "title": "Member URI", + "description": "list of URIs for members associated with this event (if multiple, separated by semicolons)" + }, + { + "name": "member_names", + "type": "string", + "format": "default", + "title": "Member name", + "description": "list of full member names with variants (if multiple, separated by semicolons; order matches member_uris)" + }, + { + "name": "member_sort_names", + "type": "string", + "format": "default", + "title": "Member sort name", + "description": "list of member authorized sort names (if multiple, separated by semicolons; order matches member_uris)" + }, + { + "name": "subscription_price_paid", + "type": "number", + "format": "default", + "title": "Subscription price paid", + "description": "amount paid for a subscription event (numeric)" + }, + { + "name": "subscription_deposit", + "type": "number", + "format": "default", + "title": "Subscription deposit", + "description": "amount deposited for a new subscription (numeric)" + }, + { + "name": "subscription_duration", + "type": "string", + "format": "default", + "title": "Subscription duration", + "description": "logical subscription duration (human readable, e.g. 
6 months, 1 year)" + }, + { + "name": "subscription_duration_days", + "type": "integer", + "format": "default", + "title": "Subscription duration in days", + "description": "actual subscription duration in days (integer)" + }, + { + "name": "subscription_volumes", + "type": "integer", + "format": "default", + "title": "Number of subscription volumes", + "description": "number of volumes paid for in the subscription" + }, + { + "name": "subscription_category", + "type": "string", + "format": "default", + "title": "Subscription category", + "description": "subscription plan, if any; see https://shakespeareandco.princeton.edu/about/faq/#lending-library-plans " + }, + { + "name": "subscription_purchase_date", + "type": "string", + "format": "default", + "title": "Subscription purchase date", + "description": "date the subscription was purchased (ISO 8601 format; YYYY, YYYY-MM, YYYY-MM-DD, or --MM-DD)" + }, + { + "name": "reimbursement_refund", + "type": "number", + "format": "default", + "title": "Reimbursement refund", + "description": "amount refunded for a reimbursement event (numeric)" + }, + { + "name": "borrow_status", + "type": "string", + "format": "default", + "title": "Borrow status", + "description": "status code indicating how a borrowing event ended (returned, bought, missing, unknown)" + }, + { + "name": "borrow_duration_days", + "type": "integer", + "format": "default", + "title": "Borrow duration in days", + "description": "borrow duration in days, if known (integer)" + }, + { + "name": "purchase_price", + "type": "number", + "format": "default", + "title": "Purchase price", + "description": "amount paid for a purchase" + }, + { + "name": "currency", + "type": "string", + "format": "default", + "title": "Type of currency ", + "description": "currency code indicating currency of subscription price paid, deposit, reimbursement refund, or purchase price (ISO 4217 currency code)" + }, + { + "name": "item_uri", + "type": "string", + "format": "default", + 
"title": "Item URI ", + "description": "identifier for book associated with this event, if there is one" + }, + { + "name": "item_title", + "type": "string", + "format": "default", + "title": "Item title", + "description": "title of the book associated with this event" + }, + { + "name": "item_volume", + "type": "string", + "format": "default", + "title": "Item volume", + "description": "volume / issue of this work for this event, if item is a multivolume work or periodical and volume/issue information is known" + }, + { + "name": "item_authors", + "type": "string", + "format": "default", + "title": "Item author(s)", + "description": "list of authors for this work; authorized names, last name first (if multiple, separated by semicolon)" + }, + { + "name": "item_year", + "type": "string", + "format": "default", + "title": "Item year", + "description": "publication year" + }, + { + "name": "item_notes", + "type": "string", + "format": "default", + "title": "Item notes", + "description": "notes about the item" + }, + { + "name": "source_type", + "type": "string", + "format": "default", + "title": "Source type", + "description": "type of source this data was drawn from (could be multiple; separated by semicolons)" + }, + { + "name": "source_citation", + "type": "string", + "format": "default", + "title": "Source citation ", + "description": "bibliographic citation for the source of this data" + }, + { + "name": "source_manifest", + "type": "string", + "format": "default", + "title": "Source manifest", + "description": "IIIF Presentation manifest URL for a digitized edition of the source of this data (if multiple, separated by semicolons)" + }, + { + "name": "source_image", + "type": "string", + "format": "default", + "title": "Source image", + "description": "IIIF Image URL for the digitized image in the IIIF manifest documenting this event, if known (if multiple, separated by semicolons)" + } + ], + "missingValues": [ + "" + ], + "foreignKeys": [ + { + "fields": 
"member_uris", + "reference": { + "resource": "members", + "fields": "uri" + }, + "fields": "item_uri", + "reference": { + "resource": "books", + "fields": "uri" + } + } + ] + } + }, + { + "name": "books", + "path": "SCoData_books_v2.0_2024-10.csv", + "profile": "tabular-data-resource", + "schema": { + "fields": [ + { + "name": "id", + "type": "string", + "format": "default", + "title": "Book identifier", + "description": "unique identifier for book" + }, + { + "name": "uri", + "type": "string", + "format": "uri", + "title": "Book URI", + "description": "full URI for book; book detail page on https://shakespeareandco.princeton.edu" + }, + { + "name": "title", + "type": "string", + "format": "default", + "title": "Title ", + "description": "title of the book or other item" + }, + { + "name": "author", + "type": "string", + "format": "default", + "title": "Author", + "description": "author or authors, last name first (multiple, separated by semicolon)" + }, + { + "name": "editor", + "type": "string", + "format": "default", + "title": "Editor(s)", + "description": "editor(s) of work" + }, + { + "name": "translator", + "type": "string", + "format": "default", + "title": "Translator(s)", + "description": "translator(s) of work" + }, + { + "name": "introduction", + "type": "string", + "format": "default", + "title": "Author of introduction", + "description": "author of an introduction to work" + }, + { + "name": "illustrator", + "type": "string", + "format": "default", + "description": "illustrator(s) of work", + "title": "Illustrator(s)" + }, + { + "name": "photographer", + "type": "string", + "format": "default", + "title": "Photographer(s)", + "description": "photographer(s) featured in work" + }, + { + "name": "year", + "type": "year", + "format": "default", + "description": "year published", + "title": "Publication year" + }, + { + "name": "format", + "type": "string", + "format": "default", + "title": "Format", + "description": "type of item (article, book, 
periodical, phonograph record, photograph, print)" + }, + { + "name": "genre_category", + "type": "string", + "format": "default", + "title": "Genre", + "description": "genre of work (drama, fiction, nonfiction, periodical, poetry; if multiple, separated by semicolons)" + }, + { + "name": "uncertain", + "type": "boolean", + "format": "default", + "title": "Item is uncertain?", + "description": "boolean indicating if item is ambiguous or unidentifiable" + }, + { + "name": "ebook_url", + "type": "string", + "format": "uri", + "title": "Ebook URL", + "description": "link to a digital edition of this work" + }, + { + "name": "volumes_issues", + "type": "any", + "format": "default", + "title": "Volume/Issue numbers", + "description": "list of multivolume volumes or periodical issues known to have circulated (separated by semicolon)" + }, + { + "name": "notes", + "type": "any", + "format": "default", + "title": "Notes", + "description": "more information, e.g. about uncertain titles (text with markdown formatting)" + }, + { + "name": "event_count", + "type": "integer", + "format": "default", + "title": "Associated events count", + "description": "total number of events associated with this title (integer)" + }, + { + "name": "borrow_count", + "type": "integer", + "format": "default", + "title": "Borrow count", + "description": "total number of borrowing events associated with this title (integer)" + }, + { + "name": "purchase_count", + "type": "integer", + "format": "default", + "title": "Purchase count", + "description": "total number of purchase events associated with this title (integer)" + }, + { + "name": "circulation_years", + "type": "string", + "rdfType": "https://schema.org/ItemList", + "format": "default", + "constraints": { + "pattern": "(\\d{4})?(;\\d{4})*" + }, + "title": "Circulation years", + "description": "list of years of known activity for this title (if multiple, separated by semicolon)" + }, + { + "name": "updated", + "type": "datetime", + "format": 
"default", + "title": "Date updated", + "description": "timestamp record was last modified in the Shakespeare and Company Project database before export" + } + ] + } + }, + { + "name": "member_addresses", + "path": "SCoData_member_addresses_v2.0_2024-10.csv", + "profile": "tabular-data-resource", + "schema": { + "fields": [ + { + "name": "member_ids", + "type": "string", + "format": "default", + "title": "Member identifier", + "description": "unique identifier for member; member detail page on https://shakespeareandco.princeton.edu" + }, + { + "name": "member_names", + "type": "string", + "format": "default", + "title": "Member name ", + "description": "full name of member; may include variant names; for more, see https://shakespeareandco.princeton.edu/about/faq/#names" + }, + { + "name": "member_sort_names", + "type": "string", + "format": "default", + "title": "Sort name", + "description": "authorized name" + }, + { + "name": "member_uris", + "type": "string", + "format": "uri", + "title": "Member URI", + "description": "full URI for member; member detail page on https://shakespeareandco.princeton.edu" + }, + { + "name": "care_of_person_id", + "type": "string", + "format": "default", + "title": "Care of person identifier", + "description": "identifier of person who relays correspondence to intended recipient " + }, + { + "name": "care_of_person_name", + "type": "string", + "format": "default", + "title": "Care of person name", + "description": "name of person who relays correspondence to intended recipient " + }, + { + "name": "location_name", + "type": "string", + "format": "default", + "description": "location name ", + "title": "Location name" + }, + { + "name": "street_address", + "type": "string", + "format": "default", + "title": "Street address", + "description": "address (house number and street name) " + }, + { + "name": "postal_code", + "type": "string", + "format": "default", + "title": "Postal code", + "description": "postal code associated with 
address" + }, + { + "name": "city", + "type": "string", + "format": "default", + "title": "City", + "description": "city associated with address" + }, + { + "name": "arrondissement", + "type": "integer", + "format": "default", + "title": "Arrondissement", + "description": "Paris arrondissements for addresses in Paris (integer)" + }, + { + "name": "country", + "type": "string", + "format": "default", + "title": "Country", + "description": "country associated with address" + }, + { + "name": "longitude", + "type": "number", + "format": "default", + "title": "Longitude ", + "description": "longitudinal coordinate of address (numeric) " + }, + { + "name": "latitude", + "type": "number", + "format": "default", + "title": "Latitude", + "description": "latitudinal coordinate of address (numeric) " + } + ], + "missingValues": [ + "" + ] + } + }, + { + "name": "book_creators", + "path": "SCoData_book_creators_v2.0_2024-10.csv", + "profile": "tabular-data-resource", + "schema": { + "fields": [ + { + "name": "id", + "type": "string", + "format": "default", + "title": "Creator identifier", + "description": "unique identifier for creator" + }, + { + "name": "name", + "type": "string", + "format": "default", + "title": "Name ", + "description": "full name; may include variant names; for more, see https://shakespeareandco.princeton.edu/about/faq/#names" + }, + { + "name": "sort_name", + "type": "string", + "format": "default", + "title": "Sort name ", + "description": "authorized name" + }, + { + "name": "gender", + "type": "string", + "format": "default", + "title": "Gender", + "description": "male, female, nonbinary, unknown; for more, see https://shakespeareandco.princeton.edu/about/faq/#gender\n" + }, + { + "name": "is_organization", + "type": "boolean", + "format": "default", + "title": "Is an organization? 
", + "description": "creator is an organization instead of a person (boolean)" + }, + { + "name": "birth_year", + "type": "integer", + "title": "Birth year", + "description": "birth year, if known" + }, + { + "name": "death_year", + "type": "integer", + "title": "Death year", + "description": "death year, if known" + }, + { + "name": "viaf_url", + "type": "string", + "format": "uri", + "title": "VIAF url ", + "description": "URL for Virtual Internet Authority File (VIAF, https://viaf.org/) identifier, if available" + }, + { + "name": "wikipedia_url", + "type": "string", + "format": "uri", + "title": "Wikipedia url", + "description": "URL for Wikipedia page, if available" + }, + { + "name": "nationalities", + "type": "string", + "format": "default", + "title": "Nationalities", + "description": "countries for known nationality (if multiple, separated by semicolons)" + }, + { + "name": "member_uri", + "type": "string", + "format": "uri", + "title": "Member URI", + "description": "identifier; member detail page on https://shakespeareandco.princeton.edu" + }, + { + "name": "notes", + "type": "string", + "format": "default", + "title": "Notes", + "description": "more information (text with markdown formatting)" + }, + { + "name": "updated", + "type": "datetime", + "format": "default", + "title": "Date updated", + "description": "timestamp record was last modified in the Shakespeare and Company Project database before export" + } + ] + } + } + ] +} \ No newline at end of file