From 1c4cd7905fecca4376eaaaf8081f29972626fe28 Mon Sep 17 00:00:00 2001
From: Sophia Tao
Date: Fri, 16 Mar 2018 17:18:01 -0700
Subject: [PATCH] uploading scripts for review

---
 R/stao/CFEC_BIT_processing_199.R        |  67 ++++++++++++
 R/stao/CFEC_BIT_submission_199.R        | 136 ++++++++++++++++++++++++
 R/stao/commercial_crew_processing_114.R |  35 ++++++
 R/stao/commercial_crew_submission_114.R | 112 +++++++++++++++++++
 4 files changed, 350 insertions(+)
 create mode 100644 R/stao/CFEC_BIT_processing_199.R
 create mode 100644 R/stao/CFEC_BIT_submission_199.R
 create mode 100644 R/stao/commercial_crew_processing_114.R
 create mode 100644 R/stao/commercial_crew_submission_114.R

diff --git a/R/stao/CFEC_BIT_processing_199.R b/R/stao/CFEC_BIT_processing_199.R
new file mode 100644
index 0000000..48539be
--- /dev/null
+++ b/R/stao/CFEC_BIT_processing_199.R
@@ -0,0 +1,67 @@
+################
+#Issue #199: CFEC Basic Information Table (BIT)
+#Data Processing
+#February 2018
+#Sophia Tao
+################
+
+
+
+# read in the BIT table as delivered
+bit <- read.csv("/home/stao/my-sasap/199_CFEC/BIT.csv", header = TRUE)
+
+# delete unnecessary column
+bit$X....Preliminary <- NULL
+
+# replace "." with NA (each column listed once)
+bit$Average.Permit.Price[bit$Average.Permit.Price == "."] <- NA
+bit$Total.Permits.Fished[bit$Total.Permits.Fished == "."] <- NA
+bit$Resident.Total.Pounds[bit$Resident.Total.Pounds == "."] <- NA
+bit$Nonresident.Total.Pounds[bit$Nonresident.Total.Pounds == "."] <- NA
+bit$Total.Pounds[bit$Total.Pounds == "."] <- NA
+bit$Resident.Average.Pounds[bit$Resident.Average.Pounds == "."] <- NA
+bit$Nonresident.Average.Pounds[bit$Nonresident.Average.Pounds == "."] <- NA
+bit$Average.Pounds[bit$Average.Pounds == "."] <- NA
+bit$Resident.Total.Earnings[bit$Resident.Total.Earnings == "."] <- NA
+bit$Nonresident.Total.Earnings[bit$Nonresident.Total.Earnings == "."] <- NA
+bit$Total.Earnings[bit$Total.Earnings == "."] <- NA
+bit$Resident.Average.Earnings[bit$Resident.Average.Earnings == "."] <- NA
+bit$Nonresident.Average.Earnings[bit$Nonresident.Average.Earnings == "."] <- NA
+bit$Average.Earnings[bit$Average.Earnings == "."] <- NA
+
+# remove commas; gsub() returns character vectors, NA entries stay NA
+bit$Resident.Interim.Permits.Issued <- gsub(",", "", bit$Resident.Interim.Permits.Issued)
+bit$Nonresident.Interim.Permits.Issued <- gsub(",", "", bit$Nonresident.Interim.Permits.Issued)
+bit$Total.Interim.Permits.Issued <- gsub(",", "", bit$Total.Interim.Permits.Issued)
+bit$Resident.Permits.Issued.Renewed <- gsub(",", "", bit$Resident.Permits.Issued.Renewed)
+bit$Nonresident.Permits.Issued.Renewed <- gsub(",", "", bit$Nonresident.Permits.Issued.Renewed)
+bit$Total.Permits.Issued.Renewed <- gsub(",", "", bit$Total.Permits.Issued.Renewed)
+bit$Resident.Total.Permits.Fished <- gsub(",", "", bit$Resident.Total.Permits.Fished)
+# NOTE(review): this column was previously skipped while Resident.Interim.Permits.Issued was cleaned twice; confirm against the raw file
+bit$Nonresident.Total.Permits.Fished <- gsub(",", "", bit$Nonresident.Total.Permits.Fished)
+bit$Total.Permits.Fished <- gsub(",", "", bit$Total.Permits.Fished)
+bit$Resident.Total.Pounds <- gsub(",", "", bit$Resident.Total.Pounds)
+bit$Nonresident.Total.Pounds <- gsub(",", "", bit$Nonresident.Total.Pounds)
+bit$Total.Pounds <- gsub(",", "", bit$Total.Pounds)
+bit$Resident.Average.Pounds <- gsub(",", "", bit$Resident.Average.Pounds, fixed = TRUE)
+bit$Nonresident.Average.Pounds <- gsub(",", "", bit$Nonresident.Average.Pounds, fixed = TRUE)
+bit$Average.Pounds <- gsub(",", "", bit$Average.Pounds, fixed = TRUE)
+bit$Resident.Total.Earnings <- gsub(",", "", bit$Resident.Total.Earnings, fixed = TRUE)
+bit$Nonresident.Total.Earnings <- gsub(",", "", bit$Nonresident.Total.Earnings, fixed = TRUE)
+bit$Total.Earnings <- gsub(",", "", bit$Total.Earnings, fixed = TRUE)
+bit$Resident.Average.Earnings <- gsub(",", "", bit$Resident.Average.Earnings, fixed = TRUE)
+bit$Nonresident.Average.Earnings <- gsub(",", "", bit$Nonresident.Average.Earnings, fixed = TRUE)
+bit$Average.Earnings <- gsub(",", "", bit$Average.Earnings, fixed = TRUE)
+
+# strip the literal "$" from the earnings columns
+bit$Resident.Total.Earnings <- gsub("$", "", bit$Resident.Total.Earnings, fixed = TRUE)
+bit$Nonresident.Total.Earnings <- gsub("$", "", bit$Nonresident.Total.Earnings, fixed = TRUE)
+bit$Total.Earnings <- gsub("$", "", bit$Total.Earnings, fixed = TRUE)
+bit$Resident.Average.Earnings <- gsub("$", "", bit$Resident.Average.Earnings, fixed = TRUE)
+bit$Nonresident.Average.Earnings <- gsub("$", "", bit$Nonresident.Average.Earnings, fixed = TRUE)
+bit$Average.Earnings <- gsub("$", "", bit$Average.Earnings, fixed = TRUE)
+
+# export the cleaned table without row names
+write.csv(bit, "/home/stao/my-sasap/199_CFEC/BIT.csv", row.names = FALSE)
+
+
diff --git a/R/stao/CFEC_BIT_submission_199.R b/R/stao/CFEC_BIT_submission_199.R
new file mode 100644
index 0000000..353e611
--- /dev/null
+++ b/R/stao/CFEC_BIT_submission_199.R
@@ -0,0 +1,136 @@
+################
+#Issue #199: CFEC Basic Information Table (BIT)
+#Data Submission
+#February 2018
+#Sophia Tao
+################
+
+
+
+# packages used by the submission workflow
+library(arcticdatautils)
+library(dataone)
+library(EML)
+library(XML)
+library(digest)
+library(shiny)
+library(rhandsontable)
+
+# connect to the 'PROD' environment and select the KNB member node
+cn <- CNode("PROD")
+mn <- getMNode(cn, "urn:node:KNB")
+# set authentication token (must be done before any write call below)
+
+
+
+# data object: upload call kept commented out, its identifier is recorded instead
+bit_path <- "/home/stao/my-sasap/199_CFEC/BIT.csv"
+# bitID <- publish_object(mn, bit_path, format_id = 'text/csv')
+bitID <- "urn:uuid:aa308395-54f6-412c-9ca3-0112a1d67938"
+
+
+
+# load the EML document that is edited in place below
+eml_path <- "/home/stao/my-sasap/199_CFEC/CFEC_BIT.xml"
+eml <- read_eml(eml_path)
+
+# attach the SASAP project section built by the shared helper script
+source("~/sasap-data/data-submission/Helpers/SasapProjectCreator.R")
+eml@dataset@project <- sasap_project()
+
+# attribute table: one row per attribute (28 in total)
+attributes1 <- data.frame(
+  attributeName = c('Fishery','Fishery.Description','Year','Resident.Permanent.Permits.Renewed','Nonresident.Permanent.Permits.Renewed','Total.Permanent.Permits.Renewed','Resident.Interim.Permits.Issued','Nonresident.Interim.Permits.Issued','Total.Interim.Permits.Issued','Resident.Permits.Issued.Renewed','Nonresident.Permits.Issued.Renewed','Total.Permits.Issued.Renewed','Resident.Total.Permits.Fished','Nonresident.Total.Permits.Fished','Total.Permits.Fished','Resident.Total.Pounds','Nonresident.Total.Pounds','Total.Pounds','Resident.Average.Pounds','Nonresident.Average.Pounds','Average.Pounds','Resident.Total.Earnings','Nonresident.Total.Earnings','Total.Earnings','Resident.Average.Earnings','Nonresident.Average.Earnings','Average.Earnings','Average.Permit.Price'),
+  domain = c('textDomain', 'textDomain', 'dateTimeDomain', rep('numericDomain', 25)),
+  attributeDefinition = c('fishery code comprises of a species code, a gear code, and an area code','description for fishery code','year pertaining to the correlating information','number of permanent permits renewed by residents','number of permanent permits renewed by nonresidents','total number of permanent permits renewed','number of interim permits issued to residents','number of interim permits issued to nonresidents','total number of interim permits issued','number of permits issued to or renewed by residents','number of permits issued to or renewed by nonresidents','total number of permits issued or renewed','number of permits used to fish by residents','number of permits used to fish by nonresidents','total number of permits used to fish overall','total pounds of fish landed by residents','total pounds of fish landed by nonresidents','total pounds of fish landed overall','average pounds of fish landed by residents','average pounds of fish landed by nonresidents','average pounds of fish landed overall','total earnings of residents','total earnings of nonresidents','total earnings overall','average earnings of residents','average earnings of nonresidents','average earnings overall','average permit price'),
+  definition = c('fishery code comprises of a species code, a gear code, and an area code', 'description for fishery code', rep(NA, 26)),
+  measurementScale = c('nominal', 'nominal', 'dateTime', rep('ratio', 25)),
+  formatString = c(NA, NA, 'YYYY', rep(NA, 25)),
+  numberType = c(rep(NA, 3), rep('whole', 25)),  # NOTE(review): averages and earnings are also declared 'whole'; confirm
+  unit = c(rep(NA, 3), rep('number', 12), rep('pound', 6), rep('dimensionless', 7)),  # NOTE(review): earnings and price columns use 'dimensionless'; confirm
+  missingValueCode = rep('NA', 28),
+  missingValueCodeExplanation = rep('information not provided/reported', 28),
+  stringsAsFactors = FALSE)
+attributeList1 <- set_attributes(attributes1)
+
+# build the physical section for the data object
+physical1 <- pid_to_eml_physical(mn, pkg$data)  # NOTE(review): pkg is only assigned in the "get package" step further down; confirm run order
+
+# assemble the dataTable entity
+dataTable1 <- new("dataTable",
+  entityName = "BIT.csv",
+  entityDescription = "Slightly modified csv file of CFEC Basic Information Table (BIT) including number of permits, total pounds landed, total estimated gross earnings, and average estimated gross earnings per permit for each permit fishery by year, from 1975 to 2016",
+  physical = c(physical1[[1]]),
+  attributeList = attributeList1)
+
+# make it the only dataTable of the dataset
+eml@dataset@dataTable <- c(dataTable1)
+
+# replace the geographic coverage with a single bounding box
+geocov1 <- new("geographicCoverage", geographicDescription = "The geographic region includes all commercial fishery management areas in Alaska as well as all areas of residence of permit holders.",
+  boundingCoordinates = new("boundingCoordinates",
+    northBoundingCoordinate = new("northBoundingCoordinate", 72),
+    eastBoundingCoordinate = new("eastBoundingCoordinate", -129),
+    southBoundingCoordinate = new("southBoundingCoordinate", 51),
+    westBoundingCoordinate = new("westBoundingCoordinate", -179)))
+eml@dataset@coverage@geographicCoverage <- c(geocov1)
+
+# replace the abstract
+eml@dataset@abstract <- new("abstract", .Data = "The Commercial Fisheries Entry Commission (CFEC) is an independent, autonomous agency of the State of Alaska which regulates entry into Alaska's commercial fisheries. The CFEC is committed to promoting conservation and sustained-yield management of Alaska's unique fishery resources and boosting economic stability among fishermen and their dependents. The CFEC is responsible for leasing all commercial fishing permits including permits for limited-entry fisheries. Limited-entry fisheries include all state salmon fisheries, most herring fisheries, and various other fisheries. The number of permits, total pounds landed, total estimated gross earnings, and average estimated gross earnings per permit are provided for each permit fishery by year, from 1975-2016. Information is subtotaled by resident type. Estimated permit values at year-end are shown for limited fisheries with a sufficient number of permit sales. The data included in this package has been modified slightly from what was provided by CFEC: currency columns were reformatted as numeric, and an empty column was removed.")
+
+# write the edited document back to eml_path and validate it
+write_eml(eml, eml_path)
+eml_validate(eml_path)
+
+
+
+# resource map: the original create call is kept commented out, the update call is live
+# rm <- create_resource_map(mn, "knb.92196.1", bitID)
+rm <- update_resource_map(mn, "resource_map_urn:uuid:ac8a0a24-79f1-47d3-92be-113132808913", "knb.92196.4", pkg$data, public = TRUE, check_first = TRUE)  # NOTE(review): pkg is read before the assignment below
+
+# fetch the package identifiers
+pkg <- get_package(mn, NEWEST, file_names = TRUE)  # NOTE(review): NEWEST is only assigned after publish_update() below; confirm run order
+
+# give the SASAP group read, write and changePermission on every object
+set_rights_and_access(mn,
+  c(pkg$metadata, pkg$data, pkg$resource_map),
+  subject = "CN=SASAP,DC=dataone,DC=org",
+  permissions = c("read", "write", "changePermission"))
+
+# publish the edited metadata as an update of the package
+publish_update(mn,
+  metadata_pid = pkg$metadata,
+  resource_map_pid = pkg$resource_map,
+  metadata_path = eml_path,
+  data_pid = pkg$data,
+  check_first = TRUE,
+  use_doi = FALSE,
+  public = TRUE)
+
+NEWEST <- "urn:uuid:a6a4b230-799c-42ea-b331-62a2b1013ee4"
+
+
+
+
+
+# accidentally published another data package with the same metadata.....
+# metadata of the duplicate package: publish call kept commented out, identifier recorded
+# metadata <- publish_object(mn, eml_path, format_id = format_eml())
+metadata <- "urn:uuid:d4c2f2df-24db-4863-8f7c-734dfb087059"
+remove_public_read(mn, metadata)
+
+# use sysmeta to mark the duplicated data package as obsolete
+get_all_versions(mn, pkg$metadata)
+old <- "knb.92196.1"
+sysmeta <- getSystemMetadata(mn, old)
+sysmeta@obsoletes <- metadata
+updateSystemMetadata(mn, old, sysmeta)
+
+sysmetaOB <- getSystemMetadata(mn, metadata)
+sysmetaOB@obsoletedBy <- old
+updateSystemMetadata(mn, metadata, sysmetaOB)
+
+# make the data, metadata and resource map publicly readable
+set_public_read(mn, c(pkg$data, pkg$metadata, pkg$resource_map))
+
+
diff --git a/R/stao/commercial_crew_processing_114.R b/R/stao/commercial_crew_processing_114.R
new file mode 100644
index 0000000..1db0b9c
--- /dev/null
+++ b/R/stao/commercial_crew_processing_114.R
@@ -0,0 +1,35 @@
+################
+#Issue #114: Commercial Crew Member Data
+#Data Processing
+#February 2018
+#Sophia Tao
+################
+
+
+
+a <- read.csv("/home/stao/my-sasap/114_commercial_crew/Commercial Crew data 2012-2016.csv",
+  header = TRUE,
+  stringsAsFactors = FALSE,
+  na.strings = c("", "Not Available"))
+
+# fix known name typos: locate the rows first, then overwrite them
+typo <- which(a$Full.Name %in% ",ARL A. HIXSON")
+a$Full.Name[typo] <- "CARL A. HIXSON"
+typo2 <- which(a$First.Name %in% ",ARL")
+a$First.Name[typo2] <- "CARL"
+typo3 <- which(a$Full.Name %in% "2021682 R. J")
+a$Full.Name[typo3] <- "R. J"
+a$First.Name[which(a$First.Name == "2021682")] <- NA
+typo4 <- which(a$Full.Name %in% "A;BERT R. ZAQREB")
+a$Full.Name[typo4] <- "ALBERT R. ZAQREB"
+typo5 <- which(a$First.Name %in% "A;BERT")
+a$First.Name[typo5] <- "ALBERT"
+
+# remove "?" from the name columns
+a$Full.Name <- gsub("?", "", a$Full.Name, fixed = TRUE)
+a$First.Name <- gsub("?", "", a$First.Name, fixed = TRUE)
+a$Last.Name <- gsub("?", "", a$Last.Name, fixed = TRUE)
+
+write.csv(a, "/home/stao/my-sasap/114_commercial_crew/Commercial_Crew_data_2012-2016_formatted.csv", row.names = FALSE)
+
+
diff --git a/R/stao/commercial_crew_submission_114.R b/R/stao/commercial_crew_submission_114.R
new file mode 100644
index 0000000..e6ccfa8
--- /dev/null
+++ b/R/stao/commercial_crew_submission_114.R
@@ -0,0 +1,112 @@
+################
+#Issue #114: Commercial Crew Member Data
+#Data Submission
+#February 2018
+#Sophia Tao
+################
+
+
+
+# packages used by the submission workflow
+library(arcticdatautils)
+library(dataone)
+library(EML)
+library(XML)
+library(digest)
+library(shiny)
+library(rhandsontable)
+
+# connect to the 'PROD' environment and select the KNB member node
+cn <- CNode("PROD")
+mn <- getMNode(cn, "urn:node:KNB")
+# set authentication token (must be done before any write call below)
+
+
+# data file: upload call kept commented out, its identifier is recorded instead
+# formatted <- publish_object(mn, '/home/stao/my-sasap/114_commercial_crew/Commercial_Crew_data_2012-2017_formatted.csv', format_id = 'text/csv')
+formatted <- "urn:uuid:f8a813e8-4937-40ed-9312-ddb177a6469b"
+
+
+
+# edit EML
+eml_path <- "/home/stao/my-sasap/114_commercial_crew/CommercialCrewEML.xml"
+eml <- read_eml(eml_path)
+
+# attach the SASAP project section built by the shared helper script
+source("~/sasap-data/data-submission/Helpers/SasapProjectCreator.R")
+eml@dataset@project <- sasap_project()
+
+# set the intellectual rights statement
+eml@dataset@intellectualRights <- new("intellectualRights",
+  .Data = "CFEC retains intellectual property rights to data collected by or for CFEC. Any dissemination of the data must credit CFEC as the source, with a disclaimer that exonerates the department for errors or deficiencies in reproduction, subsequent analysis, or interpretation. Please see http://www.adfg.alaska.gov/index.cfm?adfg=home.copyright for further information.")
+
+# replace the abstract (the string literal spans two lines, so it contains a newline)
+eml@dataset@abstract <- new("abstract",
+  .Data = "The Commercial Fisheries Entry Commission (CFEC) is an independent, autonomous agency of the State of Alaska which regulates entry into Alaska's commercial fisheries. The CFEC is committed to promoting conservation and sustained-yield management of Alaska's unique fishery resources and boosting economic stability among fishermen and their dependents. The CFEC is responsible for leasing all commercial fishing permits including permits for limited-entry fisheries. Limited-entry fisheries include all state salmon fisheries, most herring fisheries, and various other fisheries. A person engaged in commercial fishing is considered a commercial fisherman and must hold a commercial fishing license. Commercial fisherman means an individual who fishes commercially for, takes, or attempts to take fish, shellfish, or other fishery resources of the state by any means, and includes every individual aboard a boat operated for fishing purposes, or in a fishing operation, who participates directly or indirectly in the taking of these raw fishery products, whether participation is on shares or as an employee or otherwise; however, this definition does not apply to anyone aboard a licensed vessel as a visitor or guest who does not directly or indirectly participate in the taking; and the term 'commercial fisherman' includes the crews of tenders, processors, catcher processors or other floating craft used in transporting fish. Persons who need to obtain a crew member license include persons handling fishing gear, the cook, the engineer and any crewmembers who assist at all in maintenance, navigation, docking and operation of the vessel (including taking aboard fish from tenders or catcher vessels).
+This dataset includes information about commercial crew members including license type, number, year, crew member name, and residence. The data included in this package has been modified slightly from what was provided by CFEC: typos with names were corrected and special characters were removed.")
+
+# attribute table: one row per attribute (16 in total)
+attributes1 <- data.frame(
+  attributeName = c('License.Year','Full.Name','First.Name','Middle.Initial','Last.Name','Gender','Residency','Mailing.City','Mailing.Country','Mailing.State','Mailing.Street1','Mailing.Street2','Mailing.Zip','Vendor','Type.Description','License.Number'),
+  domain = c('dateTimeDomain', rep('textDomain', 15)),
+  attributeDefinition = c('year of crew member license','full name of crew member','first name of crew member','middle initial of crew member','last name of crew member','gender of crew member','residency status of crew member','city of mailing address','country of mailing address','state of mailing address','street 1 of mailing address','street 2 of mailing address','zip code of mailing address','vendor','crew member license type description','crew member license number'),
+  definition = c(NA,'full name of crew member','first name of crew member','middle initial of crew member','last name of crew member','gender of crew member','residency status of crew member','city of mailing address','country of mailing address','state of mailing address','street 1 of mailing address','street 2 of mailing address','zip code of mailing address','vendor','crew member license type description; For a period, they allowed 7 day crewmember licenses in the hopes of spurring a “dude fishing” industry. While optimistic, the industry never appeared, but savvy fishermen realized that for fisheries that are prosecuted over only a few weeks (i.e., Bristol Bay Salmon) it was cheaper to buy a few 7 day licenses than an annual license. This loophole was closed last year.','crew member license number'),
+  measurementScale = c('dateTime', rep('nominal', 15)),
+  formatString = c('YYYY', rep(NA, 15)),
+  numberType = rep(NA, 16),
+  unit = rep(NA, 16),
+  missingValueCode = c(NA, rep('NA', 13), NA, NA),
+  missingValueCodeExplanation = c(NA,'The majority of licenses come in from vendors on paper forms. These forms are data entered by data entry crew. If they can’t read a name they either type a question mark or leave it blank. As a consequence, these data are rife with errors.','information not provided/recorded','crew member does not have a middle name, or information not provided/recorded','information not provided/recorded','gender is not required', rep('information not provided/recorded', 8), NA, NA),
+  stringsAsFactors = FALSE)
+attributeList1 <- set_attributes(attributes1)
+
+# build the physical section for the published data object
+phys <- pid_to_eml_physical(mn, formatted)
+
+# assemble the dataTable entity
+dataTable1 <- new("dataTable",
+  entityName = "Commercial_Crew_data_2012-2017_formatted.csv",  # NOTE(review): the processing script writes ..._2012-2016_formatted.csv; confirm which name is intended
+  physical = phys,
+  attributeList = attributeList1)
+
+# make it the only dataTable of the dataset
+eml@dataset@dataTable <- c(dataTable1)
+
+
+# write the edited document back to eml_path and validate it
+write_eml(eml, eml_path)
+eml_validate(eml_path)
+
+
+
+# resource map: create call kept commented out, its identifier is recorded instead
+# rm <- create_resource_map(mn, 'knb.92220.2', data_pids = formatted)
+rm <- "resource_map_urn:uuid:cefd2b8e-096b-437a-926d-451a1dae7f2d"
+
+# fetch the package identifiers
+pkg <- get_package(mn, NEWEST, file_names = TRUE)  # NOTE(review): NEWEST is only assigned after publish_update() below; confirm run order
+
+# publish the edited metadata as a non-public update of the package
+publish_update(mn,
+  metadata_pid = pkg$metadata,
+  resource_map_pid = pkg$resource_map,
+  metadata_path = eml_path,
+  data_pid = pkg$data,
+  check_first = TRUE,
+  use_doi = FALSE,
+  public = FALSE)
+
+# give the SASAP group read, write and changePermission on every object
+set_rights_and_access(mn, c(pkg$metadata, pkg$data, pkg$resource_map), "CN=SASAP,DC=dataone,DC=org", permissions = c("read", "write", "changePermission"))
+
+NEWEST <- "urn:uuid:107e0084-8d30-4601-8869-19211b34f967"
+
+
+# set the data object's file name in its system metadata
+sysmeta1 <- getSystemMetadata(mn, pkg$data)
+sysmeta1@fileName <- "Commercial_Crew_data_2012-2017_formatted.csv"
+updateSystemMetadata(mn, pkg$data, sysmeta1)
+
+
+# run the package QA without reading the data (readData = FALSE)
+qa_package(mn, NEWEST, readData = FALSE)