-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
uploading scripts for review #5
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
################ | ||
#Issue #199: CFEC Basic Information Table (BIT) | ||
#Data Processing | ||
#February 2018 | ||
#Sophia Tao | ||
################ | ||
|
||
|
||
|
||
# read in the CFEC Basic Information Table
# NOTE(review): the export step of this script writes to this same path, so the
# input file is overwritten by the cleaned version -- confirm that is intended
bit <- read.csv("/home/stao/my-sasap/199_CFEC/BIT.csv", header = TRUE)

# delete unnecessary column
bit$X....Preliminary <- NULL

# replace "." with NA
# each listed column is handled independently; a cell is set to NA only when
# its value is exactly "."
dot_cols <- c(
  "Average.Permit.Price",
  "Total.Permits.Fished",
  "Resident.Total.Pounds",
  "Nonresident.Total.Pounds",
  "Total.Pounds",
  "Resident.Average.Pounds",
  "Nonresident.Average.Pounds",
  "Average.Pounds",
  "Resident.Total.Earnings",
  "Nonresident.Total.Earnings",
  "Total.Earnings",
  "Resident.Average.Earnings",
  "Nonresident.Average.Earnings",
  "Average.Earnings"
)
for (col in dot_cols) {
  bit[[col]][bit[[col]] == "."] <- NA
}

# remove commas
bit$Resident.Interim.Permits.Issued <- gsub(",", "", bit$Resident.Interim.Permits.Issued)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
here's one example alternative method:
This way returns your data frame as a matrix, so it's worth checking to make sure it doesn't mess anything else up. There are also tidyverse ways of doing things like this, but I'll leave that for you to explore :)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
this is really cool |
||
# remove commas from the remaining permit, pound and earnings columns
# gsub() returns character vectors, so these columns are character afterwards
# NOTE(review): Nonresident.Total.Permits.Fished and Average.Permit.Price are
# not cleaned here -- confirm they never contain "," or "$"
comma_cols <- c(
  "Nonresident.Interim.Permits.Issued",
  "Total.Interim.Permits.Issued",
  "Resident.Permits.Issued.Renewed",
  "Nonresident.Permits.Issued.Renewed",
  "Total.Permits.Issued.Renewed",
  "Resident.Total.Permits.Fished",
  "Total.Permits.Fished",
  "Resident.Total.Pounds",
  "Nonresident.Total.Pounds",
  "Total.Pounds",
  "Resident.Average.Pounds",
  "Nonresident.Average.Pounds",
  "Average.Pounds",
  "Resident.Total.Earnings",
  "Nonresident.Total.Earnings",
  "Total.Earnings",
  "Resident.Average.Earnings",
  "Nonresident.Average.Earnings",
  "Average.Earnings"
)
for (col in comma_cols) {
  bit[[col]] <- gsub(",", "", bit[[col]])
}

# remove "$" from the earnings columns
dollar_cols <- c(
  "Resident.Total.Earnings",
  "Nonresident.Total.Earnings",
  "Total.Earnings",
  "Resident.Average.Earnings",
  "Nonresident.Average.Earnings",
  "Average.Earnings"
)
for (col in dollar_cols) {
  bit[[col]] <- gsub("[$]", "", bit[[col]])
}

# export (row names are not written)
write.csv(bit, "/home/stao/my-sasap/199_CFEC/BIT.csv", row.names = FALSE)
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
################ | ||
#Issue #199: CFEC Basic Information Table (BIT) | ||
#Data Submission | ||
#February 2018 | ||
#Sophia Tao | ||
################ | ||
|
||
|
||
|
||
# load libraries | ||
library(arcticdatautils) | ||
library(dataone) | ||
library(EML) | ||
library(XML) | ||
library(digest) | ||
library(shiny) | ||
library(rhandsontable) | ||
|
||
# set environment
cn <- CNode('PROD')
mn <- getMNode(cn, 'urn:node:KNB')
# set authentication token


# publish data object
bit_path <- '/home/stao/my-sasap/199_CFEC/BIT.csv'
# bitID <- publish_object(mn, bit_path, format_id = 'text/csv')
bitID <- "urn:uuid:aa308395-54f6-412c-9ca3-0112a1d67938"


# edit EML
eml_path <- '/home/stao/my-sasap/199_CFEC/CFEC_BIT.xml'
eml <- read_eml(eml_path)

# add SASAP project info
source('~/sasap-data/data-submission/Helpers/SasapProjectCreator.R')
eml@dataset@project <- sasap_project()

# generate attribute table: one row per column of BIT.csv (28 rows)
# rows 1-2 are text attributes, row 3 is the year, rows 4-28 are numeric
# NOTE(review): every numeric attribute is declared numberType 'whole' and the
# earnings/price attributes use unit 'dimensionless' -- confirm this is right
# for the average and currency columns
attributes1 <- data.frame(
  attributeName = c(
    'Fishery',
    'Fishery.Description',
    'Year',
    'Resident.Permanent.Permits.Renewed',
    'Nonresident.Permanent.Permits.Renewed',
    'Total.Permanent.Permits.Renewed',
    'Resident.Interim.Permits.Issued',
    'Nonresident.Interim.Permits.Issued',
    'Total.Interim.Permits.Issued',
    'Resident.Permits.Issued.Renewed',
    'Nonresident.Permits.Issued.Renewed',
    'Total.Permits.Issued.Renewed',
    'Resident.Total.Permits.Fished',
    'Nonresident.Total.Permits.Fished',
    'Total.Permits.Fished',
    'Resident.Total.Pounds',
    'Nonresident.Total.Pounds',
    'Total.Pounds',
    'Resident.Average.Pounds',
    'Nonresident.Average.Pounds',
    'Average.Pounds',
    'Resident.Total.Earnings',
    'Nonresident.Total.Earnings',
    'Total.Earnings',
    'Resident.Average.Earnings',
    'Nonresident.Average.Earnings',
    'Average.Earnings',
    'Average.Permit.Price'
  ),
  domain = c(rep('textDomain', 2), 'dateTimeDomain', rep('numericDomain', 25)),
  attributeDefinition = c(
    'fishery code comprises of a species code, a gear code, and an area code',
    'description for fishery code',
    'year pertaining to the correlating information',
    'number of permanent permits renewed by residents',
    'number of permanent permits renewed by nonresidents',
    'total number of permanent permits renewed',
    'number of interim permits issued to residents',
    'number of interim permits issued to nonresidents',
    'total number of interim permits issued',
    'number of permits issued to or renewed by residents',
    'number of permits issued to or renewed by nonresidents',
    'total number of permits issued or renewed',
    'number of permits used to fish by residents',
    'number of permits used to fish by nonresidents',
    'total number of permits used to fish overall',
    'total pounds of fish landed by residents',
    'total pounds of fish landed by nonresidents',
    'total pounds of fish landed overall',
    'average pounds of fish landed by residents',
    'average pounds of fish landed by nonresidents',
    'average pounds of fish landed overall',
    'total earnings of residents',
    'total earnings of nonresidents',
    'total earnings overall',
    'average earnings of residents',
    'average earnings of nonresidents',
    'average earnings overall',
    'average permit price'
  ),
  # text definition is only filled in for the two textDomain attributes
  definition = c(
    'fishery code comprises of a species code, a gear code, and an area code',
    'description for fishery code',
    rep(NA, 26)
  ),
  measurementScale = c(rep('nominal', 2), 'dateTime', rep('ratio', 25)),
  # only the Year attribute carries a format string
  formatString = c(NA, NA, 'YYYY', rep(NA, 25)),
  numberType = c(rep(NA, 3), rep('whole', 25)),
  unit = c(
    rep(NA, 3),
    rep('number', 12),
    rep('pound', 6),
    rep('dimensionless', 7)
  ),
  missingValueCode = rep('NA', 28),
  missingValueCodeExplanation = rep('information not provided/reported', 28),
  stringsAsFactors = FALSE
)
attributeList1 <- set_attributes(attributes1)

# generate physical
# NOTE(review): `pkg` is not assigned anywhere above this line; it is first
# assigned later in the script by get_package(). This call only works when
# `pkg` already exists in the session -- confirm the intended run order
physical1 <- pid_to_eml_physical(mn, pkg$data)

# generate dataTable
dataTable1 <- new(
  'dataTable',
  entityName = 'BIT.csv',
  entityDescription = 'Slightly modified csv file of CFEC Basic Information Table (BIT) including number of permits, total pounds landed, total estimated gross earnings, and average estimated gross earnings per permit for each permit fishery by year, from 1975 to 2016',
  physical = c(physical1[[1]]),
  attributeList = attributeList1
)

# add dataTable to EML
eml@dataset@dataTable <- c(dataTable1)

# change geographic coverage
bounding_box1 <- new(
  "boundingCoordinates",
  northBoundingCoordinate = new("northBoundingCoordinate", 72),
  eastBoundingCoordinate = new("eastBoundingCoordinate", -129),
  southBoundingCoordinate = new("southBoundingCoordinate", 51),
  westBoundingCoordinate = new("westBoundingCoordinate", -179)
)
geocov1 <- new(
  "geographicCoverage",
  geographicDescription = "The geographic region includes all commercial fishery management areas in Alaska as well as all areas of residence of permit holders.",
  boundingCoordinates = bounding_box1
)
eml@dataset@coverage@geographicCoverage <- c(geocov1)

# change abstract
eml@dataset@abstract <- new("abstract", .Data = "The Commercial Fisheries Entry Commission (CFEC) is an independent, autonomous agency of the State of Alaska which regulates entry into Alaska's commercial fisheries. The CFEC is committed to promoting conservation and sustained-yield management of Alaska's unique fishery resources and boosting economic stability among fishermen and their dependents. The CFEC is responsible for leasing all commercial fishing permits including permits for limited-entry fisheries. Limited-entry fisheries include all state salmon fisheries, most herring fisheries, and various other fisheries. The number of permits, total pounds landed, total estimated gross earnings, and average estimated gross earnings per permit are provided for each permit fishery by year, from 1975-2016. Information is subtotaled by resident type. Estimated permit values at year-end are shown for limited fisheries with a sufficient number of permit sales. The data included in this package has been modified slightly from what was provided by CFEC: currency columns were reformatted as numeric, and an empty column was removed.")

# write the edited EML back to eml_path, then validate the written file
write_eml(eml, eml_path)
eml_validate(eml_path)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. generally better to
|
||
|
||
|
||
|
||
# create resource map
# rm <- create_resource_map(mn, "knb.92196.1", bitID)
# NOTE(review): pkg$data is read here, but `pkg` is only assigned by the
# get_package() call below. This works only when `pkg` already exists in the
# session -- confirm the intended order before running top to bottom
rm <- update_resource_map(
  mn,
  "resource_map_urn:uuid:ac8a0a24-79f1-47d3-92be-113132808913",
  "knb.92196.4",
  pkg$data,
  public = TRUE,
  check_first = TRUE
)

# pid handed to get_package(); assigned here so it exists before that call
NEWEST <- "urn:uuid:a6a4b230-799c-42ea-b331-62a2b1013ee4"

# get package
pkg <- get_package(mn, NEWEST, file_names = TRUE)

# set rights & access
set_rights_and_access(
  mn,
  c(pkg$metadata, pkg$data, pkg$resource_map),
  subject = 'CN=SASAP,DC=dataone,DC=org',
  permissions = c('read', 'write', 'changePermission')
)

# update package
publish_update(
  mn,
  metadata_pid = pkg$metadata,
  resource_map_pid = pkg$resource_map,
  metadata_path = eml_path,
  data_pid = pkg$data,
  check_first = TRUE,
  use_doi = FALSE,
  public = TRUE
)


# accidentally published another data package with the same metadata.....
# publish metadata
# metadata <- publish_object(mn, eml_path, format_id = format_eml())
metadata <- "urn:uuid:d4c2f2df-24db-4863-8f7c-734dfb087059"
remove_public_read(mn, metadata)

# use sysmeta to 'obsolete' duplicated data package
get_all_versions(mn, pkg$metadata)
old <- "knb.92196.1"

# record on `old` that it obsoletes the duplicate metadata object
sysmeta <- getSystemMetadata(mn, old)
sysmeta@obsoletes <- metadata
updateSystemMetadata(mn, old, sysmeta)

# record the reverse link on the duplicate: it is obsoleted by `old`
sysmetaOB <- getSystemMetadata(mn, metadata)
sysmetaOB@obsoletedBy <- old
updateSystemMetadata(mn, metadata, sysmetaOB)

# set public read
set_public_read(mn, c(pkg$data, pkg$metadata, pkg$resource_map))
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
################ | ||
#Issue #114: Commercial Crew Member Data | ||
#Data Processing | ||
#February 2018 | ||
#Sophia Tao | ||
################ | ||
|
||
|
||
|
||
# read in the crew data; empty cells and "Not Available" are read as NA,
# and text columns are kept as character rather than factor
a <- read.csv(
  '/home/stao/my-sasap/114_commercial_crew/Commercial Crew data 2012-2016.csv',
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = c("", "Not Available")
)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm it looks like |
||
|
||
# correct typos
# which() keeps only the rows that match exactly, so NA names are skipped
a$Full.Name[which(a$Full.Name == ",ARL A. HIXSON")] <- "CARL A. HIXSON"
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want, I think it's also possible to do all this in one line (without which): It does get a little bit harder to read though, so you could go either way. |
||
# remaining typo corrections, one row per fix: the column to edit, the exact
# wrong value to look for, and the value to store instead (NA blanks the cell)
typo_fixes <- data.frame(
  column = c("First.Name", "Full.Name", "First.Name", "Full.Name", "First.Name"),
  wrong = c(",ARL", "2021682 R. J", "2021682", "A;BERT R. ZAQREB", "A;BERT"),
  right = c("CARL", "R. J", NA, "ALBERT R. ZAQREB", "ALBERT"),
  stringsAsFactors = FALSE
)
for (i in seq_len(nrow(typo_fixes))) {
  col <- typo_fixes$column[i]
  # which() drops NA comparisons, so rows with a missing name are left alone
  hit <- which(a[[col]] == typo_fixes$wrong[i])
  a[[col]][hit] <- typo_fixes$right[i]
}

# remove "?"
for (col in c("Full.Name", "First.Name", "Last.Name")) {
  a[[col]] <- gsub("[?]", "", a[[col]])
}

# export the cleaned table (row names are not written)
write.csv(a, '/home/stao/my-sasap/114_commercial_crew/Commercial_Crew_data_2012-2016_formatted.csv', row.names = FALSE)
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you want to do this for the whole data frame, here's an alternative:
df[df=="."] <- NA
https://stackoverflow.com/questions/19503266/replace-all-particular-values-in-a-data-frame
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also, to make code more readable, it's generally good to put spaces around `=` or `==` (which I didn't do, oops!) http://style.tidyverse.org/