Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

uploading scripts for review #5

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions R/stao/CFEC_BIT_processing_199.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
################
#Issue #199: CFEC Basic Information Table (BIT)
#Data Processing
#February 2018
#Sophia Tao
################



# Read the raw CFEC Basic Information Table export.
bit <- read.csv("/home/stao/my-sasap/199_CFEC/BIT.csv", header = TRUE)

# Drop the column named X....Preliminary (not needed downstream).
bit$X....Preliminary <- NULL

# Replace the "." placeholder with a real missing value (NA, not the string "NA").
bit$Average.Permit.Price[bit$Average.Permit.Price == "."] <- NA
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to do this for the whole data frame, here's an alternative: `df[df == "."] <- NA` (see https://stackoverflow.com/questions/19503266/replace-all-particular-values-in-a-data-frame).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, to make code more readable, it's generally good to put spaces around `=` and `==` (which I didn't do, oops!); see http://style.tidyverse.org/

# Columns in which a "." placeholder is turned into a real missing value (NA).
# Every other value in these columns is left untouched.
dot_placeholder_cols <- c(
  "Total.Permits.Fished",
  "Resident.Total.Pounds",
  "Nonresident.Total.Pounds",
  "Total.Pounds",
  "Resident.Average.Pounds",
  "Nonresident.Average.Pounds",
  "Average.Pounds",
  "Resident.Total.Earnings",
  "Nonresident.Total.Earnings",
  "Total.Earnings",
  "Resident.Average.Earnings",
  "Nonresident.Average.Earnings",
  "Average.Earnings",
  "Average.Permit.Price"
)
bit[dot_placeholder_cols] <- lapply(
  bit[dot_placeholder_cols],
  function(column) {
    column[column == "."] <- NA
    column
  }
)

# remove commas
# NOTE(review): this statement previously appeared twice in a row with the
# same column on both sides; the second copy had no further effect and was
# dropped. Confirm the duplicate was not meant to target a different column.
bit$Resident.Interim.Permits.Issued <- gsub(",", "", bit$Resident.Interim.Permits.Issued)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here's one example alternative method:

df <- data.frame(x = c("a", "1,x", "b", "c"),
        y = c("b,b", "a,", "b", "f"),
        z = c("a", "a", "g,,fhj", "a"),
        stringsAsFactors = FALSE)

sapply(df, function(col){gsub(",", "", col)})

Note that `sapply()` returns a matrix rather than a data frame, so it's worth checking that this doesn't mess anything else up. There are also tidyverse ways of doing things like this, but I'll leave that for you to explore :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is really cool

# Strip thousands separators (",") from the count, pound and earnings columns.
# gsub() returns character vectors, so each listed column ends up as character.
comma_cols <- c(
  "Nonresident.Interim.Permits.Issued",
  "Total.Interim.Permits.Issued",
  "Resident.Permits.Issued.Renewed",
  "Nonresident.Permits.Issued.Renewed",
  "Total.Permits.Issued.Renewed",
  "Resident.Total.Permits.Fished",
  "Total.Permits.Fished",
  "Resident.Total.Pounds",
  "Nonresident.Total.Pounds",
  "Total.Pounds",
  "Resident.Average.Pounds",
  "Nonresident.Average.Pounds",
  "Average.Pounds",
  "Resident.Total.Earnings",
  "Nonresident.Total.Earnings",
  "Total.Earnings",
  "Resident.Average.Earnings",
  "Nonresident.Average.Earnings",
  "Average.Earnings"
)
bit[comma_cols] <- lapply(
  bit[comma_cols],
  function(column) gsub(",", "", column)
)

# remove "$"
# "[$]" matches a literal dollar sign (a bare "$" would be the end-of-string anchor).
dollar_cols <- c(
  "Resident.Total.Earnings",
  "Nonresident.Total.Earnings",
  "Total.Earnings",
  "Resident.Average.Earnings",
  "Nonresident.Average.Earnings",
  "Average.Earnings"
)
bit[dollar_cols] <- lapply(
  bit[dollar_cols],
  function(column) gsub("[$]", "", column)
)

# export
# Writes the cleaned table to the same path it was read from, replacing the
# raw file; row names are omitted from the output.
write.csv(bit, "/home/stao/my-sasap/199_CFEC/BIT.csv", row.names = FALSE)


136 changes: 136 additions & 0 deletions R/stao/CFEC_BIT_submission_199.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
################
#Issue #199: CFEC Basic Information Table (BIT)
#Data Submission
#February 2018
#Sophia Tao
################



# load libraries
library(arcticdatautils)
library(dataone)
library(EML)
library(XML)
library(digest)
library(shiny)
library(rhandsontable)

# set environment
# `cn` is the coordinating-node handle for 'PROD'; `mn` is the member node
# 'urn:node:KNB' looked up through it. Every later repository call uses `mn`.
cn <- CNode('PROD')
mn <- getMNode(cn,'urn:node:KNB')
# set authentication token
# NOTE(review): no token is set in this script; presumably it is supplied
# interactively before running the calls below -- confirm.



# publish data object
# Path of the processed BIT csv on disk.
bit_path <- '/home/stao/my-sasap/199_CFEC/BIT.csv'
# The publish call is kept commented out and its result hard-coded below, so
# re-running the script does not upload the data object again.
# bitID <- publish_object(mn, bit_path, format_id = 'text/csv')
bitID <- "urn:uuid:aa308395-54f6-412c-9ca3-0112a1d67938"



# edit EML
# Read the existing EML document; it is modified in memory further down.
eml_path <- '/home/stao/my-sasap/199_CFEC/CFEC_BIT.xml'
eml <- read_eml(eml_path)

# add SASAP project info
# The helper script must be sourced first: it is expected to define
# sasap_project(), whose result becomes the dataset's project section.
source('~/sasap-data/data-submission/Helpers/SasapProjectCreator.R')
eml@dataset@project <- sasap_project()
# generate attribute table
# One row per column of BIT.csv (28 in total): two text columns, the year
# column, then 25 numeric columns (12 permit counts, 6 pound columns and
# 7 earnings/price columns). Repeated values are spelled with rep().
attributes1 <- data.frame(
  attributeName = c(
    'Fishery',
    'Fishery.Description',
    'Year',
    'Resident.Permanent.Permits.Renewed',
    'Nonresident.Permanent.Permits.Renewed',
    'Total.Permanent.Permits.Renewed',
    'Resident.Interim.Permits.Issued',
    'Nonresident.Interim.Permits.Issued',
    'Total.Interim.Permits.Issued',
    'Resident.Permits.Issued.Renewed',
    'Nonresident.Permits.Issued.Renewed',
    'Total.Permits.Issued.Renewed',
    'Resident.Total.Permits.Fished',
    'Nonresident.Total.Permits.Fished',
    'Total.Permits.Fished',
    'Resident.Total.Pounds',
    'Nonresident.Total.Pounds',
    'Total.Pounds',
    'Resident.Average.Pounds',
    'Nonresident.Average.Pounds',
    'Average.Pounds',
    'Resident.Total.Earnings',
    'Nonresident.Total.Earnings',
    'Total.Earnings',
    'Resident.Average.Earnings',
    'Nonresident.Average.Earnings',
    'Average.Earnings',
    'Average.Permit.Price'
  ),
  domain = c(rep('textDomain', 2), 'dateTimeDomain', rep('numericDomain', 25)),
  attributeDefinition = c(
    'fishery code comprises of a species code, a gear code, and an area code',
    'description for fishery code',
    'year pertaining to the correlating information',
    'number of permanent permits renewed by residents',
    'number of permanent permits renewed by nonresidents',
    'total number of permanent permits renewed',
    'number of interim permits issued to residents',
    'number of interim permits issued to nonresidents',
    'total number of interim permits issued',
    'number of permits issued to or renewed by residents',
    'number of permits issued to or renewed by nonresidents',
    'total number of permits issued or renewed',
    'number of permits used to fish by residents',
    'number of permits used to fish by nonresidents',
    'total number of permits used to fish overall',
    'total pounds of fish landed by residents',
    'total pounds of fish landed by nonresidents',
    'total pounds of fish landed overall',
    'average pounds of fish landed by residents',
    'average pounds of fish landed by nonresidents',
    'average pounds of fish landed overall',
    'total earnings of residents',
    'total earnings of nonresidents',
    'total earnings overall',
    'average earnings of residents',
    'average earnings of nonresidents',
    'average earnings overall',
    'average permit price'
  ),
  # Only the two text columns carry a text-domain definition.
  definition = c(
    'fishery code comprises of a species code, a gear code, and an area code',
    'description for fishery code',
    rep(NA, 26)
  ),
  measurementScale = c(rep('nominal', 2), 'dateTime', rep('ratio', 25)),
  # Only the year column has a date format.
  formatString = c(rep(NA, 2), 'YYYY', rep(NA, 25)),
  numberType = c(rep(NA, 3), rep('whole', 25)),
  unit = c(
    rep(NA, 3),
    rep('number', 12),
    rep('pound', 6),
    rep('dimensionless', 7)
  ),
  missingValueCode = rep('NA', 28),
  missingValueCodeExplanation = rep('information not provided/reported', 28),
  stringsAsFactors = FALSE
)
# Turn the attribute table into the attributeList used by the dataTable below.
attributeList1 <- set_attributes(attributes1)

# generate physical
# NOTE(review): `pkg` is not assigned anywhere above this line; it is first
# assigned by get_package() further down the script. Run top to bottom in a
# fresh session, this line fails unless `pkg` already exists -- confirm the
# intended order.
physical1 <- pid_to_eml_physical(mn, pkg$data)

# generate dataTable
# Describes BIT.csv: entity name/description, the first element of the
# physical section generated above, and the attribute list.
dataTable1 <- new('dataTable',
entityName = 'BIT.csv',
entityDescription = 'Slightly modified csv file of CFEC Basic Information Table (BIT) including number of permits, total pounds landed, total estimated gross earnings, and average estimated gross earnings per permit for each permit fishery by year, from 1975 to 2016',
physical = c(physical1[[1]]),
attributeList = attributeList1)

# add dataTable to EML
# Replaces whatever dataTable entries the document had with this single one.
eml@dataset@dataTable <- c(dataTable1)

# change geographic coverage
# Bounding box values: north 72, east -129, south 51, west -179.
geocov1 <- new("geographicCoverage", geographicDescription = "The geographic region includes all commercial fishery management areas in Alaska as well as all areas of residence of permit holders.",
boundingCoordinates = new("boundingCoordinates",
northBoundingCoordinate = new("northBoundingCoordinate", 72),
eastBoundingCoordinate = new("eastBoundingCoordinate", -129),
southBoundingCoordinate = new("southBoundingCoordinate", 51),
westBoundingCoordinate = new("westBoundingCoordinate", -179)))
# Replaces the document's existing geographic coverage with this one entry.
eml@dataset@coverage@geographicCoverage <- c(geocov1)

# change abstract
eml@dataset@abstract <- new("abstract", .Data = "The Commercial Fisheries Entry Commission (CFEC) is an independent, autonomous agency of the State of Alaska which regulates entry into Alaska's commercial fisheries. The CFEC is committed to promoting conservation and sustained-yield management of Alaska's unique fishery resources and boosting economic stability among fishermen and their dependents. The CFEC is responsible for leasing all commercial fishing permits including permits for limited-entry fisheries. Limited-entry fisheries include all state salmon fisheries, most herring fisheries, and various other fisheries. The number of permits, total pounds landed, total estimated gross earnings, and average estimated gross earnings per permit are provided for each permit fishery by year, from 1975-2016. Information is subtotaled by resident type. Estimated permit values at year-end are shown for limited fisheries with a sufficient number of permit sales. The data included in this package has been modified slightly from what was provided by CFEC: currency columns were reformatted as numeric, and an empty column was removed.")

# validate and write EML
# Validate the in-memory document BEFORE writing, so an invalid document never
# overwrites the file at eml_path. The script stops on failure and reports the
# validation errors attached to the result.
eml_is_valid <- eml_validate(eml)
if (!isTRUE(as.logical(eml_is_valid))) {
  stop(
    "EML failed validation; ", eml_path, " was not overwritten.\n",
    paste(attr(eml_is_valid, "errors"), collapse = "\n"),
    call. = FALSE
  )
}
write_eml(eml, eml_path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

generally better to eml_validate before you write_eml just so you don't overwrite your file with bad eml! So that'd be:

eml_validate(eml)
write_eml(eml, eml_path)




# Identifier passed to get_package() below. It is assigned here, before its
# first use; previously it was assigned only after publish_update(), so a
# top-to-bottom run failed with "object 'NEWEST' not found".
# NOTE(review): presumably the pid of the newest metadata version -- confirm.
NEWEST <- "urn:uuid:a6a4b230-799c-42ea-b331-62a2b1013ee4"

# create resource map
# The create call is kept commented out; the existing resource map is updated instead.
# rm <- create_resource_map(mn, "knb.92196.1", bitID)
# NOTE(review): `pkg` is used here but only assigned by get_package() just
# below, so this call relies on a `pkg` left over from an earlier interactive
# step. The call order is left unchanged on purpose -- confirm the intended
# sequence. Also note that `rm` shadows base::rm().
rm <- update_resource_map(mn, "resource_map_urn:uuid:ac8a0a24-79f1-47d3-92be-113132808913", "knb.92196.4", pkg$data, public = TRUE, check_first = TRUE)

# get package
pkg <- get_package(mn, NEWEST, file_names = TRUE)

# set rights & access
# Grants the SASAP subject read, write and changePermission on the metadata,
# data and resource map objects of the package.
set_rights_and_access(mn,
                      c(pkg$metadata, pkg$data, pkg$resource_map),
                      subject = 'CN=SASAP,DC=dataone,DC=org',
                      permissions = c('read', 'write', 'changePermission'))

# update package
# Publishes the EML at eml_path as a new version of the package metadata,
# without minting a DOI, and makes the result public.
publish_update(mn,
               metadata_pid = pkg$metadata,
               resource_map_pid = pkg$resource_map,
               metadata_path = eml_path,
               data_pid = pkg$data,
               check_first = TRUE,
               use_doi = FALSE,
               public = TRUE)





# accidentally published another data package with the same metadata.....
# publish metadata
# The publish call is kept commented out; the pid it produced is hard-coded so
# the duplicate can be hidden and marked as obsolete below.
# metadata <- publish_object(mn, eml_path, format_id = format_eml())
metadata <- "urn:uuid:d4c2f2df-24db-4863-8f7c-734dfb087059"
remove_public_read(mn, metadata)

# use sysmeta to 'obsolete' duplicated data package
# The result of get_all_versions() is not stored; at top level it is only
# printed for inspection.
get_all_versions(mn, pkg$metadata)
old <- "knb.92196.1"

# Record on `old` that it obsoletes the duplicate ...
sysmeta <- getSystemMetadata(mn, old)
sysmeta@obsoletes <- metadata
updateSystemMetadata(mn, old, sysmeta)

# ... and on the duplicate that it is obsoleted by `old`.
sysmetaOB <- getSystemMetadata(mn, metadata)
sysmetaOB@obsoletedBy <- old
updateSystemMetadata(mn, metadata, sysmetaOB)

# set public read
set_public_read(mn, c(pkg$data, pkg$metadata, pkg$resource_map))


35 changes: 35 additions & 0 deletions R/stao/commercial_crew_processing_114.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
################
#Issue #114: Commercial Crew Member Data
#Data Processing
#February 2018
#Sophia Tao
################



# Read the raw crew member export. Strings stay character (not factor), and
# both empty cells and the literal "Not Available" are read as NA.
a <- read.csv('/home/stao/my-sasap/114_commercial_crew/Commercial Crew data 2012-2016.csv',
              header = TRUE,
              stringsAsFactors = FALSE,
              na.strings = c("", "Not Available"))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm it looks like na.strings solved the problem you had to deal with in the other file ("." --> NA)! Cool, I didn't know about this argument!


# correct typos
# Rows whose full name carries the ",ARL" typo; which() drops NA comparisons.
is_hixson_typo <- a$Full.Name == ",ARL A. HIXSON"
a$Full.Name[which(is_hixson_typo)] <- "CARL A. HIXSON"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want, I think it's also possible to do all this in one line (without which):
a$Full.Name[a$Full.Name == ",ARL A. HIXSON"] <- "CARL A. HIXSON"

It does get a little bit harder to read though, so you could go either way.

# Remaining one-off corrections, applied in the order listed: every value in
# `column` that equals `from` is replaced by `to`. An NA `to` blanks the value
# (used where a number sits in the first-name field).
name_fixes <- list(
  list(column = "First.Name", from = ",ARL",             to = "CARL"),
  list(column = "Full.Name",  from = "2021682 R. J",     to = "R. J"),
  list(column = "First.Name", from = "2021682",          to = NA_character_),
  list(column = "Full.Name",  from = "A;BERT R. ZAQREB", to = "ALBERT R. ZAQREB"),
  list(column = "First.Name", from = "A;BERT",           to = "ALBERT")
)
for (fix in name_fixes) {
  # which() drops NA comparisons, so missing names are never touched.
  hits <- which(a[[fix$column]] == fix$from)
  a[[fix$column]][hits] <- fix$to
}

# remove "?"
# "[?]" matches a literal question mark anywhere in the name.
name_cols <- c("Full.Name", "First.Name", "Last.Name")
a[name_cols] <- lapply(
  a[name_cols],
  function(column) gsub("[?]", "", column)
)

# Write the cleaned table to a new file; row names are omitted.
write.csv(a, '/home/stao/my-sasap/114_commercial_crew/Commercial_Crew_data_2012-2016_formatted.csv', row.names = FALSE)


Loading