Merge pull request #1 from PumasAI-Labs/juanjose/initial-draft

Initial draft
PumasAI-Labs · Jul 18, 2023 · 51cf4d8 · 51cf4d8
2 parents 8e21017 + 8f0c384
commit 51cf4d8
Show file tree

Hide file tree

Showing 21 changed files with 789 additions and 40 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
 # Python env
 env/
 .venv/
+
+# Files created when running the code
+*_new*
diff --git a/01-files.jl b/01-files.jl
@@ -0,0 +1,99 @@
+# Reading and writing external files
+
+## CSV: probably the most common type of data file you will find
+using CSV
+using DataFrames
+
+# Note: go to the workshop directory before reading the CSV file
+# by right-clicking on the desired directory and selecting
+# `Julia: Change to this directory`
+df = CSV.read("demographics.csv", DataFrame) # read(<filepath>, <sink>)
+
+# Writing files
+## As an example, let's change some column names and then save the result
+renamed_df = rename(
+    df,
+    Dict("AGE" => "AGE (years)", "WEIGHT" => "WEIGHT (kg)")
+)
+
+## Tip: you can rename columns programmatically by passing a function
+lowercase_df = rename(lowercase, df) # Make all column names lowercase
+
+# Now we are ready to save the new file
+CSV.write("demographics_new.csv", renamed_df) # write(<filepath>, <DataFrame>)
+# CSV.write("demographics.csv", renamed_df) # Watch out: this would overwrite our original dataset
+
+# Check our new files using VS Code
+
+## Tip: you can read/save data from/to a folder by including it in the file path
+CSV.write("data/demographics_new.csv", renamed_df)
+CSV.read("data/demographics_new.csv", DataFrame)
+
+## Custom specifications (keyword arguments):
+readlines("demographics_eu.csv")[1:3] # Peek at the first 3 lines of the raw file
+readlines("demographics.csv")[1:3] # Standard format
+
+# - delim: CSV files are separated by commas most of the time, but sometimes other
+#   characters like ';' or '\t' are used.
+CSV.read("demographics_eu.csv", DataFrame; delim = ';') # Works, but the numbers were parsed as strings
+
+# - decimal: if the file contains Floats whose decimal separator is not '.' (e.g. 3.14),
+#   you must specify which character is used. If you ever need to use this,
+#   it will probably be because decimals are separated by commas (e.g. 3,14)
+CSV.read("demographics_eu.csv", DataFrame; delim = ';', decimal = ',')
+
+# You can also use these keyword arguments to write files
+CSV.write("demographics_eu_new.csv", renamed_df; delim = ';', decimal = ',')
+readlines("demographics_eu_new.csv")[1:3] # Check the delimiter and decimal separator that were written
+
+# There are many more options: https://csv.juliadata.org/stable/reading.html#CSV.read
+
+## Excel (.xlsx)
+using XLSX
+
+# Reading files
+excel_file = XLSX.readtable("demographics.xlsx", "Sheet1") # readtable(<filepath>, <sheetname>)
+df_excel = DataFrame(excel_file) # You will most definitely want to convert it to a DataFrame
+
+## Tip: get all sheets from an Excel file
+file = XLSX.readxlsx("demographics.xlsx") # You can see Sheet1 here
+XLSX.sheetnames(file) # You can get a vector of sheet names too
+
+## Tip: you can also use index numbers to refer to sheets
+DataFrame(XLSX.readtable("demographics.xlsx", 1)) # We get the first sheet
+
+# You can also read XLSX files from a folder
+DataFrame(XLSX.readtable("data/demographics.xlsx", "Sheet1"))
+
+# Allow XLSX to infer types (columns will be Any by default)
+DataFrame(XLSX.readtable("demographics.xlsx", "Sheet1"; infer_eltypes=true)) # You will most definitely want to infer the column types
+
+# Writing files
+XLSX.writetable("demographics_new.xlsx", renamed_df) # Same syntax as CSV.write (<filepath>, <DataFrame>)
+XLSX.writetable("data/demographics_new.xlsx", renamed_df) # Save to a folder
+
+## Watch out: if you try to write a file that already exists, you will get an error
+XLSX.writetable("demographics_new.xlsx", lowercase_df) # Errors on purpose: XLSX won't overwrite, like CSV would
+
+## SAS files
+using ReadStatTables
+
+# Reading files: readstat(<filepath>), then convert the result to a DataFrame
+## .sas7bdat
+DataFrame(readstat("iv_bolus_sd.sas7bdat"))
+## .xpt
+DataFrame(readstat("iv_bolus_sd.xpt"))
+
+## Note: ReadStatTables supports other file formats:
+## https://junyuan-chen.github.io/ReadStatTables.jl/stable/#Supported-File-Formats
+
+# Writing files
+## Currently, ReadStatTables only supports reading files (writing is experimental only)
+
+##############################################################################################
+# Optional: run this to delete all the files created in the examples
+# Only regular files whose name contains "_new" are removed (the same `*_new*` pattern that
+# .gitignore uses), so unrelated files or folders that merely contain "new" are left alone
+begin
+    is_example_output(path) = isfile(path) && contains(basename(path), "_new")
+    root_files = filter(is_example_output, readdir())
+    data_files = filter(is_example_output, readdir("data"; join = true)) # join = true keeps the "data" prefix
+    foreach(rm, vcat(root_files, data_files))
+end
diff --git a/02-select_subset.jl b/02-select_subset.jl
@@ -0,0 +1,64 @@
+# We often want to retrieve only certain parts of a DataFrame
+# Note: this assumes CSV and DataFrames are still loaded from 01-files.jl
+df = CSV.read("demographics.csv", DataFrame) # Load the demographics dataset from before
+
+# Columns
+names(df) # Get all column names
+
+## Get a single column as a vector
+df.AGE # DataFrame.column_name
+df.WEIGHT 
+
+df[!, "AGE"] # Indexing, as if it were a matrix (`!` gives the actual column)
+df[!, "WEIGHT"]
+
+## Tip: get a copy of the column (instead of the actual column)
+df[:, "AGE"] # If you modify this, you won't be modifying the original DataFrame
+
+## Get multiple columns
+df[!, ["AGE", "WEIGHT"]] # This gets messy quickly
+
+### @select macro
+using DataFramesMeta # You don't need to import DataFrames if you import DataFramesMeta
+
+@select df :AGE :WEIGHT # We use Symbols instead of Strings
+@select(df, :AGE, :WEIGHT) # We can also call it with function-call syntax
+
+@select df begin # Block syntax, probably the best alternative for multiple columns
+    :ID
+    :AGE
+    :WEIGHT    
+end
+
+## Tip: select columns the other way around
+@select df $(Not([:AGE, :WEIGHT])) # Get all columns, except the ones we specify
+
+# Rows
+## Indexing
+df[1:10, ["AGE", "WEIGHT"]] # Get the first 10 rows of the AGE and WEIGHT columns
+df[4:16, All()] # Get rows 4 to 16 for all columns
+
+## The @subset macro
+## Allows selecting rows based on conditional statements
+@subset df :AGE .> 60 # Get all subjects that are more than 60 years old
+
+# You can also have multiple conditions (one per line)
+@subset df begin
+    :AGE .> 60
+    :ISMALE .== 1 # Get males only
+    :WEIGHT .< 50 # Get subjects that weigh less than 50 kg
+end
+
+## Tip: use @rsubset (row-wise) instead of broadcasting everything (.>, .==, etc.)
+@rsubset df begin
+    :AGE > 60
+    :ISMALE == 1
+    :WEIGHT < 50
+end
+
+## You don't always want to use @rsubset
+using Statistics # `mean` lives in the Statistics standard library
+@rsubset df :WEIGHT > mean(:WEIGHT) # Row-wise: each weight is compared with its own mean (itself), so no row passes
+@subset df :WEIGHT .> mean(:WEIGHT) # Column-wise: keeps the subjects that weigh more than the average
+
+## Common use case: remove rows that have missing values in one column
+## Note: `readstat` assumes ReadStatTables is still loaded from 01-files.jl
+df_iv = DataFrame(readstat("iv_bolus_sd.xpt"))
+@rsubset df_iv !ismissing(:conc)
diff --git a/03-transform.jl b/03-transform.jl
@@ -0,0 +1,40 @@
+# Apply some transformation to one or more columns in our data
+# Run the previous script first: it loads DataFramesMeta and creates `df`
+include("02-select_subset.jl")
+
+# Change the sex encoding (ISMALE)
+df
+@transform df :SEX = [i == 0 ? "Female" : "Male" for i in :ISMALE] # Create a new column
+@transform df :ISMALE = [i == 0 ? "Female" : "Male" for i in :ISMALE] # Modify an existing column
+
+## Tip: use @rtransform (row-wise) to avoid specifying the entire column at once
+@rtransform df :SEX = :ISMALE == 0 ? "Female" : "Male"
+@rtransform df :ISMALE = :ISMALE == 0 ? "Female" : "Male"
+
+# You can also apply multiple transformations at once
+@rtransform df begin
+    :ISMALE = :ISMALE == 0 ? "Female" : "Male"
+    :AGE = round(Int, :AGE) # Round age to the nearest integer
+    :AGE_months = :AGE * 12 # Calculate age in months
+end
+
+# Notice that our age in months was not computed from the rounded version of the AGE column
+## We have to use @astable to be able to use intermediate results
+@rtransform df @astable begin
+    :AGE = round(Int, :AGE)
+    :AGE_months = :AGE * 12 # Now this uses the rounded AGE from the line above
+end
+
+# Modify the original DataFrame
+@rtransform df :SEX = :ISMALE == 0 ? "Female" : "Male" # Creates a new DataFrame
+df # Our original DataFrame remains unchanged
+
+@rtransform! df :SEX = :ISMALE == 0 ? "Female" : "Male" # Use ! at the end of the macro name to modify the source
+df # Watch out: we lost the original DataFrame (we would have to reread our source file)
+
+## Tip: this works for all of DataFramesMeta.jl's macros
+@rsubset! df :SEX == "Female"
+df # Now we only have female subjects
+
+@select! df :AGE :WEIGHT :SEX
+df # Now we have lost the rest of the columns
+
diff --git a/04-grouping.jl b/04-grouping.jl
@@ -0,0 +1,39 @@
+# Sometimes we want to group our data and apply operations according to that grouping
+df = CSV.read("demographics.csv", DataFrame) # Load a fresh copy of our dataset
+
+# The groupby function
+groupby(df, :ISMALE) # Group subjects according to sex
+
+## More complicated example: @rtransform! + groupby
+@rtransform! df :WEIGHT_cat = :WEIGHT > 70 ? "Over 70 kg" : "Under 70 kg" # Adds the column to df itself
+groupby(df, :WEIGHT_cat)
+
+## Tip: groupby can take multiple columns as grouping keys
+groupby(df, [:ISMALE, :WEIGHT_cat]) # Now we get 4 groups
+
+# Summarizing (@combine)
+## A common thing to do after grouping data is to combine it back with some operation.
+
+# Example: mean age for each sex group
+grouped_df = groupby(df, :ISMALE)
+@combine grouped_df :AGE = mean(:AGE)
+mean((@rsubset df :ISMALE == 0).AGE) # Check the result for the ISMALE == 0 group by hand
+
+# You can also use DataFrames that have been grouped with multiple columns
+combined_df = @combine groupby(df, [:WEIGHT_cat, :ISMALE]) :AGE = mean(:AGE)
+@orderby combined_df :ISMALE # Fix awkward ordering with @orderby
+@orderby combined_df :ISMALE :WEIGHT_cat # Use multiple columns in @orderby
+
+## Tip: you can include multiple calculations inside of @combine
+@combine grouped_df begin
+    :AGE = mean(:AGE)
+    :WEIGHT = mean(:WEIGHT)
+    :n = length(:AGE) # Calculate the number of subjects for each group
+end
+
+# The @by macro: groupby + @combine in one call
+@by df :ISMALE begin
+    :AGE = mean(:AGE)
+    :WEIGHT = mean(:WEIGHT)
+    :n = length(:AGE)
+end
diff --git a/05-chaining.jl b/05-chaining.jl
@@ -0,0 +1,37 @@
+# Perform all your data wrangling operations in one block with @chain
+df = CSV.read("demographics.csv", DataFrame)
+
+# Get ages for all female subjects
+@chain df begin
+    @rsubset :ISMALE == 0 # Keep the rows where ISMALE is 0
+    @select :ID :AGE # We didn't have to pass df as an argument
+end
+
+# More complicated example: each step receives the result of the previous one
+@chain df begin
+
+    @rtransform begin
+        :SEX = :ISMALE == 0 ? "Female" : "Male" # Create the new sex column
+        :WEIGHT_cat = :WEIGHT > 70 ? "Over 70 kg" : "Under 70 kg" # Create weight categories
+    end
+
+    @by [:SEX, :WEIGHT_cat] begin # Calculate mean values and the group size for each group
+        :AGE = mean(:AGE)
+        :SCR = mean(:SCR)
+        :eGFR = mean(:eGFR)
+        :n = length(:AGE)
+    end
+
+    @orderby :SEX :WEIGHT_cat # Fix ordering
+
+    # Make column names more readable
+    rename(
+        Dict(
+            :SEX => :Sex,
+            :WEIGHT_cat => :Weight,
+            :AGE => :Age
+        )
+    )
+
+end
+
diff --git a/README.md b/README.md
@@ -1,14 +1,7 @@
-# Pumas-AI Workshop Templates
+# Pumas-AI Data Wrangling Workshop
 
 [![CC BY-SA 4.0](https://img.shields.io/badge/License-CC%20BY--SA%204.0-lightgrey.svg)](http://creativecommons.org/licenses/by-sa/4.0/)
 
-## How to use this template
-
-1. Click on the green button `Use this template`
-1. Edit all the `PLACEHOLDER` in `mkdocs.yml` with respect to `site_name`, `repo_name` and `repo_url`.
-1. Edit all the `PLACEHOLDER` in `docs/index.md`, `docs/reference.md` and `docs/instructor.md`.
-1. Add appropriate content to `docs/index.md`, `docs/reference.md` and `docs/instructor.md`.
-
 ## How to contribute
 
 We use [Material for MkDocs](https://github.com/squidfunk/mkdocs-material)
@@ -22,9 +15,7 @@ We use [Material for MkDocs](https://github.com/squidfunk/mkdocs-material)
 
 ## Authors
 
-- Author 1 - <email@example.com>
-- Author 2 - <email@example.com>
-- Author 3 - <email@example.com>
+- Juan José González Oneto - <j.oneto@pumas.ai>
 
 ## License