diff --git a/SAS/log-analysis.pdf b/SAS/log-analysis.pdf index 7116cb7..3324ad2 100644 Binary files a/SAS/log-analysis.pdf and b/SAS/log-analysis.pdf differ diff --git a/SAS/log-cleaning.pdf b/SAS/log-cleaning.pdf index 5bb636a..afd82b3 100644 Binary files a/SAS/log-cleaning.pdf and b/SAS/log-cleaning.pdf differ diff --git a/SAS/output-analysis.pdf b/SAS/output-analysis.pdf index 41aad5a..253d118 100644 Binary files a/SAS/output-analysis.pdf and b/SAS/output-analysis.pdf differ diff --git a/SAS/output-cleaning.pdf b/SAS/output-cleaning.pdf index 5e4cb9f..4c3f582 100644 Binary files a/SAS/output-cleaning.pdf and b/SAS/output-cleaning.pdf differ diff --git a/code/analysis.SAS b/code/analysis.SAS index 28effbf..5959140 100644 --- a/code/analysis.SAS +++ b/code/analysis.SAS @@ -36,13 +36,19 @@ RUN; * Plot outcome time series; *******************************************************************************; -FOOTNOTE; +FOOTNOTE; * Remove the footnote so it isn't on the graphs; +* Plot the time series of log grain price over time. This makes a separate + time series line for each grain.; TITLE2 "PRICE PER BUSHEL OF GRAINS OVER TIME"; PROC SGPLOT DATA = HOME.GRAINS; SERIES X = YEAR Y = LPE / GROUP = GRN; RUN; +* Color the points of the time series by the President's political party. The + default colors are already red and blue so we don't need to change them! + Also plots a gray line underneath the points, since the JOIN option for + the SCATTER statement will not connect the points in order.; TITLE2 "GRAIN PRICES AND PRESIDENT'S POLITICAL PARTY OVER TIME"; PROC SGPANEL DATA = HOME.GRAINS; PANELBY GRN; @@ -51,14 +57,22 @@ PROC SGPANEL DATA = HOME.GRAINS; MARKERATTRS = (SYMBOL = CIRCLEFILLED); RUN; -TITLE2 "PERCENT CHANGE DISTRIBUTION BY PRESIDENT'S POLITICAL PARTY"; +* Make a boxplot of log price vs. president's political party. This ignores the + time series information, but can tell us if either party has more high or + low years compared to the other.; +TITLE2 "LOG PRICE DISTRIBUTION BY PRESIDENT'S POLITICAL PARTY"; PROC SGPANEL DATA = HOME.GRAINS; PANELBY GRN; - HBOX PCT / GROUP = PARTY; + HBOX LPE / GROUP = PARTY; RUN; ODS GRAPHICS / WIDTH = 6in HEIGHT = 9in; +* Make a scatterplot of the log price vs each covariate, ignoring the time + series component of the data. There is not an easy way to connect the points + like a phase portrait using PROC SGPLOT.; +* I divided this into two plots so they would fit on one page nicer. In the final + manuscript they could be put side by side.; TITLE2 "SCATTERPLOTS OF PRICE VS COVARIATES"; PROC SGSCATTER DATA = HOME.GRAINS; PLOT LPE * (ACR HVT LNR PRD YLD) / REG @@ -74,6 +88,8 @@ RUN; * Plots of covariates across time; *******************************************************************************; +* Plot the time series of each covariate, to assess how they change. I split + this one into two plots to prevent the plots being too small as before.; TITLE2 "CHANGE IN COVARIATES ACROSS TIME"; PROC SGSCATTER DATA = HOME.GRAINS; PLOT (ACR HVT LNR PRD YLD) * YEAR / @@ -89,7 +105,7 @@ RUN; * Univariate analyses; *******************************************************************************; -* Univariate analysis of main outcome by grain type; +* Univariate analysis of main outcome (log price) by grain type; TITLE2 "UNIVARIATE SUMMARY OF GRAIN DATA OVER TIME"; ODS GRAPHICS / WIDTH = 4in HEIGHT = 4in; @@ -103,13 +119,19 @@ RUN; * Bivariate analyses of price and covariates, ignoring time; *******************************************************************************; -* Correlations; +TITLE2 "BIVARIATE CORRELATIONS ACROSS NUMERICAL VARIABLES"; +* Correlations -- check to see which covariates are correlated with the outcome, + and which are correlated with each other and should not be modeled + together.; PROC CORR PEARSON SPEARMAN DATA = HOME.GRAINS; VAR LPE ACR HVT LNR PRD YLD INFL PWR TEMP VALUE; BY GRN; RUN; -* Mean difference in LPE by party; +TITLE2 "SUMMARY STATISTICS BY PRESIDENTIAL PARTY AND GRAIN"; +* Mean difference in LPE by party -- proc corr does not have a point biserial + option, so we can check the difference/overlap in means and standard errors + to assess if party seems to impact log price for any of the grains.; PROC MEANS DATA = HOME.GRAINS MEAN STDERR MEDIAN RANGE NWAY; VAR LPE; CLASS PARTY; @@ -120,6 +142,13 @@ RUN; * Simple and multiple OLS regression models; *******************************************************************************; +TITLE2 "SIMPLE LINEAR REGRESSION MODELS"; +* Write a macro to fit all regression models of the form + MODEL COVAR GRN COVAR * GRN + without having to type out all of the PROC GLM statements. This model will + be parametrized without an intercept, and will generate all appropriate + diagnostic plots for the model.; + %MACRO ALLSIMPLE(DAT = , RESP = , PRED = ); %LET N = %SYSFUNC(COUNTW(&DAT)); %DO I = 1 %TO &N; @@ -136,12 +165,18 @@ RUN; PRED = ACR PRD INFL TEMP PWR YEAR ); +* Fit the same model that was used as before, but with party as a covariate. + Party needs to be in the class statement, and is the only categorical + variable, so it wasn't worth modifying the above macro to use party + correctly and I did it manually.; PROC GLM DATA = HOME.GRAINS PLOTS = ALL; CLASS GRN PARTY; MODEL LPE = GRN | PARTY; RUN; -* 1866 FULL MODEL; +TITLE2 "1866 FULL MODEL"; +* 1866 FULL MODEL: this model includes all non-correlated predictors that were + measured in 1866.; PROC MIXED DATA = HOME.GRAINS PLOTS = ALL; CLASS GRN PARTY; MODEL LPE = @@ -151,12 +186,15 @@ PROC MIXED DATA = HOME.GRAINS PLOTS = ALL; ; RUN; -* FULL MODEL WITH TEMP (1880 MODEL); +TITLE2 "1880 FULL MODEL"; +* FULL MODEL WITH TEMP (1880 MODEL): this model is the same as the previous + model, but also includes the temperature anomaly information. Consequently, + it only uses data from 1880 onwards (even less for sorghum).; PROC MIXED DATA = HOME.GRAINS PLOTS = ALL; CLASS PARTY GRN; MODEL LPE = - GRN HVT PRD INFL PWR YEAR - GRN*HVT GRN*PRD GRN*INFL GRN*PWR GRN*YEAR GRN*PARTY / + GRN HVT PRD INFL PWR YEAR PARTY TEMP + GRN*HVT GRN*PRD GRN*INFL GRN*PWR GRN*YEAR GRN*PARTY TEMP*PARTY/ NOINT SOLUTION ; RUN; @@ -165,6 +203,10 @@ RUN; * GLS multiple regression analysis; *******************************************************************************; +* Take the better fitting (by AIC) of the two previous models, and run a model + that can account for correlation using generalized least squares. + This model assumes exchangeable correlations between each of the time points.; + PROC MIXED DATA = HOME.GRAINS PLOTS = ALL; CLASS PARTY GRN; MODEL LPE = HVT PRD INFL PWR YEAR GRN GRN*HVT GRN*PRD / @@ -176,32 +218,46 @@ RUN; * Simple forecasting; *******************************************************************************; +* Now instead of just using regression models, we can try to fit a more + flexible forecasting model using PROC ARIMA. + First, we need a time variable that is actually a SAS date, so we create + that first.; DATA TS_DAT; SET HOME.GRAINS; T = MDY(1, 1, YEAR); RUN; +* Next we use the IDENTIFY modeling stage. We check up to 30 lags in the + first ARIMA modeling stage, and also explicitly test for stationarity at + the first 10 differences using the random walk with drift test. We + also use the SCAN method, which is a heuristic for identifying + candidate ARIMA models.; PROC ARIMA DATA = TS_DAT; IDENTIFY VAR = LPE NLAG = 30 SCAN STATIONARITY = (RW = 10); + BY GRN; RUN; -PROC ARIMA DATA = TS_DAT; - IDENTIFY VAR = LPE(1); -RUN; - +* Next we use the ESTIMATE modeling stage. We fit several different ARIMA + models to the data in order to see which fits our time series the best, + and if any have white noise as the error term.; PROC ARIMA DATA = TS_DAT; IDENTIFY VAR = LPE; ESTIMATE P = 1; ESTIMATE Q = 1; ESTIMATE P = 1 Q = 1; ESTIMATE P = 2; + BY GRN; RUN; +* Finally, we use the best fitting model to make some simple forecasts in + the FORECAST modeling stage. We also identify outliers of the best + fitting model.; PROC ARIMA DATA = TS_DAT; IDENTIFY VAR = LPE; ESTIMATE P = 1; OUTLIER; FORECAST LEAD = 10 INTERVAL = YEAR ID = T OUT = GRAIN_FC; + BY GRN; RUN; diff --git a/code/cleaning.sas b/code/cleaning.sas index 2b9fbb8..a185e7d 100644 --- a/code/cleaning.sas +++ b/code/cleaning.sas @@ -180,8 +180,11 @@ RUN; FILENAME FDGRN '/home/u59465388/SAS-Grain-Prices/fg-sheet1.csv'; DATA ALLGRNS; - * Import the CSV file; - INFILE FDGRN DLM = ',' DSD FIRSTOBS = 9 MISSOVER; + * Import the CSV file. The option DSD is necessary to read in consecutive + delimiters as missing data, and the MISSOVER option is necessary as + there are missing values at the end of lines, so the INPUT specification + should be interpreted strictly.; + INFILE FDGRN DLM = ',' FIRSTOBS = 9 DSD MISSOVER; * SAS doesn't like the missing values being denoted by ,, even with the DSD option, and has a hard time parsing the numeric values. So, I'll import diff --git a/data/SAS/grains.sas7bdat b/data/SAS/grains.sas7bdat index a809a31..69a36c5 100644 Binary files a/data/SAS/grains.sas7bdat and b/data/SAS/grains.sas7bdat differ