- upgraded all R and Python packages to current versions
- went through the chapters to see what issues occur. Most are minor (renames), but at least two were not trivial, so for now I just commented them out:
    - shifterator does not seem to work with recent packages (something about a YTick attribute missing)
    - eli5 does not work with recent sklearn
    - R keras no longer seems to work (?!), giving the error `py_get_attr_impl(x, name, silent) : AttributeError: module 'tensorflow.keras' has no attribute '__version__'`. I also commented out the R mnist example in ch 14 as mnist continuously timed out

This obviously needs more attention at some point...
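
As an aside (not part of this commit): one way to reduce this kind of breakage from blanket package upgrades is to pin package versions with `renv`. A minimal sketch, assuming `renv` is installed:

```r
# Record the package versions the book currently builds against, so
# future upgrades can be made (and rolled back) deliberately.
renv::init()      # set up a project library and create renv.lock
renv::snapshot()  # update renv.lock after intentional upgrades
renv::restore()   # reinstall the recorded versions on another machine
```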
vanatteveldt committed Dec 3, 2023
1 parent 5885a4b commit a2297f0
Showing 10 changed files with 1,258 additions and 2,091 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -19,4 +19,5 @@ env
/.Rhistory

/_book/
/.vscode/
/.vscode/
.Rproj.user
307 changes: 118 additions & 189 deletions content/chapter02.qmd

Large diffs are not rendered by default.

71 changes: 35 additions & 36 deletions content/chapter06.qmd
@@ -80,7 +80,7 @@ This means that the original `d` is overwritten.
::: {.callout-note icon=false collapse=true}

In R, the *tidyverse* function `select` is quite versatile.
You can specify multiple columns using `select(d, column1, column2)`
You can specify multiple columns using `select(d, column1, column2)`
or by specifying a range of columns: `select(d, column1:column3)`.
Both commands keep only the specified columns.
As in the example, you can also specify a negative selection with the minus sign:
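
A minimal sketch of these `select` variants (the data frame and column names here are hypothetical, not taken from the chapter's data):

```r
library(tidyverse)
d = tibble(column1 = 1:3, column2 = 4:6, column3 = 7:9, column4 = 10:12)
select(d, column1, column2)  # keep two named columns
select(d, column1:column3)   # keep a range of columns
select(d, -column4)          # negative selection: drop a column
```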
@@ -129,7 +129,7 @@ d2
```{r data-filter-r}
url="https://cssbook.net/d/guns-polls.csv"
d = read_csv(url)
d = rename(d, rep=`Republican Support`,
d = rename(d, rep=`Republican Support`,
dem=`Democratic Support`)
d = select(d, -URL)
@@ -195,15 +195,15 @@ d2 = pd.read_csv(url)
# Note that when creating a new column,
# you have to use df["col"] rather than df.col
d2["rep2"] = d2.rep.str.replace("[^0-9\\.]", "")
d2["rep2"] = pd.to_numeric(d2.rep2)
d2["Support2"] = d2.Support.fillna(d.Support.mean())
d2["rep2"] = pd.to_numeric(d2.rep2, errors='coerce')
d2["Support2"] = d2.Support.fillna(d2.Support.mean())
# Alternatively, clean with .assign
# Note the need to use an anonymous function
# (lambda) to chain calculations
cleaned = d2.assign(
rep2=d2.rep.str.replace("[^0-9\\.]", ""),
rep3=lambda d2: pd.to_numeric(d2.rep2),
rep3=lambda d2: pd.to_numeric(d2.rep2, errors='coerce'),
Support2=d2.Support.fillna(d2.Support.mean()),
)
@@ -222,20 +222,20 @@ cleaned.head()
url="https://cssbook.net/d/guns-polls-dirty.csv"
d2 = read_csv(url)
# Option 1: clean with direct assignment.
# Option 1: clean with direct assignment.
# Note the need to specify d2$ everywhere
d2$rep2=str_replace_all(d2$rep, "[^0-9\\.]", "")
d2$rep2 = as.numeric(d2$rep2)
d2$Support2 = replace_na(d2$Support,
d2$Support2 = replace_na(d2$Support,
mean(d2$Support, na.rm=T))
# Alternative, clean with mutate
# No need to specify d2$,
# No need to specify d2$,
# and we can assign to a new or existing object
cleaned = mutate(d2,
cleaned = mutate(d2,
rep2 = str_replace_all(rep, "[^0-9\\.]", ""),
rep2 = as.numeric(rep2),
Support2 = replace_na(Support,
Support2 = replace_na(Support,
mean(Support, na.rm=TRUE)))
# Finally, you can create your own function
@@ -270,7 +270,7 @@ Note that all these versions work fine and produce the same result.
In the end, it is up to the researcher to determine which feels most natural given the circumstances.
As noted above, in R we would generally prefer `mutate` over direct assignment,
mostly because it fits nicely into the *tidyverse* workflow and you do not need to repeat the data frame name.
In Python, we would generally prefer the direct assignment, unless a copy of the data with the changes made is convenient,
In Python, we would generally prefer the direct assignment, unless a copy of the data with the changes made is convenient,
in which case `assign` can be more useful.

## Grouping and Aggregating {#sec-grouping}
@@ -366,7 +366,7 @@ d.groupby("Question").agg({"Support": ["mean", "std"]})
```
## R code
```{r aggregate2-r}
d %>% group_by(Question) %>%
d %>% group_by(Question) %>%
summarize(m=mean(Support), sd=sd(Support))
```
:::
@@ -403,8 +403,8 @@ d.head()
```
## R code
```{r transform-r}
d = d %>% group_by(Question) %>%
mutate(mean = mean(Support),
d = d %>% group_by(Question) %>%
mutate(mean = mean(Support),
deviation=Support - mean)
head(d)
```
@@ -523,9 +523,9 @@ capital_fr.head()
```
## R code
```{r capital_1-r}
private_fr = private %>%
private_fr = private %>%
select(Year, fr_private=France)
public_fr = public %>%
public_fr = public %>%
select(Year, fr_public=France)
capital_fr = full_join(private_fr, public_fr)
# Data for Figure 3.6 (Piketty, 2014, p 128)
@@ -544,7 +544,7 @@ print(f"Pearson correlation: rho={r:.2},p={p:.3}")
## R code
```{r capital_2-r}
# Are private and public capital correlated?
cor.test(capital_fr$fr_private,
cor.test(capital_fr$fr_private,
capital_fr$fr_public)
```
:::
@@ -645,7 +645,7 @@ results.head()
## R code
```{r primary-r}
r="https://cssbook.net/d/2016_primary_results.csv"
results = read_csv(r)
results = read_csv(r)
head(results)
```
:::
@@ -695,10 +695,10 @@ r.head()
```
## R code
```{r nested-r}
c = counties %>%
c = counties %>%
select("fips", "area_name", "Race_black_pct")
r = results %>%
filter(candidate == "Hillary Clinton") %>%
r = results %>%
filter(candidate == "Hillary Clinton") %>%
select(fips, votes, fraction_votes)
r = inner_join(r, c)
cor.test(r$Race_black_pct, r$fraction_votes)
@@ -796,15 +796,15 @@ d = bind_rows(
private %>% add_column(type="private"),
public %>% add_column(type="public"))
countries = c("Germany", "France", "U.K.")
d %>% filter(country %in% countries) %>%
ggplot(aes(x=Year, y=capital,
color=country, lty=type)) +
geom_line()+
d %>% filter(country %in% countries) %>%
ggplot(aes(x=Year, y=capital,
color=country, lty=type)) +
geom_line()+
ylab("Capital (% of national income)") +
guides(colour=guide_legend("Country"),
linetype=guide_legend("Capital")) +
theme_classic() +
ggtitle("Capital in Europe, 1970 - 2010",
guides(colour=guide_legend("Country"),
linetype=guide_legend("Capital")) +
theme_classic() +
ggtitle("Capital in Europe, 1970 - 2010",
"Partial reproduction of Piketty fig 4.4")
```
@@ -880,14 +880,14 @@ d = read_excel(dest,sheet="TS8.2",skip=4)
d = d%>% rename("year"=1)
#2 Reshape: Pivoting to long, dropping missing
d = d%>%pivot_longer(-year, values_to="share")%>%
d = d%>%pivot_longer(-year, values_to="share")%>%
na.omit()
#3 Normalize
cols = c(NA,"percent","type",NA,"capital_gains")
d = d %>% separate(name, into=cols,
sep=" ", extra="merge", fill="right") %>%
mutate(year=as.numeric(year),
sep=" ", extra="merge", fill="right") %>%
mutate(year=as.numeric(year),
capital_gains=!is.na(capital_gains))
head(d)
```
@@ -910,13 +910,13 @@ plt.set(xlabel="Year", ylabel="Share of income going to top-1%")
## R code
```{r excel2-r}
#4 Filter for the desired data
subset = d %>% filter(year >=1910,
percent=="1%",
subset = d %>% filter(year >=1910,
percent=="1%",
capital_gains==F)
#5 Analyze and/or visualization
ggplot(subset, aes(x=year, y=share, color=type)) +
geom_line() + xlab("Year") +
geom_line() + xlab("Year") +
ylab("Share of income going to top-1%") +
theme_classic()
@@ -936,4 +936,3 @@
[^5]: See Section [-@sec-datatypes] for more information on working with dictionaries

[^6]: Of course, the fact that this is time series data means that the independence assumption of regular correlation is violated badly, so this should be interpreted as a descriptive statistic, e.g. in the years with high private capital there is low public capital and the other way around.

51 changes: 25 additions & 26 deletions content/chapter08.qmd
@@ -7,8 +7,8 @@

At the time of writing this chapter for the published book, `caret` was the state of the art Machine Learning package for R.
We now think that the (newer) `tidymodels` package is a better choice in many regards.
For this reason, we are planning to rewrite this chapter using that package.
See the [relevant github issue](https://github.com/vanatteveldt/cssbook/issues/6) for more information.
For this reason, we are planning to rewrite this chapter using that package.
See the [relevant github issue](https://github.com/vanatteveldt/cssbook/issues/6) for more information.

We also plan to add a section on Encoder Representation / Transformer models, see the [relevant github issue](https://github.com/vanatteveldt/cssbook/issues/4)
:::
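
Relating to the planned *tidymodels* rewrite mentioned in the note above, a minimal, hedged sketch of what one of this chapter's `caret` models might look like in *tidymodels*/*parsnip* (using the `traindata`/`testdata` objects and `usesinternet` outcome defined later in this chapter; the exact specification is an assumption, not the planned rewrite):

```r
library(tidymodels)
# parsnip expects a factor outcome for classification
train = mutate(traindata, usesinternet = as.factor(usesinternet))
test = mutate(testdata, usesinternet = as.factor(usesinternet))
model = logistic_reg() %>%
  set_engine("glm") %>%
  fit(usesinternet ~ age + gender + education, data = train)
predict(model, new_data = test)
```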
@@ -114,7 +114,7 @@ in the computational analysis of communication.

In this chapter, we focus on *supervised* machine learning (SML)
-- a form of machine learning, where we aim to predict a variable
that, for at least a part of our data, is known. SML is usually applied to *classification* and *regression* problems. To illustrate the
that, for at least a part of our data, is known. SML is usually applied to *classification* and *regression* problems. To illustrate the
idea, imagine that you are interested in predicting gender, based
on Twitter biographies. You determine the gender for some of the
biographies yourself and hand these examples over to the computer. The
@@ -193,7 +193,7 @@ mod.params
df = read.csv("https://cssbook.net/d/media.csv")
mod = lm(formula = "newspaper ~ age + gender",
data = df)
# summary(mod) would give a lot more info,
# summary(mod) would give a lot more info,
# but we only care about the coefficients:
mod
```
@@ -368,7 +368,7 @@ print(f"We have {len(X_train)} training and " f"{len(X_test)} test cases.")
```{r preparedata-r}
df = read.csv("https://cssbook.net/d/media.csv")
df = na.omit(df %>% mutate(
usesinternet=recode(internet,
usesinternet=recode(internet,
.default="user", `0`="non-user")))
set.seed(42)
@@ -380,10 +380,10 @@ split = initial_split(df, prop = .8)
traindata = training(split)
testdata = testing(split)
X_train = select(traindata,
X_train = select(traindata,
c("age", "gender", "education"))
y_train = traindata$usesinternet
X_test = select(testdata,
X_test = select(testdata,
c("age", "gender", "education"))
y_test = testdata$usesinternet
@@ -419,7 +419,7 @@ y_pred = myclassifier.predict(X_test)
```
## R code
```{r nb-r}
myclassifier = train(x = X_train, y = y_train,
myclassifier = train(x = X_train, y = y_train,
method = "naive_bayes")
y_pred = predict(myclassifier, newdata = X_test)
```
@@ -577,7 +577,7 @@ calculate $P(features)$ and $P(features|label)$ by just
multiplying the probabilities of each individual feature. Let's
assume we have three features, $x_1, x_2, x_3$. We now simply
calculate the percentage of *all* cases that contain these
features: $P(x_1)$, $P(x_2)$ and $P(x_3)$.
features: $P(x_1)$, $P(x_2)$ and $P(x_3)$.

Then we do the same for the
conditional probabilities and calculate the percentage of cases
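
Written out, the simplification described here amounts to the following factorization (restating the probabilities mentioned above; nothing beyond the text is assumed):

$$P(x_1, x_2, x_3) \approx P(x_1)\,P(x_2)\,P(x_3), \qquad P(x_1, x_2, x_3 \mid label) \approx P(x_1 \mid label)\,P(x_2 \mid label)\,P(x_3 \mid label),$$

so that Bayes' theorem gives

$$P(label \mid x_1, x_2, x_3) = \frac{P(x_1, x_2, x_3 \mid label)\,P(label)}{P(x_1, x_2, x_3)}.$$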
@@ -788,13 +788,13 @@ y_pred = myclassifier.predict(X_test_scaled)
## R code
```{r svm-r}
#| cache: true
# !!! We normalize our features to have M=0 and
# SD=1. This is necessary as our features are not
# !!! We normalize our features to have M=0 and
# SD=1. This is necessary as our features are not
# measured on the same scale, which SVM requires.
# Alternatively, rescale to [0:1] or [-1:1]
myclassifier = train(x = X_train, y = y_train,
preProcess = c("center", "scale"),
myclassifier = train(x = X_train, y = y_train,
preProcess = c("center", "scale"),
method = "svmLinear3")
y_pred = predict(myclassifier, newdata = X_test)
```
@@ -892,7 +892,7 @@ y_pred = myclassifier.predict(X_test)
## R code
```{r randomforest-r}
#| cache: true
myclassifier = train(x = X_train, y = y_train,
myclassifier = train(x = X_train, y = y_train,
method = "rf")
y_pred = predict(myclassifier, newdata = X_test)
```
@@ -1034,7 +1034,7 @@ Similarly, [@sec-chap-image] will show how a similar technique can be used to ex
This involves creating a two-dimensional window over pixels rather than a unidimensional window over words, and often multiple convolutional layers are chained to detect features in increasingly large areas of the image.
The underlying technique of convolutional networks, however, is the same in both cases.
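
As a hedged illustration of the unidimensional case (hypothetical layer sizes, and assuming a working R *keras* installation, which this commit notes is currently problematic):

```r
library(keras)
# A 1D convolution slides a window of 5 word embeddings over the text,
# analogous to the 2D window over pixels described for images.
model = keras_model_sequential() %>%
  layer_embedding(input_dim = 10000, output_dim = 128) %>%
  layer_conv_1d(filters = 64, kernel_size = 5, activation = "relu") %>%
  layer_global_max_pooling_1d() %>%
  layer_dense(units = 1, activation = "sigmoid")
```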

## Validation and Best Practices {#sec-validation}
## Validation and Best Practices {#sec-validation}
### Finding a Balance Between Precision and Recall {#sec-balance}

In the previous sections, we have learned how to fit different models:
@@ -1119,8 +1119,8 @@ One approach is to print a table with three columns: the false
positive rate, the true positive rate, and the threshold value. You
then decide which FPR--TPR combination is most appealing to you, and
use the corresponding threshold value. Alternatively, you can find the threshold value
with the maximum distance between TPR and FPR, an approach also known as Youden's J (Example 8.9).
Plotting the ROC curve can also help in interpreting which
with the maximum distance between TPR and FPR, an approach also known as Youden's J (Example 8.9).
Plotting the ROC curve can also help in interpreting which
TPR/FPR combination is most promising (i.e., closest to the upper left corner).
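
A minimal sketch of the Youden's J selection described here (assuming the `y_test` labels and predicted probabilities `y_pred` produced in Example 8.9 below; the threshold grid is arbitrary):

```r
# Try a grid of thresholds and keep the one maximizing TPR - FPR (Youden's J)
thresholds = seq(0.05, 0.95, by = 0.01)
j = sapply(thresholds, function(t) {
  pred = ifelse(y_pred > t, "user", "non-user")
  tpr = mean(pred[y_test == "user"] == "user")      # true positive rate
  fpr = mean(pred[y_test == "non-user"] == "user")  # false positive rate
  tpr - fpr
})
thresholds[which.max(j)]
```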

::: {.callout-note appearance="simple" icon=false}
Expand All @@ -1141,11 +1141,11 @@ print(confusion_matrix(y_test, y_pred))
```
## R code
```{r cutoffpoint-r}
m = glm(usesinternet ~ age + gender + education,
m = glm(usesinternet ~ age + gender + education,
data=traindata, family="binomial")
y_pred = predict(m, newdata = testdata,
type = "response")
pred_default = as.factor(ifelse(y_pred>0.5,
pred_default = as.factor(ifelse(y_pred>0.5,
"user", "non-user"))
print("Confusion matrix, default threshold (0.5)")
@@ -1323,7 +1323,7 @@ print(f"M={acc.mean():.2f}, SD={acc.std():.3f}")
myclassifier = train(x = X_train, y = y_train,
method = "glm", family="binomial",
metric="Accuracy", trControl = trainControl(
method = "cv", number = 5,
method = "cv", number = 5,
returnResamp ="all", savePredictions=TRUE),)
print(myclassifier$resample)
print(myclassifier$results)
@@ -1413,20 +1413,20 @@ print(classification_report(y_test, search.predict(X_test_scaled)))
::: {.callout-note appearance="simple" icon=false}

::: {#exm-gridsearch3}
A gridsearch in R.
A gridsearch in R.
## R code
```{r gridsearch3-r}
#| cache: true
# Create the grid of parameters
grid = expand.grid(Loss=c("L1","L2"),
cost=c(100,1000))
# Train the model using our previously defined
# Train the model using our previously defined
# parameters
gridsearch = train(x = X_train, y = y_train,
preProcess = c("center", "scale"),
method = "svmLinear3",
trControl = trainControl(method = "cv",
preProcess = c("center", "scale"),
method = "svmLinear3",
trControl = trainControl(method = "cv",
number = 5),
tuneGrid = grid)
gridsearch
@@ -1484,4 +1484,3 @@ While in the end, you can find a supervised machine learning
sampling.

[^6]: ([jakevdp.github.io/PythonDataScienceHandbook/05.07-support-vector-machines.html](https://jakevdp.github.io/PythonDataScienceHandbook/05.07-support-vector-machines.html))
