% RDecal-ML-CV.tex
% Richard Boeri Decal's CV.
% (c) 2011 Richard Decal <richard.decal at ncf edu> http://www.richarddecal.com
% template inspirations from
% (c) 2002 Matthew Boedicker <mboedick@mboedick.org> (original author) http://mboedick.org
% (c) 2003-2007 David J. Grant <davidgrant-at-gmail.com> http://www.davidgrant.ca
% (c) 2008 Nathaniel Johnston <nathaniel@nathanieljohnston.com> http://www.nathanieljohnston.com
% credit to Todd C. Miller <Todd.Miller@courtesan.com> http://www.courtesan.com/todd for the grey boxes
% requirements on Ubuntu: texlive-full texlive-latex-extra texlive-fonts-extra
%This work is licensed under the Creative Commons Attribution-Noncommercial-Share Alike 2.5 License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/2.5/ or send a letter to Creative Commons, 543 Howard Street, 5th Floor, San Francisco, California, 94105, USA.
\documentclass[a4paper,12pt]{article}
\renewcommand{\familydefault}{\sfdefault}
% for math ops on widths e.x. \textwidth-1cm
\usepackage{calc}
%%%%%%%%%%%%
%style stuff
%%%%%%%%%%%%
%\newlength{\outerbordwidth}
\raggedbottom % no extra space at the bottom of the page
%\raggedright
\usepackage{framed}
\usepackage{tocloft}
\usepackage{multicol} % for the multiple column'd list
%-----------------------------------------------------------
%make list unindented later
\usepackage{enumitem}
%compress the lines
\usepackage{setspace}
%\setstretch{0.9}
%\halfspacing
% Page style: fancyhdr provides the footer, geometry the 1in page margins.
\usepackage{fancyhdr}
\pagestyle{fancy}
\usepackage[margin=1in]{geometry}
\cfoot{\bfseries Decal CV \thepage} % centre footer: bold "Decal CV" followed by the page number
\rfoot{} % right footer left empty
% Zero-width rules, i.e. no visible line under the header or above the footer.
\renewcommand{\headrulewidth}{0pt}
\renewcommand{\footrulewidth}{0pt}
%Margin setup
% NOTE(review): geometry is already loaded with margin=1in earlier in the
% preamble; the lengths below are set by hand afterwards. Confirm that
% overriding geometry's computed values is intentional.
\setlength{\evensidemargin}{0in}
% Header area collapsed to zero height and zero separation.
% NOTE(review): \pagestyle{fancy} is active; fancyhdr may warn that
% \headheight is too small -- check the log.
\setlength{\headheight}{0in}
\setlength{\headsep}{0in}
\setlength{\oddsidemargin}{0in}
%\setlength{\paperheight}{11in}
%\setlength{\paperwidth}{8.5in}
% No horizontal padding between tabular columns.
\setlength{\tabcolsep}{0in}
%\setlength{\textheight}{9.3in}
%\setlength{\textwidth}{7in}
\setlength{\topmargin}{0in}
\setlength{\topskip}{0in}
%\setlength{\voffset}{0.1in}
% NOTE(review): list environments normally reset \itemsep when they start, so
% this top-level assignment likely has no effect; enumitem (loaded above)
% offers \setlist{itemsep=0pt} if tighter lists are wanted -- confirm intent.
\setlength{\itemsep}{0pt}
\setlength{\marginparsep}{0pt}
% remove paragraph indents
\setlength{\parindent}{0pt}
%-----------------------------------------------------------
%Custom commands
% Grey header bars
% credit to Todd C. Miller <Todd.Miller@courtesan.com> http://www.courtesan.com/todd
\usepackage[svgnames]{xcolor}
\definecolor{mygrey}{gray}{0.94}
% \resheading{<title>}
%   Section heading bar: <title> in bold \large type inside a mygrey
%   \colorbox that wraps a \textwidth-wide minipage. The bar is shifted 9pt
%   to the left and followed by 6pt of vertical space.
%   The \vphantom{p\^{E}} presumably forces a uniform bar height regardless of
%   which letters the title contains.
%   Line ends WITHOUT a trailing % are deliberate: they supply the same
%   inter-word spaces as the original one-line definition.
\newcommand{\resheading}[1]{%
  {%
    \hspace{-9pt}
    \colorbox{mygrey}{%
      \begin{minipage}{\textwidth}%
        {\textmd{~~\large \textbf{#1}
          \vphantom{p\^{E}}}}%
      \end{minipage}%
    }%
    \vspace{6pt}%
  }
}
%\newcommand{\resheading}[1]{{\vspace*{.06in} \colorbox{mygrey}{\begin{minipage}{\textwidth}{\textmd{\large \textbf{#1} \vphantom{p\^{E}}}}\end{minipage}}} }
%\newcommand{\resheading}[{1}]{ \vspace*{.06in} \colorbox{mygrey} {\begin{minipage} { \textwidth}{\textmd{ {\large #1} } \vphantom{p\^{E}} } \end{minipage} } }
% \ressubheading{<title>}{<right of title>}{<subtitle>}{<right of subtitle>}
%   Two-row entry header set in a \textwidth-wide minipage:
%     row 1: #1 in bold,   #2 pushed to the right margin by \hfill
%     row 2: #3 in italic, #4 pushed to the right margin by \hfill
%   Both rows end in \\ (including the last one), exactly as before; call
%   sites compensate for the resulting extra space with \vspace{-6pt}.
\newcommand{\ressubheading}[4]{%
  {%
    \begin{minipage}{\textwidth}
      \textbf{#1} \hfill #2 \\
      \textit{#3} \hfill #4 \\
    \end{minipage}%
  }%
}
%\renewcommand*\descriptionlabel[1]{\hspace\labelsep\emph{#1 -}}
%-----------------------------------------------------------
% Hyperlinks: coloured link text (blue for both internal links and URLs)
% instead of boxes, with links allowed to break across lines.
\usepackage{hyperref}
\hypersetup{
colorlinks=true,
linkcolor=blue,
urlcolor=blue,
breaklinks=true
}
% NOTE(review): breakurl is aimed at the latex+dvips workflow; when compiling
% with pdfLaTeX it presumably has no effect -- confirm whether it is needed.
\usepackage[hyphenbreaks]{breakurl}
% symbols
\usepackage{fontawesome}
% custom dates
\usepackage{datetime2}
% provides tabularx
\usepackage{ltablex}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\begin{center}
{\Huge Richard Boeri Decal}
\\
%Artificial Intelligence Engineer \\
{\small \faMapMarker~ USA $\cdot$ \faPhone~ [redacted] $\cdot$ \faEnvelope~ \href{mailto:public@richarddecal.com}{public@richarddecal.com} \\ \faGithubAlt~ \href{https://github.com/crypdick}{crypdick} $\cdot$~\faStackOverflow~ \href{https://stackoverflow.com/users/4212158/crypdick}{crypdick} $\cdot$ ~\faLinkedin~ \href{https://www.linkedin.com/in/richarddecal/}{richarddecal} $\cdot$ ~\faHome~ \href{https://www.richarddecal.com}{richarddecal.com}}
\end{center}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\resheading{Profile}
I'm a self-starting Machine Learning Scientist/Engineer, proficient in full-lifecycle and
full-stack projects, with 10 years of academic and industry experience.
Having founded two ML departments at start-ups, I have a proven record of doing what
it takes to ship useful products: iterating quickly, creating data curation systems, developing
novel and SOTA solutions, architecting ML infrastructure, and scaling to the cloud.
I have a proven record of quickly learning new domains (molecular biologist $\Rightarrow$ computational neuroscientist $\Rightarrow$ ML engineer) and am able to communicate complex technical concepts to all levels.
\vspace{0.5em} % half space
Style: 1) orient to customer needs, 2) design from first principles, 3) build in vertical slices.\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \resheading{Expertise}
%
% \vspace{-1em}
% \begin{tabularx}{\textwidth}{p{2.5cm}>{\arraybackslash}X}
% \bfseries{Specialties} & Deep Learning for Computer Vision $\cdot$ Imbalanced datasets $\cdot$ Dimensionality reduction $\cdot$ Effective visualization \\
% \bfseries{Languages} & Python $\cdot$ R Tidyverse $\cdot$ Bash $\cdot$ RegEx $\cdot$ Espa\~nol $\cdot$ English $\cdot$ Italiano \\
% \bfseries{Tools} & PyTorch, Keras, Tensorflow $\cdot$ (Geo)Pandas, PostgreSQL $\cdot$ Ray/Anyscale, PySpark, Dask $\cdot$ Plotly-Dash, Streamlit, Flask, Gradio $\cdot$ Matplotlib, Seaborn, Altair, Plotly $\cdot$ Pandera, Great Expectations $\cdot$ Docker \\
% \bfseries{MLOps} & AWS ecosystem $\cdot$ MetaFlow, Sagemaker, Kedro, MLFlow $\cdot$ Bitbucket CI/CD
% \end{tabularx}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\resheading{Machine Learning Experience}
\ressubheading{Lead Machine Learning Engineer}{Remote}{\href{https://dendra.io}{Dendra Systems}}{Feb. 2020 --- Now}
\vspace{-6pt} \textit{Dendra uses swarms of seeding drones to restore ecosystems and monitor biodiversity at scale}. Founding ML lead. Full-stack, full life-cycle ML for scalable ecosystem restoration.
\begin{itemize}
\item Championed transformation from a services company to an ML product company using the ``Zone to Win'' framework. My ML solutions have largely automated species identification tasks. This has transformed the business by increasing throughput, increasing profitability, reducing the `cold start' problem with new biomes, and enabling us to pursue entirely new markets.
\item Initiative owner: training large computer vision models for species identification.
\begin{itemize}
\item Bootstrapped end-to-end species ID stack: data processing, hyperparameter tournaments, training, evaluation, serving, monitoring.
\item Translated state-of-the-art self-supervised learning research into production to improve model robustness and reduce required labeled training data (\texttt{PyTorch}).
\item Researched, experimented, and productionized novel ML techniques: models, samplers, optimization functions, etc.
\item Developed few-shot learning models paired with an active learning data harvesting UI to automate new species and novel biomes in hours instead of months (\texttt{Gradio}).
\item Owner of ML roadmap (aligned with product roadmap \& operations dept.). Established priorities, KPIs, and OKRs.
\item Hand-crafted data augmentations to make model robust to irrelevant features.
\end{itemize}
\item Integrated species ID models into customer-facing platform and internal tooling.
\begin{itemize}
\item Conceived novel model-in-the-loop annotation tooling, accelerating insights delivery by over 80x.
\item Devised model performance QC workflows to ensure we satisfy our SLAs.
\end{itemize}
\item Set strategic vision for ``data obsessed'' ML and spearheaded our ``data engine''.
\begin{itemize}
\item Strategized overhaul of our data collection process to enable ML on long-tailed, open-world inference across thousands of target classes. Set business and system requirements, and system design for strategic labeling workflows (\texttt{PlantUML}).
\item Implemented active learning methodologies to systematically harvest ``high-leverage'' data, preventing hallucinations on out-of-distribution data.
\item Implemented novelty-maximizing data pruning to enable Pareto-optimal (exponential) model scaling laws. Reduced training data by 60\% while maintaining performance.
\item Devised unsupervised ``trip-wires'' for detecting model failures in production. Integrated alerts into project tracker for strategic annotation team so that we can proactively fix the issue (\texttt{Jira}).
\item Headed data curation tooling initiative (\textit{C4 diagrams}). Point-person for external vendor assessment and selection.
\item Created ``ML University'' lectures to educate ecologists on ML concepts and labeling best practices for high-quality data. Oversaw ML data collection team, developed rule-sets for data labeling and trained data annotation supervisor.
\item Collaborated with ecologists to create model failure reports and gain intuition for model failures. Created data collection campaigns to patch biases in training data.
\item Devised annotation QA and QC workflows: systematically identifying mislabeled and/or partially labeled samples to create ``self-healing'' training dataset.
\end{itemize}
\item Scaling \& Operational Excellence: Architected AWS-native cloud-scale infrastructure.
\begin{itemize}
\item Wrote distributed, scale-agnostic infrastructure for training, hyperparameter tuning, and inference (\texttt{Ray/Anyscale, AWS Batch}).
\item Implemented Bayesian hyperparameter tournaments which aggressively kill underperforming trials, reducing training costs by 20x (\texttt{Ray Tune, HyperOpt, ASHA}).
\item Identified bottlenecks and optimized throughput for multi-GPU jobs (\texttt{Grafana}).
\end{itemize}
\item MLOps: Championed efforts to implement best practices for ML systems.
\begin{itemize}
\item Responsible for debugging model failures with paranoid programming, detailed chronicling, model interpretability algorithms (e.g.\ GradCAM, Transformer patch activation maps), and heavy visualization of training dynamics.
\item Responsible for full life-cycle of dataset and model artifacts, quality assurance:
tracking artifact lineage, parameters for reproducibility (\texttt{MetaFlow}).
\item Devised sanity-checks to detect ``silent failures'' during model training.
\item Devised different stratifications for validating models, as well as validating specific data slices.
\item Enabled observability across pipelines (\texttt{Cloudwatch, Slackbots, UMAP, Sentry}). Reviewed metrics weekly to prevent customer-impacting incidents. Periodically reported the unit-economics of our labeling rates (\texttt{Jupyter}).
\item Operation Vacation: Led initiative to automate all workflows, including model training (\texttt{custom orchestrator}). Later, reimplemented as serverless to improve reliability and cost (\texttt{Step Functions, API Gateway, $\lambda$, EventBridge}).
\item Enforced code quality and correctness using pre-commit hooks, CI (\texttt{Bitbucket Pipelines}), ML sanity checks, property-based testing (\texttt{Hypothesis}), run-time validation (\texttt{Pandera}), design-by-contract (\texttt{beartype}).
\end{itemize}
\end{itemize}
\ressubheading{Lead Data Scientist}{Remote}{\href{https://pacemate.com/}{PaceMate}}{Jan. 2019 --- Dec. 2019}
\vspace{-6pt} \textit{PaceMate monitors transmissions with Bluetooth-enabled heart implants, identifying life-threatening arrhythmias and alerting emergency services}. Founded ML division. Built end-to-end data processing and model training pipelines.
\begin{itemize}
\item Automated remote detection of cardiac arrhythmias in Internet-enabled heart implants using deep learning.
\begin{itemize}
\item Developed processing pipelines for ECG data (\texttt{imbalanced-learn}, custom tools).
\item Worked with cardiologists and software engineers to formulate business requirements (\texttt{YouTrack}).
\item Implemented state-of-the-art deep neural network for automated cardiac arrhythmia classification specifically tuned for the device implanted in a majority of our patients (\texttt{Keras}).
\item Created data labeling dashboard for electrophysiologists to review model predictions (\texttt{Plotly Dash}).
\end{itemize}
\item Created dashboard to collate, explore, and summarize key insights from our electronic medical records.
\begin{itemize}
\item Researched ML-assisted techniques for information extraction from extremely heterogeneous documents.
\item Wrote and scaled performant ETL pipelines (\texttt{SQL, PySpark, spaCy}).
\item Created dashboard to enable easy faceting and querying of EMR records to facilitate data-driven decision-making (\texttt{Plotly Dash}).
\item Created a report on our data inventory and trends in our data.
\end{itemize}
\item Upheld SOC2 security standards with measures such as encryption at rest, traffic tunnelling, and instance hardening.
\item Presented to the C-suite and met with potential investors.
\end{itemize}
\ressubheading{Data Scientist}{Sarasota, FL}{New College of FL, \href{https://farinstitute.org}{F.A.R. Institute}}{Aug. 2018 --- Dec. 2018}
\vspace{-6pt} \textit{The Florence A. Rothman Institute supports innovation in medical data analysis}. Semester-long master's capstone project supervised by Dr.~Pat McDonald. Unpaid.
\begin{itemize}
\item Data-driven prediction of 30-day readmission using visit clustering.
\begin{itemize}
\item \texttt{visit2vec}: reduce high-dimensional patient visit data into low-dimensional embeddings using a technique inspired by \texttt{word2vec} (\texttt{TensorFlow}).
\item Explored structure in patient visits data by clustering patient visits using t-SNE\@.
\end{itemize}
\item Modeled patient trajectories on years of heart failure patients from Sarasota Memorial Hospital.
\begin{itemize}
\item Clustered patients over time based on cardiac and non-cardiac chronic conditions (\texttt{SQL, Pandas, PySpark}).
\item Created network graphs characterizing interactions between multiple chronic conditions and heart failure and their effect on mortality (\texttt{NetworkX}).
\item Used finite state modeling to quantify interaction between chronic conditions and mortality (\texttt{PySpark, Numpy}).
\end{itemize}
\end{itemize}
\ressubheading{Research Intern}{Seattle, WA}{Peng Lab, \href{https://alleninstitute.org}{Allen Institute for Brain Science}}{June 2018 --- Aug. 2018}
\vspace{-6pt} \textit{The neuromorphology lab investigates the architecture of the brain at the population and single-cell level}. Proposed a method that would automate the biggest bottleneck to high-throughput neural cell morphological analysis.
\begin{itemize}
\item Deep reinforcement learning for tracing neural structures in petabytes of noisy fluorescent microscope data.
\begin{itemize}
\item Implemented proof-of-concept Deep Q Network using 3D convolutions to trace neural cell structures (\texttt{TensorFlow, rl-medical}).
\item Created simulation environment and reward system for training agents (\texttt{Matplotlib, OpenAI Gym}) based on \href{https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-019-11443-y/MediaObjects/41467_2019_11443_MOESM4_ESM.mp4}{manually traced microscope images}.
\end{itemize}
\end{itemize}
% \ressubheading{Classroom Mentor}{Remote}{\href{udacity.com}{Udacity}}{Dec. 2017 --- May 2018}
% Provided 1-on-1 tutoring and code reviews for the \textit{Intro to Programming Nanodegree: Python for Data Analysis Track}.\\
\ressubheading{Research Assistant}{Seattle, WA}{\href{https://compneuro.washington.edu/}{Fairhall Lab, University of Washington}}{Oct. 2014 --- Jan. 2016}
\vspace{-6pt} \textit{Computational neuroscience lab investigating the biophysics of neural cells}. I developed agent-based dynamical models of mosquito thermal plume navigation behavior.
\begin{itemize}
\item Computed and visualized flight kinematic statistics and thermal sensing statistics using wind-tunnel flight data (\texttt{Numpy, Seaborn, scipy[interpolate, spatial, stats], sklearn, statsmodels}).
\item Formulated biophysical models of mosquito thermonavigation:
applied numerical optimization algorithms to fit model to experimental data (\texttt{scipy[optimize], Pandas}).
\item Created animations of thermal plume navigation models (\texttt{Matplotlib 3D, MayaVi}).
\end{itemize}
\ressubheading{Molecular Biologist}{Various}{Various}{Pre 2014}
\vspace{-6pt} Before transitioning to data science, I was a molecular biologist.\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\resheading{Education}
\ressubheading{M.S. Data Science}{Sarasota, FL}{New College of Florida}{Aug. 2017 --- Dec. 2018}
% \ressubheading{Artificial Intelligence Engineer Nanodegree}{}{Udacity}{Feb. 2017 --- May 2017}
\ressubheading{B.A., Chemistry/Biology (with honors)}{Sarasota, FL}{New College of Florida}{Aug. 2007 --- May 2011}
\ressubheading{Early admission (admitted 16 yrs old)}{Jupiter, FL}{Harriet L. Wilkes Honors College}{Jul. 2006 --- May 2007}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\resheading{Publications, Presentations, \& Teaching}
* 2024 Talk at \textit{AI In Production Conference} on \href{https://www.aiinproduction.com/speakers/richard_decal}{data curation}.
* 2021 Invited talk \textit{Ray Summit}: \href{https://www.anyscale.com/events/2021/06/22/how-ray-and-anyscale-make-it-easy-to-do-massive-scale-ml-on-aerial-imagery}{\textit{How Ray and Anyscale Make it Easy to do Massive-scale ML on Aerial Imagery}}. Accompanying blog post \href{https://www.anyscale.com/blog/how-ray-and-anyscale-make-it-easy-to-do-massive-scale-machine-learning-on}{here}.
* 2019 Seminar at \textit{New College of FL}: \textit{Remote Sensing of Cardiac Arrhythmia at Scale using Deep Learning}.
* 2019 Seminar at \textit{Escuela Secundaria T\'ecnica de Torquinst}: \textit{Inteligencia Artificial}.
* 2018 Classroom mentor for \textit{Udacity's Intro to Programming Nanodegree: Python for Data Analysis Track} (1-on-1 tutoring, code reviews).
* 2015 UW Outreach: various educational events for students from low socioeconomic backgrounds.
* 2012 \href{https://scholar.google.co.nz/citations?user=4ODJ78oAAAAJ&hl=en}{Two peer-reviewed journal articles} in \textit{Genetics} and \textit{PNAS}, also presented as posters at three national conferences.
* 2007 Undergraduate honors thesis on RNA interference mechanisms in \textit{C. elegans}.\\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\resheading{Selected Awards \& Grants}
\ressubheading{NCF Data Scholar}{2017 --- 2018}{Full tuition waiver for master's program.}{}
\ressubheading{National Institutes of Health PA-12-149 Federal grant}{2014 --- 2016}{Self-funded grant covering my salary and expenses at the UW Dept of Biophysics.}{}
\ressubheading{Florida ``Bright Futures'' Scholar}{2007 --- 2011}{Merit-based scholarship. Full tuition.}{}
\ressubheading{Dubois-Felsmann Research Grant}{2010 --- 2011}{Covered reagent costs for my thesis experiments \& conferences.}{}
\end{document}