From 8a12478fc2d5df3938238c013dc51195ede40312 Mon Sep 17 00:00:00 2001 From: Pavel Dimens Date: Thu, 20 May 2021 13:55:53 -0400 Subject: [PATCH 1/6] tweaks --- rules/LepMap3.smk | 1 + rules/trim.smk | 20 +++++++++++++++----- scripts/LepWrapTrim.r | 3 ++- scripts/TrimSummaryPlot.r | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 6 deletions(-) create mode 100755 scripts/TrimSummaryPlot.r diff --git a/rules/LepMap3.smk b/rules/LepMap3.smk index cdf391f..ad62f8a 100644 --- a/rules/LepMap3.smk +++ b/rules/LepMap3.smk @@ -19,6 +19,7 @@ lod_lim = config["lod_limit"] lod_diff = config["lod_difference"] # ordering # lg_range = list(range(1, config["exp_lg"]+1)) +lg_count = config["exp_lg"] threads_per = config["threads_per"] dist_method = config["dist_method"] ITER = config["iterations"] diff --git a/rules/trim.smk b/rules/trim.smk index a40f64f..8a1c2bf 100644 --- a/rules/trim.smk +++ b/rules/trim.smk @@ -3,7 +3,7 @@ rule trim_edge_clusters: output: "5_Trim/ordered.{lg_range}.trimmed" log: "5_Trim/logs/ordered.{lg_range}.removed", - "5_Trim/logs/ordered.{lg_range}.trim.pdf" + "5_Trim/plots/ordered.{lg_range}.trim.pdf" params: trim_threshold = trim_thresh, edge_length = edge_len @@ -14,11 +14,18 @@ rule trim_edge_clusters: """ rule trim_summary: - input: expand("5_Trim/ordered.{lg}.trimmed", lg = lg_range) + input: + lg = expand("5_Trim/ordered.{lg}.trimmed", lg = lg_range), + plots = expand("5_Trim/plots/ordered.{lg}.trim.pdf", lg = lg_range) output: detailed = "5_Trim/trim.details", - summary = "5_Trim/trim.summary" - message: "Summarizing trim logs to {output}" + summary = "5_Trim/trim.summary", + summarypdf = "5_Trim/trim.summary.pdf", + summarysvg = "5_Trim/trim.summary.svg", + mergeplots = "5_Trim/plots/all.trimplots.pdf" + message: "Summarizing trimming results" + params: + lg = lg_count priority: 1 shell: """ @@ -31,5 +38,8 @@ rule trim_summary: sort -V {output.detailed}.tmp > {output.detailed} && rm {output.detailed}.tmp echo "n_removed map" > {output.summary}.tmp cut -d" " -f1 {output.detailed} | uniq -c >> {output.summary}.tmp - column -t {output.summary}.tmp > {output.summary} && rm {output.summary}.tmp + column -t {output.summary}.tmp > {output.summary} && rm {output.summary}.tmp + echo "Merging QC plots for all linkage groups" + convert -density 300 {input.plots} {output.mergeplots} + scripts/TrimSummaryPlot.r {output.summary} {params.lg} """ diff --git a/scripts/LepWrapTrim.r b/scripts/LepWrapTrim.r index d6d921c..8bddfe4 100755 --- a/scripts/LepWrapTrim.r +++ b/scripts/LepWrapTrim.r @@ -27,7 +27,8 @@ lg <- unlist(strsplit(filename, "\\."))[2] #========= output instantiation ========# outfile_base <- paste("5_Trim", filename, sep = "/") outfile_log_base <- paste("5_Trim", "logs", filename, sep = "/") -plotfile <- paste(outfile_log_base, "trim.pdf", sep = ".") +plotfile_base <- paste("5_Trim", "plots", filename, sep = "/") +plotfile <- paste(plotfile_base, "trim.pdf", sep = ".") ##### Pruning the ends ##### dist_thresh <- as.numeric(args[2]) diff --git a/scripts/TrimSummaryPlot.r b/scripts/TrimSummaryPlot.r new file mode 100755 index 0000000..8d27d64 --- /dev/null +++ b/scripts/TrimSummaryPlot.r @@ -0,0 +1,37 @@ +#! /usr/bin/env Rscript + +suppressMessages(library(tidyverse, quietly = TRUE)) + +args <- commandArgs(trailingOnly = TRUE) +# args[1] is the summary file +# args[2] is the expected number of linkage groups (to fill in zeroes) + +# read in the summary table and pop out the LG number +tbl <- read_table(args[1]) %>% + rowwise() %>% + mutate(map = as.numeric(strsplit(map, "\\.")[[1]][2])) %>% + suppressMessages() + +# find the max number of linkage groups +lg_num <- 1:args[2] +missing_lg <- lg_num[!(lg_num %in% tbl$map)] +missings <- data.frame(n_removed = 0, map = missing_lg) +tbl <- merge(tbl, missings, on = map, all = TRUE) + +tbl %>% + ggplot(aes(map, n_removed)) + + geom_segment( + aes(map, n_removed, xend = map, yend=0), + size = 5, + color = "#78bcff" + ) + + geom_point(shape = 21, fill = "#4aa6ff", color = "transparent", size = 5) + + scale_x_continuous(labels = as.character(tbl$map), breaks = tbl$map) + + labs( + title = "Number of markers removed via trimming", + x = "Linkage Group", + y = "Number removed" + ) + +ggsave(paste0(args[1], ".pdf"), height = 4, width = 7, units = "in") +ggsave(paste0(args[1], ".svg"), height = 4, width = 7, units = "in") \ No newline at end of file From d5b8990f4aa5589e5e66e00f2348c6c2ace2b69e Mon Sep 17 00:00:00 2001 From: Pavel Dimens Date: Thu, 20 May 2021 14:01:48 -0400 Subject: [PATCH 2/6] tweaks --- conda_setup.yml | 354 +++++++++++++++++++----------------------------- config.yaml | 12 +- 2 files changed, 148 insertions(+), 218 deletions(-) diff --git a/conda_setup.yml b/conda_setup.yml index b01e49f..5e596f7 100644 --- a/conda_setup.yml +++ b/conda_setup.yml @@ -12,388 +12,319 @@ dependencies: - aiohttp=3.7.4=py39h3811e60_0 - amply=0.1.4=py_0 - appdirs=1.4.4=pyh9f0ad1d_0 - - argon2-cffi=20.1.0=py39h3811e60_2 - async-timeout=3.0.1=py_1000 - - async_generator=1.10=py_0 - atk-1.0=2.36.0=h3371d22_4 - attmap=0.13.0=pyhd8ed1ab_0 - - attrs=20.3.0=pyhd3deb0d_0 - - backcall=0.2.0=pyh9f0ad1d_0 + - attrs=21.2.0=pyhd8ed1ab_0 + - azure-common=1.1.27=pyhd8ed1ab_0 + - azure-core=1.14.0=pyhd8ed1ab_0 + - azure-storage-blob=12.8.1=pyhd8ed1ab_0 - backports=1.0=py_2 - - backports.functools_lru_cache=1.6.1=py_0 + - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 - bcrypt=3.2.0=py39h3811e60_1 - binutils_impl_linux-64=2.35.1=h193b22a_2 - binutils_linux-64=2.35=h67ddf6f_30 - - bleach=3.3.0=pyh44b312d_0 + - blinker=1.4=py_1 - boto=2.49.0=py_0 - - boto3=1.17.33=pyhd8ed1ab_0 - - botocore=1.20.33=pyhd8ed1ab_0 + - boto3=1.17.76=pyhd8ed1ab_0 + - botocore=1.20.76=pyhd8ed1ab_0 - brotlipy=0.7.0=py39h3811e60_1001 - bwidget=1.9.14=ha770c72_0 - - bz2file=0.98=py_0 - bzip2=1.0.8=h7f98852_4 - c-ares=1.17.1=h7f98852_1 - ca-certificates=2020.12.5=ha878542_0 - - cachetools=4.2.1=pyhd8ed1ab_0 - - cairo=1.16.0=h7979940_1007 + - cachetools=4.2.2=pyhd8ed1ab_0 + - cairo=1.16.0=h6cf1ce9_1008 - certifi=2020.12.5=py39hf3d152e_1 - cffi=1.14.5=py39he32792d_0 - chardet=4.0.0=py39hf3d152e_1 - coincbc=2.10.5=hcee13e7_1 - - configargparse=1.4=pyhd8ed1ab_0 - - cryptography=3.4.6=py39hbca0aa6_0 - - curl=7.75.0=h979ede3_0 - - datrie=0.8.2=py39h07f9747_1 - - decorator=4.4.2=py_0 - - defusedxml=0.7.1=pyhd8ed1ab_0 - - docutils=0.16=py39hf3d152e_3 - - dropbox=10.10.0=py39h06a4308_0 - - entrypoints=0.3=pyhd8ed1ab_1003 - - expat=2.2.10=h9c3ff4c_0 + - configargparse=1.4.1=pyhd8ed1ab_0 + - connection_pool=0.0.3=pyhd3deb0d_0 + - cryptography=3.4.7=py39hbca0aa6_0 + - curl=7.76.1=hea6ffbf_2 + - datrie=0.8.2=py39h3811e60_2 + - decorator=5.0.9=pyhd8ed1ab_0 + - docutils=0.17.1=py39hf3d152e_0 + - dropbox=11.9.0=pyhd8ed1ab_0 + - expat=2.3.0=h9c3ff4c_0 - fftw=3.3.9=nompi_h74d3f13_101 - filechunkio=1.8=py_2 - filelock=3.0.12=pyh9f0ad1d_0 - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - - font-ttf-inconsolata=2.001=hab24e00_0 - - font-ttf-source-code-pro=2.030=hab24e00_0 + - font-ttf-inconsolata=3.000=h77eed37_0 + - font-ttf-source-code-pro=2.038=h77eed37_0 - font-ttf-ubuntu=0.83=hab24e00_0 - - fontconfig=2.13.1=hba837de_1004 + - fontconfig=2.13.1=hba837de_1005 - fonts-conda-ecosystem=1=0 - fonts-conda-forge=1=0 - freetype=2.10.4=h0708190_1 - fribidi=1.0.10=h36c2ea0_0 - ftputil=5.0.1=pyhd8ed1ab_0 - - gcc_impl_linux-64=9.3.0=h70c0ae5_18 + - gcc_impl_linux-64=9.3.0=h70c0ae5_19 - gcc_linux-64=9.3.0=hf25ea35_30 - - gdk-pixbuf=2.42.2=h0c95a7a_2 + - gdk-pixbuf=2.42.6=h04a7f16_0 - gettext=0.19.8.1=h0b5b191_1005 - - gfortran_impl_linux-64=9.3.0=hc4a2995_18 + - gfortran_impl_linux-64=9.3.0=hc4a2995_19 - gfortran_linux-64=9.3.0=hdc58fab_30 - ghostscript=9.18=1 - giflib=5.2.1=h36c2ea0_2 - - gitdb=4.0.5=pyhd8ed1ab_1 - - gitpython=3.1.14=pyhd8ed1ab_0 - - google-api-core=1.25.1=pyhd8ed1ab_0 - - google-api-python-client=2.0.2=pyhd8ed1ab_0 - - google-auth=1.24.0=pyhd3deb0d_0 - - google-auth-httplib2=0.0.4=pyh9f0ad1d_0 + - gitdb=4.0.7=pyhd8ed1ab_0 + - gitpython=3.1.17=pyhd8ed1ab_0 + - google-api-core=1.26.3=pyhd8ed1ab_0 + - google-api-python-client=2.5.0=pyhd8ed1ab_0 + - google-auth=1.30.0=pyh44b312d_0 + - google-auth-httplib2=0.1.0=pyhd8ed1ab_0 - google-cloud-core=1.5.0=pyhd3deb0d_0 - google-cloud-storage=1.19.0=py_0 - google-crc32c=1.1.2=py39hb81f231_0 - google-resumable-media=1.2.0=pyhd3deb0d_0 - - googleapis-common-protos=1.52.0=py39hf3d152e_1 + - googleapis-common-protos=1.53.0=py39hf3d152e_0 - graphite2=1.3.13=h58526e2_1001 - - graphviz=2.47.0=h93c640b_0 - - grpcio=1.36.1=py39hff7568b_0 + - graphviz=2.47.1=h85b4f2f_1 + - grpcio=1.37.1=py39hff7568b_0 - gsl=2.6=he838d99_2 - - gtk2=2.24.33=hab0c2f8_0 + - gtk2=2.24.33=h539f30e_1 - gts=0.7.6=h64030ff_2 - - gxx_impl_linux-64=9.3.0=hd87eabc_18 + - gxx_impl_linux-64=9.3.0=hd87eabc_19 - gxx_linux-64=9.3.0=h3fbe746_30 - - harfbuzz=2.8.0=h83ec7ef_0 - - httplib2=0.19.0=pyhd8ed1ab_0 + - harfbuzz=2.8.1=h83ec7ef_0 + - httplib2=0.19.1=pyhd8ed1ab_0 - icu=68.1=h58526e2_0 - idna=2.10=pyh9f0ad1d_0 - - imagemagick=7.0.11_4=pl5320hb503478_0 - - importlib-metadata=3.7.3=py39hf3d152e_0 + - imagemagick=7.0.11_13=pl5320hb118871_0 + - importlib-metadata=4.0.1=py39hf3d152e_0 - iniconfig=1.1.1=pyh9f0ad1d_0 - - ipykernel=5.5.0=py39hef51801_1 - - ipython=7.21.0=py39hef51801_0 - ipython_genutils=0.2.0=py_1 - - jbig=2.1=h7f98852_2002 - - jedi=0.18.0=py39hf3d152e_2 - - jinja2=2.11.3=pyh44b312d_0 + - isodate=0.6.0=py_1 + - jbig=2.1=h7f98852_2003 + - jinja2=3.0.1=pyhd8ed1ab_0 - jmespath=0.10.0=pyh9f0ad1d_0 - jpeg=9d=h36c2ea0_0 - jsonschema=3.2.0=pyhd8ed1ab_3 - - jupyter_client=6.1.12=pyhd8ed1ab_0 - jupyter_core=4.7.1=py39hf3d152e_0 - - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 - kernel-headers_linux-64=2.6.32=h77966d4_13 - - krb5=1.17.2=h926e7f8_0 + - krb5=1.19.1=hcc1bbae_0 - ld_impl_linux-64=2.35.1=hea4e1c9_2 - - libblas=3.9.0=8_openblas - - libcblas=3.9.0=8_openblas + - libblas=3.9.0=9_openblas + - libcblas=3.9.0=9_openblas - libcrc32c=1.1.1=h9c3ff4c_2 - - libcurl=7.75.0=hc4aaa36_0 + - libcurl=7.76.1=h2574ce0_2 - libedit=3.1.20191231=he28a2e2_2 - libev=4.33=h516909a_1 - libffi=3.3=h58526e2_2 - libgcc=7.2.0=h69d50b8_2 - - libgcc-devel_linux-64=9.3.0=h7864c58_18 - - libgcc-ng=9.3.0=h2828fa1_18 - - libgd=2.3.0=h47910db_1 - - libgfortran-ng=9.3.0=hff62375_18 - - libgfortran5=9.3.0=hff62375_18 - - libglib=2.68.0=h3e27bee_1 - - libgomp=9.3.0=h2828fa1_18 + - libgcc-devel_linux-64=9.3.0=h7864c58_19 + - libgcc-ng=9.3.0=h2828fa1_19 + - libgd=2.3.2=h78a0170_0 + - libgfortran-ng=9.3.0=hff62375_19 + - libgfortran5=9.3.0=hff62375_19 + - libglib=2.68.2=h3e27bee_0 + - libgomp=9.3.0=h2828fa1_19 - libiconv=1.16=h516909a_0 - - liblapack=3.9.0=8_openblas + - liblapack=3.9.0=9_openblas - libnghttp2=1.43.0=h812cca2_0 - - libopenblas=0.3.12=pthreads_h4812303_1 + - libopenblas=0.3.15=pthreads_h8fe5266_1 - libpng=1.6.37=h21135ba_2 - - libprotobuf=3.15.6=h780b84a_0 - - librsvg=2.50.3=hfa39831_1 + - libprotobuf=3.17.0=h780b84a_0 + - librsvg=2.50.5=hc3c00ef_0 - libsodium=1.0.18=h36c2ea0_1 - libssh2=1.9.0=ha56f1ee_6 - - libstdcxx-devel_linux-64=9.3.0=hb016644_18 - - libstdcxx-ng=9.3.0=h6de172a_18 - - libtiff=4.2.0=hdc55705_0 + - libstdcxx-devel_linux-64=9.3.0=hb016644_19 + - libstdcxx-ng=9.3.0=h6de172a_19 + - libtiff=4.3.0=hf544144_0 - libtool=2.4.6=h58526e2_1007 - libuuid=2.32.1=h7f98852_1000 - libwebp=1.2.0=h3452ae3_0 - libwebp-base=1.2.0=h7f98852_2 - libxcb=1.13=h7f98852_1003 - - libxml2=2.9.10=h72842e0_3 + - libxml2=2.9.12=h72842e0_0 - logmuse=0.2.6=pyh8c360ce_0 - lz4-c=1.9.3=h9c3ff4c_0 - make=4.3=hd18ef5c_1 - - markupsafe=1.1.1=py39h3811e60_3 - - mistune=0.8.4=py39h3811e60_1003 - - more-itertools=8.7.0=pyhd8ed1ab_0 + - markupsafe=2.0.1=py39h3811e60_0 + - more-itertools=8.7.0=pyhd8ed1ab_1 + - msrest=0.6.21=pyh44b312d_0 - multidict=5.1.0=py39h3811e60_1 - - nbclient=0.5.3=pyhd8ed1ab_0 - - nbconvert=6.0.7=py39hf3d152e_3 - - nbformat=5.1.2=pyhd8ed1ab_1 + - nbformat=5.1.3=pyhd8ed1ab_0 - ncurses=6.2=h58526e2_4 - - nest-asyncio=1.4.3=pyhd8ed1ab_0 - - networkx=2.5=py_0 - - notebook=6.3.0=py39hf3d152e_0 - - numpy=1.20.1=py39hdbf815f_0 + - networkx=2.3=py_0 + - numpy=1.20.3=py39hdbf815f_0 - oauth2client=4.1.3=py_0 - - openjpeg=2.4.0=hf7af979_0 - - openssl=1.1.1j=h7f98852_0 + - oauthlib=3.0.1=py_0 + - openjpeg=2.4.0=hb52868f_1 + - openssl=1.1.1k=h7f98852_0 - packaging=20.9=pyh44b312d_0 - - pandas=1.2.3=py39hde0f152_0 - - pandoc=2.12=h7f98852_0 - - pandocfilters=1.4.2=py_1 - - pango=1.42.4=h69149e4_5 + - pandas=1.2.4=py39hde0f152_0 + - pandoc=2.13=h7f98852_0 + - pango=1.48.5=hb8ff022_0 - paramiko=2.7.2=pyh9f0ad1d_0 - - parso=0.8.1=pyhd8ed1ab_0 - pcre=8.44=he1b5a44_0 - pcre2=10.36=h032f7d1_1 - - peppy=0.31.0=pyh9f0ad1d_0 + - peppy=0.31.1=pyhd8ed1ab_0 - perl=5.32.0=h36c2ea0_0 - - pexpect=4.8.0=pyh9f0ad1d_2 - - pickleshare=0.7.5=py_1003 - - pip=21.0.1=pyhd8ed1ab_0 + - pip=21.1.1=pyhd8ed1ab_0 - pixman=0.40.0=h36c2ea0_0 - pkg-config=0.29.2=h36c2ea0_1008 - pluggy=0.13.1=py39hf3d152e_4 + - ply=3.11=py_1 - prettytable=2.1.0=pyhd8ed1ab_0 - - prometheus_client=0.9.0=pyhd3deb0d_0 - - prompt-toolkit=3.0.17=pyha770c72_0 - - protobuf=3.15.6=py39he80948d_0 + - protobuf=3.17.0=py39he80948d_0 - psutil=5.8.0=py39h3811e60_1 - pthread-stubs=0.4=h36c2ea0_1001 - - ptyprocess=0.7.0=pyhd3deb0d_0 - pulp=2.4=py39hf3d152e_0 - py=1.10.0=pyhd3deb0d_0 - pyasn1=0.4.8=py_0 - pyasn1-modules=0.2.7=py_0 - pycparser=2.20=pyh9f0ad1d_2 - - pygments=2.8.1=pyhd8ed1ab_0 + - pygments=2.9.0=pyhd8ed1ab_0 - pygraphviz=1.7=py39h78163bd_0 + - pyjwt=2.1.0=pyhd8ed1ab_0 - pynacl=1.4.0=py39h3811e60_2 - pyopenssl=20.0.1=pyhd8ed1ab_0 - pyparsing=2.4.7=pyh9f0ad1d_0 - pyrsistent=0.17.3=py39h3811e60_2 - pysftp=0.2.9=py_1 - pysocks=1.7.1=py39hf3d152e_3 - - pytest=6.2.2=py39hf3d152e_0 - - python=3.9.2=hffdb5ce_0_cpython + - pytest=6.2.4=py39hf3d152e_0 + - python=3.9.4=hffdb5ce_0_cpython - python-dateutil=2.8.1=py_0 - - python-irodsclient=0.8.6=pyhd8ed1ab_0 + - python-irodsclient=0.9.0=pyhd8ed1ab_0 - python_abi=3.9=1_cp39 - pytz=2021.1=pyhd8ed1ab_0 - pyyaml=5.4.1=py39h3811e60_0 - - pyzmq=22.0.3=py39h37b5a0c_1 - r-askpass=1.1=r40hcdcec82_2 - r-assertthat=0.2.1=r40h6115d3f_2 - r-backports=1.2.1=r40hcfec24a_0 - - r-base=4.0.3=h8ff2632_7 + - r-base=4.0.5=h9e01966_1 - r-base64enc=0.1_3=r40hcdcec82_1004 - r-blob=1.2.1=r40h6115d3f_1 - - r-boot=1.3_27=r40hc72bb7e_0 - - r-brio=1.1.1=r40hcfec24a_0 - - r-broom=0.7.5=r40hc72bb7e_0 - - r-bslib=0.2.4=r40hc72bb7e_0 - - r-cachem=1.0.4=r40hcfec24a_0 - - r-callr=3.5.1=r40h142f84f_0 - - r-caret=6.0_86=r40hcdcec82_2 + - r-brio=1.1.2=r40hcfec24a_0 + - r-broom=0.7.6=r40hc72bb7e_0 + - r-callr=3.7.0=r40hc72bb7e_0 - r-cellranger=1.1.0=r40h6115d3f_1003 - - r-class=7.3_18=r40hcfec24a_0 - - r-cli=2.3.1=r40hc72bb7e_0 + - r-cli=2.5.0=r40hc72bb7e_0 - r-clipr=0.7.1=r40h142f84f_0 - - r-cluster=2.1.1=r40h859d828_0 - - r-codetools=0.2_18=r40hc72bb7e_0 - - r-colorspace=2.0_0=r40h9e2df91_0 - - r-commonmark=1.7=r40hcdcec82_1002 - - r-cpp11=0.2.6=r40hc72bb7e_0 + - r-colorspace=2.0_1=r40hcfec24a_0 + - r-cpp11=0.2.7=r40hc72bb7e_0 - r-crayon=1.4.1=r40hc72bb7e_0 - - r-crul=1.1.0=r40h785f33e_0 - - r-curl=4.3=r40hcdcec82_1 + - r-curl=4.3.1=r40hcfec24a_0 - r-data.table=1.14.0=r40hcfec24a_0 - r-dbi=1.1.1=r40hc72bb7e_0 - - r-dbplyr=2.1.0=r40hc72bb7e_0 + - r-dbplyr=2.1.1=r40hc72bb7e_0 - r-desc=1.3.0=r40hc72bb7e_0 - - r-diffobj=0.3.3=r40hcfec24a_0 - - r-digest=0.6.27=r40h1b71b39_0 - - r-dplyr=1.0.5=r40h03ef668_0 - - r-ellipsis=0.3.1=r40hcdcec82_0 - - r-essentials=4.0=r40_2002 + - r-diffobj=0.3.4=r40hcfec24a_0 + - r-digest=0.6.27=r40h03ef668_0 + - r-dplyr=1.0.6=r40h03ef668_1 + - r-dtplyr=1.1.0=r40hc72bb7e_0 + - r-ellipsis=0.3.2=r40hcfec24a_0 - r-evaluate=0.14=r40h6115d3f_2 - r-fansi=0.4.2=r40hcfec24a_0 - r-farver=2.1.0=r40h03ef668_0 - - r-fastmap=1.1.0=r40h03ef668_0 - r-forcats=0.5.1=r40hc72bb7e_0 - - r-foreach=1.5.1=r40h142f84f_0 - - r-foreign=0.8_81=r40hcfec24a_0 - - r-formatr=1.8=r40hc72bb7e_0 - r-fs=1.5.0=r40h0357c0b_0 + - r-gargle=1.1.0=r40hc72bb7e_0 + - r-gdtools=0.2.2=r40h36050f4_1 - r-generics=0.1.0=r40hc72bb7e_0 - r-ggplot2=3.3.3=r40hc72bb7e_0 - - r-gistr=0.9.0=r40h6115d3f_0 - - r-glmnet=4.1_1=r40h859d828_0 - r-glue=1.4.2=r40hcfec24a_0 - - r-gower=0.2.2=r40hcdcec82_0 + - r-googledrive=1.0.1=r40h6115d3f_1 + - r-googlesheets4=0.3.0=r40hc72bb7e_0 - r-gtable=0.3.0=r40h6115d3f_3 - - r-haven=2.3.1=r40hde08347_0 - - r-hexbin=1.28.2=r40h859d828_0 - - r-highr=0.8=r40h6115d3f_2 - - r-hms=1.0.0=r40hc72bb7e_0 + - r-haven=2.4.1=r40h2713e49_0 + - r-highr=0.9=r40hc72bb7e_0 + - r-hms=1.1.0=r40hc72bb7e_0 - r-htmltools=0.5.1.1=r40h03ef668_0 - - r-htmlwidgets=1.5.3=r40hc72bb7e_0 - - r-httpcode=0.3.0=r40_1 - - r-httpuv=1.5.5=r40h03ef668_0 - r-httr=1.4.2=r40h6115d3f_0 - - r-ipred=0.9_11=r40hcfec24a_0 - - r-irdisplay=1.0=r40hd8ed1ab_0 - - r-irkernel=1.1.1=r40h6115d3f_0 + - r-ids=1.0.1=r40h6115d3f_1 - r-isoband=0.2.4=r40h03ef668_0 - - r-iterators=1.0.13=r40h142f84f_0 - - r-jquerylib=0.1.3=r40hc72bb7e_0 - r-jsonlite=1.7.2=r40hcfec24a_0 - - r-kernsmooth=2.23_18=r40h742201e_0 - - r-knitr=1.31=r40hc72bb7e_0 + - r-knitr=1.33=r40hc72bb7e_0 - r-labeling=0.4.2=r40h142f84f_0 - - r-later=1.1.0.1=r40h0357c0b_0 - - r-lattice=0.20_41=r40hcfec24a_3 - - r-lava=1.6.9=r40hc72bb7e_0 - - r-lazyeval=0.2.2=r40hcdcec82_2 + - r-lattice=0.20_44=r40hcfec24a_0 - r-lifecycle=1.0.0=r40hc72bb7e_0 - r-lubridate=1.7.10=r40h03ef668_0 - r-magrittr=2.0.1=r40hcfec24a_1 - - r-maps=3.3.0=r40hcdcec82_1004 - r-markdown=1.1=r40hcfec24a_1 - - r-mass=7.3_53.1=r40hcfec24a_0 - - r-matrix=1.3_2=r40he454529_0 - - r-mgcv=1.8_34=r40he454529_0 + - r-mass=7.3_54=r40hcfec24a_0 + - r-matrix=1.3_3=r40he454529_0 + - r-mgcv=1.8_35=r40he454529_0 - r-mime=0.10=r40hcfec24a_0 - - r-modelmetrics=1.2.2.2=r40h0357c0b_1 - r-modelr=0.1.8=r40h6115d3f_0 - r-munsell=0.5.0=r40h6115d3f_1003 - r-nlme=3.1_152=r40h859d828_0 - - r-nnet=7.3_15=r40hcfec24a_0 - - r-numderiv=2016.8_1.1=r40h6115d3f_3 - - r-openssl=1.4.3=r40he5c4762_0 - - r-pbdzmq=0.3_5=r40h42bf92c_1 - - r-pillar=1.5.1=r40hc72bb7e_0 - - r-pkgbuild=1.2.0=r40hc72bb7e_0 + - r-openssl=1.4.4=r40he36bf35_0 + - r-pillar=1.6.1=r40hc72bb7e_0 - r-pkgconfig=2.0.3=r40h6115d3f_1 - - r-pkgload=1.2.0=r40h03ef668_0 + - r-pkgload=1.2.1=r40h03ef668_0 - r-plyr=1.8.6=r40h0357c0b_1 - r-praise=1.0.0=r40h6115d3f_1004 - r-prettyunits=1.1.1=r40h6115d3f_1 - - r-proc=1.17.0.1=r40h03ef668_0 - - r-processx=3.4.5=r40hcfec24a_0 - - r-prodlim=2019.11.13=r40h0357c0b_1 + - r-processx=3.5.2=r40hcfec24a_0 - r-progress=1.2.2=r40h6115d3f_2 - - r-promises=1.2.0.1=r40h03ef668_0 - - r-pryr=0.1.4=r40h0357c0b_1004 - r-ps=1.6.0=r40hcfec24a_0 - - r-purrr=0.3.4=r40hcdcec82_1 - - r-quantmod=0.4.18=r40hc72bb7e_0 + - r-purrr=0.3.4=r40hcfec24a_1 - r-r6=2.5.0=r40hc72bb7e_0 - - r-randomforest=4.6_14=r40h580db52_1004 - r-rappdirs=0.3.3=r40hcfec24a_0 - - r-rbokeh=0.5.1=r40h6115d3f_0 - r-rcolorbrewer=1.1_2=r40h6115d3f_1003 - r-rcpp=1.0.6=r40h03ef668_0 - r-readr=1.4.0=r40h1b71b39_0 - r-readxl=1.3.1=r40hde08347_4 - - r-recipes=0.1.15=r40hc72bb7e_0 - - r-recommended=4.0=r40_1004 - r-rematch=1.0.1=r40h6115d3f_1003 - r-rematch2=2.1.2=r40h6115d3f_1 - - r-repr=1.1.3=r40h785f33e_0 - - r-reprex=1.0.0=r40hc72bb7e_0 + - r-reprex=2.0.0=r40hc72bb7e_0 - r-reshape2=1.4.4=r40h0357c0b_1 - - r-rlang=0.4.10=r40hcfec24a_0 - - r-rmarkdown=2.7=r40hc72bb7e_0 - - r-rpart=4.1_15=r40hcfec24a_2 + - r-rlang=0.4.11=r40hcfec24a_0 + - r-rmarkdown=2.8=r40hc72bb7e_0 - r-rprojroot=2.0.2=r40hc72bb7e_0 - r-rstudioapi=0.13=r40hc72bb7e_0 - r-rvest=1.0.0=r40hc72bb7e_0 - - r-sass=0.3.1=r40h03ef668_0 - r-scales=1.1.1=r40h6115d3f_0 - r-selectr=0.4_2=r40h6115d3f_1 - - r-shape=1.4.5=r40_0 - - r-shiny=1.6.0=r40hc72bb7e_0 - - r-sourcetools=0.1.7=r40he1b5a44_1002 - - r-spatial=7.3_13=r40hcfec24a_0 - - r-squarem=2021.1=r40hc72bb7e_0 - - r-stringi=1.5.3=r40hcabe038_1 + - r-stringi=1.6.2=r40hcabe038_0 - r-stringr=1.4.0=r40h6115d3f_2 - - r-survival=3.2_10=r40hcfec24a_0 + - r-svglite=2.0.0=r40h03ef668_0 - r-sys=3.4=r40hcdcec82_0 + - r-systemfonts=1.0.2=r40hef9c87a_0 - r-testthat=3.0.2=r40h03ef668_0 - - r-tibble=3.1.0=r40hcfec24a_1 + - r-tibble=3.1.2=r40hcfec24a_0 - r-tidyr=1.1.3=r40h03ef668_0 - - r-tidyselect=1.1.0=r40h6115d3f_0 - - r-tidyverse=1.3.0=r40h6115d3f_2 - - r-timedate=3043.102=r40h6115d3f_1002 - - r-tinytex=0.30=r40hc72bb7e_0 - - r-triebeard=0.3.0=r40he1b5a44_1003 - - r-ttr=0.24.2=r40hcdcec82_0 - - r-urltools=1.7.3=r40h0357c0b_2 + - r-tidyselect=1.1.1=r40hc72bb7e_0 + - r-tidyverse=1.3.1=r40hc72bb7e_0 + - r-tinytex=0.31=r40hc72bb7e_0 - r-utf8=1.2.1=r40hcfec24a_0 - r-uuid=0.1_4=r40hcdcec82_1 - - r-vctrs=0.3.6=r40hcfec24a_0 - - r-viridislite=0.3.0=r40h6115d3f_1003 + - r-vctrs=0.3.8=r40hcfec24a_0 + - r-viridislite=0.4.0=r40hc72bb7e_0 - r-waldo=0.2.5=r40hc72bb7e_0 - - r-withr=2.4.1=r40hc72bb7e_0 - - r-xfun=0.20=r40hcfec24a_0 + - r-withr=2.4.2=r40hc72bb7e_0 + - r-xfun=0.23=r40hcfec24a_0 - r-xml2=1.3.2=r40h0357c0b_1 - - r-xtable=1.8_4=r40h6115d3f_3 - - r-xts=0.12.1=r40hcdcec82_0 - r-yaml=2.2.1=r40hcfec24a_1 - r-zeallot=0.1.0=r40h6115d3f_1002 - - r-zoo=1.8_9=r40hcfec24a_0 - ratelimiter=1.2.0=py_1002 - - readline=8.0=he28a2e2_2 + - readline=8.1=h46c0cb4_0 - requests=2.25.1=pyhd3deb0d_0 + - requests-oauthlib=1.3.0=pyh9f0ad1d_0 - rsa=4.7.2=pyh44b312d_0 - - s3transfer=0.3.6=pyhd8ed1ab_0 + - s3transfer=0.4.2=pyhd8ed1ab_0 - sed=4.8=he412f7d_0 - - send2trash=1.5.0=py_0 - setuptools=49.6.0=py39hf3d152e_3 - simplejson=3.17.2=py39h3811e60_2 - - six=1.15.0=pyh9f0ad1d_0 + - six=1.16.0=pyh6c4a22f_0 - slacker=0.14.0=py_0 - - smart_open=4.2.0=pyhd8ed1ab_0 + - smart_open=5.0.0=pyhd8ed1ab_0 - smmap=3.0.5=pyh44b312d_0 - - snakemake=6.0.5=0 - - snakemake-minimal=6.0.5=py_0 - - sqlite=3.34.0=h74cdb3f_0 + - snakemake=6.4.0=hdfd78af_0 + - snakemake-minimal=6.4.0=pyhdfd78af_0 + - sqlite=3.35.5=h74cdb3f_0 + - stone=3.2.1=pyhd8ed1ab_0 + - stopit=1.1.2=py_0 - sysroot_linux-64=2.12=h77966d4_13 - - terminado=0.9.3=py39hf3d152e_0 - - testpath=0.4.4=py_0 - tk=8.6.10=h21135ba_1 - tktable=2.10=hb7b940f_3 - toml=0.10.2=pyhd8ed1ab_0 - toposort=1.6=pyhd8ed1ab_0 - - tornado=6.1=py39h3811e60_1 - traitlets=5.0.5=py_0 - typing-extensions=3.7.4.3=0 - typing_extensions=3.7.4.3=py_0 @@ -403,14 +334,13 @@ dependencies: - urllib3=1.26.4=pyhd8ed1ab_0 - veracitools=0.1.3=py_0 - wcwidth=0.2.5=pyh9f0ad1d_2 - - webencodings=0.5.1=py_1 - wheel=0.36.2=pyhd3deb0d_0 - wrapt=1.12.1=py39h3811e60_3 - xmlrunner=1.7.7=py_0 - xorg-kbproto=1.0.7=h7f98852_1002 - xorg-libice=1.0.10=h7f98852_0 - xorg-libsm=1.2.3=hd9c2040_1000 - - xorg-libx11=1.7.0=h7f98852_0 + - xorg-libx11=1.7.1=h7f98852_0 - xorg-libxau=1.0.9=h7f98852_0 - xorg-libxdmcp=1.1.3=h7f98852_0 - xorg-libxext=1.3.4=h7f98852_1 @@ -422,8 +352,6 @@ dependencies: - xz=5.2.5=h516909a_1 - yaml=0.2.5=h516909a_0 - yarl=1.6.3=py39h3811e60_1 - - zeromq=4.3.4=h9c3ff4c_0 - zipp=3.4.1=pyhd8ed1ab_0 - zlib=1.2.11=h516909a_1010 - - zstd=1.4.9=ha95c52a_0 - + - zstd=1.5.0=ha95c52a_0 \ No newline at end of file diff --git a/config.yaml b/config.yaml index e9bd288..77f26f5 100644 --- a/config.yaml +++ b/config.yaml @@ -8,7 +8,7 @@ # ParentCall2 # #---------------# # the filtered VCF file with your genotype likelihoods: -vcf: "tons_of_snps.vcf" +vcf: "out.14.missrecode.vcf" # Instructions to create pedigree file: https://sourceforge.net/p/lep-map3/wiki/software/LepMap3 Home/#parentcall2 # the pedigree file associated with your data @@ -39,7 +39,7 @@ informative: "informativeMask=123" # JoinSingles2ALL # #-------------------# # set this to false if you want to skip joining singles (0) to linkage groups -run_joinsingles2all: true +run_joinsingles2all: false # these are the parameters for JoinSingles2ALL, and are highly data-dependent # start with lower values for lod_limit and increase as necessary @@ -57,7 +57,7 @@ lod_difference: "lodDifference=2" exp_lg: 24 # Set iterations to the number of iterations you want per chromosome (more is better). Recommend 30 or more -iterations: 5 +iterations: 100 # Set threads_per to the number of threads you would like to use per linkage group for ordering threads_per: 5 @@ -68,13 +68,15 @@ dist_method: "useKosambi=1" # number of iterations to use of OrderMarkers2 phasing. # this number is typically 1-2 -phase_iterations: 1 +phase_iterations: 2 #-----------------# # Edge Trimming # #-----------------# # Edge trimming will examine the first and last X% of markers in a linkage group # and remove clusters that are N centimorgans away from the next marker +# you can "skip" trimming by making edge_length really low (e.g. 1) +# and trim_cutoff really high (e.g. 40) # Set edge_length to the percent number of markers you would like to examine from either end of the linkage group # Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" @@ -93,7 +95,7 @@ run_lepanchor: false # the path to the genome assembly you are trying to anchor # ideally it *is not* gzipped and ends in .fa, but there are built-in workarounds if it is. -assembly: "boa.constrictor.fasta" +assembly: "YFT.genome.latest.fasta" # the number of linkage groups you have lg_count: 24 From e46c9774a6b7150b7735fef77705545f96c8a920 Mon Sep 17 00:00:00 2001 From: Pavel Dimens Date: Thu, 20 May 2021 14:19:23 -0400 Subject: [PATCH 3/6] ways to skip trimming --- rules/trim.smk | 2 +- scripts/TrimSummaryPlot.r | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/rules/trim.smk b/rules/trim.smk index 8a1c2bf..d3eca2f 100644 --- a/rules/trim.smk +++ b/rules/trim.smk @@ -39,7 +39,7 @@ rule trim_summary: echo "n_removed map" > {output.summary}.tmp cut -d" " -f1 {output.detailed} | uniq -c >> {output.summary}.tmp column -t {output.summary}.tmp > {output.summary} && rm {output.summary}.tmp + scripts/TrimSummaryPlot.r {output.summary} {params.lg} echo "Merging QC plots for all linkage groups" convert -density 300 {input.plots} {output.mergeplots} - scripts/TrimSummaryPlot.r {output.summary} {params.lg} """ diff --git a/scripts/TrimSummaryPlot.r b/scripts/TrimSummaryPlot.r index 8d27d64..af65195 100755 --- a/scripts/TrimSummaryPlot.r +++ b/scripts/TrimSummaryPlot.r @@ -6,17 +6,22 @@ args <- commandArgs(trailingOnly = TRUE) # args[1] is the summary file # args[2] is the expected number of linkage groups (to fill in zeroes) -# read in the summary table and pop out the LG number -tbl <- read_table(args[1]) %>% - rowwise() %>% - mutate(map = as.numeric(strsplit(map, "\\.")[[1]][2])) %>% - suppressMessages() +# if the summary file is empty (b/c nothing was trimmed) +if (length(readLines(args[1])) < 2){ + tbl <- data.frame(map = 1:args[2], n_removed = 0) +} else { + # read in the summary table and pop out the LG number + tbl <- read_table(args[1]) %>% + rowwise() %>% + mutate(map = as.numeric(strsplit(map, "\\.")[[1]][2])) %>% + suppressMessages() -# find the max number of linkage groups -lg_num <- 1:args[2] -missing_lg <- lg_num[!(lg_num %in% tbl$map)] -missings <- data.frame(n_removed = 0, map = missing_lg) -tbl <- merge(tbl, missings, on = map, all = TRUE) + # find the max number of linkage groups + lg_num <- 1:args[2] + missing_lg <- lg_num[!(lg_num %in% tbl$map)] + missings <- data.frame(n_removed = 0, map = missing_lg) + tbl <- merge(tbl, missings, on = map, all = TRUE) +} tbl %>% ggplot(aes(map, n_removed)) + @@ -27,6 +32,7 @@ tbl %>% ) + geom_point(shape = 21, fill = "#4aa6ff", color = "transparent", size = 5) + scale_x_continuous(labels = as.character(tbl$map), breaks = tbl$map) + + scale_y_continuous(limits = c(0, NA)) + labs( title = "Number of markers removed via trimming", x = "Linkage Group", From fc1f4132ab874b6ac0e4df1e6604196233d72337 Mon Sep 17 00:00:00 2001 From: Pavel Dimens Date: Fri, 21 May 2021 15:52:29 -0400 Subject: [PATCH 4/6] fix agp creation --- rules/LepAnchor.smk | 4 +-- rules/trim.smk | 14 +++----- scripts/LAMidpointSummary.r | 68 +++++++++++++++++++++++++++++++++++++ scripts/LA_summary.r | 1 - scripts/TrimCounts.r | 20 +++++++++++ scripts/TrimSummaryPlot.r | 31 +++++------------ 6 files changed, 103 insertions(+), 35 deletions(-) create mode 100755 scripts/LAMidpointSummary.r create mode 100755 scripts/TrimCounts.r diff --git a/rules/LepAnchor.smk b/rules/LepAnchor.smk index 1a4ab7e..b29ac15 100644 --- a/rules/LepAnchor.smk +++ b/rules/LepAnchor.smk @@ -323,8 +323,8 @@ rule unused: """ cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos};awk '($5!="U"){{print $6}}' {input.agp}) > {output.txt} grep -F -w -f {output.txt} {input.lengths} | awk '{{print $1,1,$2,1,"W",$1,1,$2,"+"}}' > {output.agp} - cat {input.agp} {output.agp} > {output.final_agp} - cat {input.scaff_agp} {output.agp} > {output.scaff_agp} + cat {input.agp} > {output.final_agp} + cat {input.scaff_agp} > {output.scaff_agp} """ rule build_scaffold_fasta: diff --git a/rules/trim.smk b/rules/trim.smk index d3eca2f..b2295eb 100644 --- a/rules/trim.smk +++ b/rules/trim.smk @@ -29,17 +29,13 @@ rule trim_summary: priority: 1 shell: """ - echo "# this is a summary of which markers were removed from which linkage group via trimming distant edge clusters" >> {output.detailed} - echo -e "LG\trm_marker" >> {output.detailed} for each in 5_Trim/logs/ordered.*.removed ; do BASE=$(basename $each | cut -d "." -f1,2) - sed -e "s/^/$BASE /" $each >> {output.detailed}.tmp - done - sort -V {output.detailed}.tmp > {output.detailed} && rm {output.detailed}.tmp - echo "n_removed map" > {output.summary}.tmp - cut -d" " -f1 {output.detailed} | uniq -c >> {output.summary}.tmp - column -t {output.summary}.tmp > {output.summary} && rm {output.summary}.tmp - scripts/TrimSummaryPlot.r {output.summary} {params.lg} + sed -e "s/^/$BASE /" $each >> {output.detailed}.tmp + done | sort -V > {output.detailed} + #sort -V {output.detailed}.tmp >> {output.detailed} && rm {output.detailed}.tmp + scripts/TrimCounts.r {output.detailed} {params.lg} > {output.summary} + scripts/TrimSummaryPlot.r {output.summary} echo "Merging QC plots for all linkage groups" convert -density 300 {input.plots} {output.mergeplots} """ diff --git a/scripts/LAMidpointSummary.r b/scripts/LAMidpointSummary.r new file mode 100755 index 0000000..23dd908 --- /dev/null +++ b/scripts/LAMidpointSummary.r @@ -0,0 +1,68 @@ +#! /usr/bin/env Rscript + +suppressMessages(library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)) + +args <- commandArgs(trailingOnly = TRUE) +# args[1] = mareydata file +# args[2] = marey midpoint data (optional) + +allmaps <- suppressMessages(read_tsv(gzfile(args[1]), col_names = FALSE)) %>% + select(X3, X2, X5) %>% + rename(lg = X3, Mb = X2, cM = X5) %>% + mutate(Mb = Mb/1000000) %>% + group_by(lg) %>% + mutate(marker = seq_along(Mb)) + +allmaps %>% + ggplot(aes(x = Mb, y = cM)) + + geom_point(size = 0.6, color = "dodgerblue") + + labs( + title = "Marker Positions in Anchored+Oriented Assembly", + subtitle = "The relative marker positions vs their position in the chromosome/LG", + x = "Physical Position (Mbp)", + y = "Genetic Position (cM)" + ) + + theme( + legend.position = "top", + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-5, -5, -5, -5), + legend.title = element_blank(), + legend.text = element_text(size = 13), + axis.text.x = element_text(colour = "black", size = 11), + axis.text.y = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13), + axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), + axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) + ) + + facet_wrap(~lg, ncol = 4, scales = "free") + +outfile <- paste0(dirname(args[1]), "/LepAnchor.midpoint.mareymaps.pdf") +savedims <- length(unique(allmaps$lg)) * 2.5 + +ggsave(outfile, width = 8.5, height = savedims/4, units = "in") + +allmaps %>% + ggplot(aes(x = marker, y = cM)) + + geom_point(size = 0.6, alpha = 0.5, color = "dodgerblue") + + labs( + title = "Relative Marker Positions within Linkge Groups", + subtitle = "The distance of sequential markers from each other in the linkage maps", + x = "Marker Number", + y = "Genetic Position (cM)" + ) + + theme( + legend.position = "top", + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-5, -5, -5, -5), + legend.title = element_blank(), + legend.text = element_text(size = 13), + axis.text.x = element_text(colour = "black", size = 11), + axis.text.y = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13), + axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), + axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) + ) + + facet_wrap(~lg, ncol = 4, scales = "free_x") + +outfile2 <- paste0(dirname(args[1]), "/LepAnchor.midpoint.sequentialmaps.pdf") +ggsave(outfile2, width = 8.5, height = savedims/5, units = "in", limitsize = FALSE) \ No newline at end of file diff --git a/scripts/LA_summary.r b/scripts/LA_summary.r index 4c18b8e..34ecea7 100755 --- a/scripts/LA_summary.r +++ b/scripts/LA_summary.r @@ -4,7 +4,6 @@ suppressMessages(library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)) args <- commandArgs(trailingOnly = TRUE) # args[1] = mareydata file -# args[2] = marey midpoint data (optional) allmaps <- suppressMessages(read_tsv(gzfile(args[1]), col_names = FALSE)) %>% select(X3, X2, X5, X6) %>% diff --git a/scripts/TrimCounts.r b/scripts/TrimCounts.r new file mode 100755 index 0000000..b0faefb --- /dev/null +++ b/scripts/TrimCounts.r @@ -0,0 +1,20 @@ +#! /usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) +# args[1] = trim.details file +# args[2] = number of LG + +if (length(readLines(args[1])) < 1){ + tbl <- 0 + vals <- 0 +} else { + tbl <- read.table(args[1], header = FALSE) + # split lg filenames to pull out lg number + vals <- as.numeric(sapply(tbl[,1], function(y){strsplit(y, "\\.")[[1]][2]})) +} +# count the occurence of 1:n_lg +counts <- sapply(1:args[2], function(x){sum(vals == x)}) +# create dataframe of counts +out_df <- data.frame(lg = 1:args[2], n_removed = counts) +# print to terminal +print(out_df, row.names = FALSE) \ No newline at end of file diff --git a/scripts/TrimSummaryPlot.r b/scripts/TrimSummaryPlot.r index af65195..8b4cd60 100755 --- a/scripts/TrimSummaryPlot.r +++ b/scripts/TrimSummaryPlot.r @@ -4,40 +4,25 @@ suppressMessages(library(tidyverse, quietly = TRUE)) args <- commandArgs(trailingOnly = TRUE) # args[1] is the summary file -# args[2] is the expected number of linkage groups (to fill in zeroes) -# if the summary file is empty (b/c nothing was trimmed) -if (length(readLines(args[1])) < 2){ - tbl <- data.frame(map = 1:args[2], n_removed = 0) -} else { - # read in the summary table and pop out the LG number - tbl <- read_table(args[1]) %>% - rowwise() %>% - mutate(map = as.numeric(strsplit(map, "\\.")[[1]][2])) %>% - suppressMessages() - - # find the max number of linkage groups - lg_num <- 1:args[2] - missing_lg <- lg_num[!(lg_num %in% tbl$map)] - missings <- data.frame(n_removed = 0, map = missing_lg) - tbl <- merge(tbl, missings, on = map, all = TRUE) -} +# read in the summary table and pop out the LG number +tbl <- read_table(args[1]) %>% suppressMessages() tbl %>% - ggplot(aes(map, n_removed)) + + ggplot(aes(lg, n_removed)) + geom_segment( - aes(map, n_removed, xend = map, yend=0), + aes(lg, n_removed, xend = lg, yend=0), size = 5, color = "#78bcff" ) + geom_point(shape = 21, fill = "#4aa6ff", color = "transparent", size = 5) + - scale_x_continuous(labels = as.character(tbl$map), breaks = tbl$map) + - scale_y_continuous(limits = c(0, NA)) + + scale_x_continuous(labels = as.character(tbl$lg), breaks = tbl$lg) + labs( title = "Number of markers removed via trimming", x = "Linkage Group", y = "Number removed" - ) - + ) + + scale_y_continuous(limits = c(-0.05, max(tbl$n_removed)+10)) + ggsave(paste0(args[1], ".pdf"), height = 4, width = 7, units = "in") ggsave(paste0(args[1], ".svg"), height = 4, width = 7, units = "in") \ No newline at end of file From 22969b30342e77783fe255246d006e663ea5a4ad Mon Sep 17 00:00:00 2001 From: Pavel Dimens Date: Tue, 25 May 2021 14:02:05 -0400 Subject: [PATCH 5/6] more tweaks --- rules/LepAnchor.smk | 14 +++++--- scripts/LASummary.r | 68 +++++++++++++++++++++++++++++++++++++++ scripts/LASummarySexAvg.r | 68 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+), 5 deletions(-) create mode 100755 scripts/LASummary.r create mode 100755 scripts/LASummarySexAvg.r diff --git a/rules/LepAnchor.smk b/rules/LepAnchor.smk index b29ac15..226c3d9 100644 --- a/rules/LepAnchor.smk +++ b/rules/LepAnchor.smk @@ -357,13 +357,13 @@ rule mareymap_data: agp = expand("10_Anchoring/agp/chr.{lgs}.agp", lgs = lg_range) output: mareydata = "11_MareyMaps/marey.data.gz", - midpoints = "11_MareyMaps/marey.midpoints_data.gz" + sexavg = "11_MareyMaps/marey.data.sexavg.gz" log: "11_MareyMaps/missing_scaffolds.txt" message: """ Creating Marey map interval data first points in uncertainty intervals | {output.mareydata} - midpoints in uncertainty intervals | {output.midpoints} + midpoints in uncertainty intervals | {output.sexavg} """ params: chrom = lg @@ -377,20 +377,24 @@ rule mareymap_data: for c in $(seq 1 {params.chrom}) do awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 10_Anchoring/agp/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip - done > {output.midpoints} 2> /dev/null + done > {output.sexavg} 2> /dev/null """ rule mareymaps: input: data = "11_MareyMaps/marey.data.gz", + sexavg = "11_MareyMaps/marey.data.sexavg.gz", agp = expand("10_Anchoring/agp/chr.{lgs}.agp", lgs = lg_range) output: indiv_plots = expand("11_MareyMaps/LG.{lgs}.mareymap.png", lgs = lg_range), summary = "11_MareyMaps/LepAnchor.mareymaps.pdf", - sequential = "11_MareyMaps/LepAnchor.sequentialmaps.pdf" + sequential = "11_MareyMaps/LepAnchor.sequentialmaps.pdf", + SAsummary = "11_MareyMaps/LepAnchor.sexavg.mareymaps.pdf", + SAsequential = "11_MareyMaps/LepAnchor.sexavg.sequentialmaps.pdf" message: "Creating Marey Maps" shell: """ Rscript software/LepAnchor/scripts/plot_marey.R {input.data} 10_Anchoring/agp - Rscript scripts/LA_summary.r {input.data} + Rscript scripts/LASummary.r {input.data} + Rscript scripts/LASummarySexAvg.r {input.sexavg} """ \ No newline at end of file diff --git a/scripts/LASummary.r b/scripts/LASummary.r new file mode 100755 index 0000000..34ecea7 --- /dev/null +++ b/scripts/LASummary.r @@ -0,0 +1,68 @@ +#! /usr/bin/env Rscript + +suppressMessages(library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)) + +args <- commandArgs(trailingOnly = TRUE) +# args[1] = mareydata file + +allmaps <- suppressMessages(read_tsv(gzfile(args[1]), col_names = FALSE)) %>% + select(X3, X2, X5, X6) %>% + rename(lg = X3, Mb = X2, male = X5, female = X6) %>% + mutate(Mb = Mb/1000000) %>% + group_by(lg) %>% + mutate(marker = seq_along(Mb)) %>% + pivot_longer(c(male, female), names_to = "sex", values_to = "cM" ) + +allmaps %>% + ggplot(aes(x = Mb, y = cM, color = sex)) + + geom_point(size = 0.6) + + labs( + title = "Marker Positions in Anchored+Oriented Assembly", + subtitle = "The relative marker positions vs their position in the chromosome/LG", + x = "Physical Position (Mbp)", + y = "Genetic Position (cM)" + ) + + theme( + legend.position = "top", + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-5, -5, -5, -5), + legend.title = element_blank(), + legend.text = element_text(size = 13), + axis.text.x = element_text(colour = "black", size = 11), + axis.text.y = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13), + axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), + axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) + ) + + facet_wrap(~lg, ncol = 4, scales = "free") + +outfile <- paste0(dirname(args[1]), "/LepAnchor.mareymaps.pdf") +savedims <- length(unique(allmaps$lg)) * 2.5 + +ggsave(outfile, width = 8.5, height = savedims/4, units = "in") + +allmaps %>% + ggplot(aes(x = marker, y = cM)) + + geom_point(size = 0.6, alpha = 0.5) + + labs( + title = "Relative Marker Positions within Linkge Groups", + subtitle = "The distance of sequential markers from each other in the linkage maps", + x = "Marker Number", + y = "Genetic Position (cM)" + ) + + theme( + legend.position = "top", + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-5, -5, -5, -5), + legend.title = element_blank(), + legend.text = element_text(size = 13), + axis.text.x = element_text(colour = "black", size = 11), + axis.text.y = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13), + axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), + axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) + ) + + facet_wrap(lg ~ sex, ncol = 2, scales = "free_x") + +outfile2 <- paste0(dirname(args[1]), "/LepAnchor.sequentialmaps.pdf") +ggsave(outfile2, width = 8.5, height = savedims, units = "in", limitsize = FALSE) \ No newline at end of file diff --git a/scripts/LASummarySexAvg.r b/scripts/LASummarySexAvg.r new file mode 100755 index 0000000..e176af4 --- /dev/null +++ b/scripts/LASummarySexAvg.r @@ -0,0 +1,68 @@ +#! /usr/bin/env Rscript + +suppressMessages(library(tidyverse, warn.conflicts = FALSE, quietly = TRUE)) + +args <- commandArgs(trailingOnly = TRUE) +# args[1] = mareydata file +# args[2] = marey midpoint data (optional) + +allmaps <- suppressMessages(read_tsv(gzfile(args[1]), col_names = FALSE)) %>% + select(X3, X2, X5) %>% + rename(lg = X3, Mb = X2, cM = X5) %>% + mutate(Mb = Mb/1000000) %>% + group_by(lg) %>% + mutate(marker = seq_along(Mb)) + +allmaps %>% + ggplot(aes(x = Mb, y = cM)) + + geom_point(size = 0.6, color = "dodgerblue") + + labs( + title = "Marker Positions in Anchored+Oriented Assembly", + subtitle = "The relative marker positions vs their position in the chromosome/LG", + x = "Physical Position (Mbp)", + y = "Genetic Position (cM)" + ) + + theme( + legend.position = "top", + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-5, -5, -5, -5), + legend.title = element_blank(), + legend.text = element_text(size = 13), + axis.text.x = element_text(colour = "black", size = 11), + axis.text.y = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13), + axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), + axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) + ) + + facet_wrap(~lg, ncol = 4, scales = "free") + +outfile <- paste0(dirname(args[1]), "/LepAnchor.sexavg.mareymaps.pdf") +savedims <- length(unique(allmaps$lg)) * 2.5 + +ggsave(outfile, width = 8.5, height = savedims/4, units = "in") + +allmaps %>% + ggplot(aes(x = marker, y = cM)) + + geom_point(size = 0.6, alpha = 0.5, color = "dodgerblue") + + labs( + title = "Relative Marker Positions within Linkge Groups", + subtitle = "The distance of sequential markers from each other in the linkage maps", + x = "Marker Number", + y = "Genetic Position (cM)" + ) + + theme( + legend.position = "top", + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-5, -5, -5, -5), + legend.title = element_blank(), + legend.text = element_text(size = 13), + axis.text.x = element_text(colour = "black", size = 11), + axis.text.y = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13), + axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), + axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) + ) + + facet_wrap(~lg, ncol = 4, scales = "free_x") + +outfile2 <- paste0(dirname(args[1]), "/LepAnchor.sexavg.sequentialmaps.pdf") +ggsave(outfile2, width = 8.5, height = savedims/5, units = "in", limitsize = FALSE) \ No newline at end of file From 6c4aef82df3c0d415f63e5ace18d2cb4b64efba7 Mon Sep 17 00:00:00 2001 From: Pavel Dimens Date: Thu, 27 May 2021 13:23:02 -0400 Subject: [PATCH 6/6] add trimming to LepAnchor --- conda_setup.yml | 10 ++- config.yaml | 26 ++++-- rules/LepAnchor.smk | 194 ++++++++++++++++++++++++++++++++------------ scripts/LASummary.r | 48 +++++------ scripts/LATrim.r | 156 +++++++++++++++++++++++++++++++++++ 5 files changed, 351 insertions(+), 83 deletions(-) create mode 100755 scripts/LATrim.r diff --git a/conda_setup.yml b/conda_setup.yml index 5e596f7..c81b970 100644 --- a/conda_setup.yml +++ b/conda_setup.yml @@ -1,5 +1,6 @@ name: lepwrap channels: + - conda-forge/label/main - bioconda - conda-forge - defaults @@ -148,7 +149,7 @@ dependencies: - multidict=5.1.0=py39h3811e60_1 - nbformat=5.1.3=pyhd8ed1ab_0 - ncurses=6.2=h58526e2_4 - - networkx=2.3=py_0 + - networkx=2.5=py_0 - numpy=1.20.3=py39hdbf815f_0 - oauth2client=4.1.3=py_0 - oauthlib=3.0.1=py_0 @@ -206,6 +207,7 @@ dependencies: - r-cli=2.5.0=r40hc72bb7e_0 - r-clipr=0.7.1=r40h142f84f_0 - r-colorspace=2.0_1=r40hcfec24a_0 + - r-cowplot=1.1.1=r40hc72bb7e_0 - r-cpp11=0.2.7=r40hc72bb7e_0 - r-crayon=1.4.1=r40hc72bb7e_0 - r-curl=4.3.1=r40hcfec24a_0 @@ -223,9 +225,11 @@ dependencies: - r-farver=2.1.0=r40h03ef668_0 - r-forcats=0.5.1=r40hc72bb7e_0 - r-fs=1.5.0=r40h0357c0b_0 + - r-fuzzyjoin=0.1.6=r40_0 - r-gargle=1.1.0=r40hc72bb7e_0 - r-gdtools=0.2.2=r40h36050f4_1 - r-generics=0.1.0=r40hc72bb7e_0 + - r-geosphere=1.5_10=r40hcfec24a_2 - r-ggplot2=3.3.3=r40hc72bb7e_0 - r-glue=1.4.2=r40hcfec24a_0 - r-googledrive=1.0.1=r40h6115d3f_1 @@ -281,6 +285,8 @@ dependencies: - r-rvest=1.0.0=r40hc72bb7e_0 - r-scales=1.1.1=r40h6115d3f_0 - r-selectr=0.4_2=r40h6115d3f_1 + - r-sp=1.4_5=r40hcfec24a_0 + - r-stringdist=0.9.6.3=r40hcfec24a_0 - r-stringi=1.6.2=r40hcabe038_0 - r-stringr=1.4.0=r40h6115d3f_2 - r-svglite=2.0.0=r40h03ef668_0 @@ -354,4 +360,4 @@ dependencies: - yarl=1.6.3=py39h3811e60_1 - zipp=3.4.1=pyhd8ed1ab_0 - zlib=1.2.11=h516909a_1010 - - zstd=1.5.0=ha95c52a_0 \ No newline at end of file + - zstd=1.5.0=ha95c52a_0 diff --git a/config.yaml b/config.yaml index 77f26f5..d32e040 100644 --- a/config.yaml +++ b/config.yaml @@ -60,7 +60,7 @@ exp_lg: 24 iterations: 100 # Set threads_per to the number of threads you would like to use per linkage group for ordering -threads_per: 5 +threads_per: 2 # Choose your distance method by commenting the line you dont want dist_method: "useKosambi=1" @@ -76,14 +76,14 @@ phase_iterations: 2 # Edge trimming will examine the first and last X% of markers in a linkage group # and remove clusters that are N centimorgans away from the next marker # you can "skip" trimming by making edge_length really low (e.g. 1) -# and trim_cutoff really high (e.g. 40) +# and trim_cutoff really high (e.g. 50) # Set edge_length to the percent number of markers you would like to examine from either end of the linkage group # Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" -edge_length: 15 +edge_length: 1 # Set trim_cuttoff to the centiMorgan distance cutoff (10 is reasonable) -trim_cutoff: 10 +trim_cutoff: 40 @@ -91,7 +91,7 @@ trim_cutoff: 10 #### Lep-Anchor #### #################### # change this to true if you also want to run Lep-Anchor -run_lepanchor: false +run_lepanchor: true # the path to the genome assembly you are trying to anchor # ideally it *is not* gzipped and ends in .fa, but there are built-in workarounds if it is. @@ -114,3 +114,19 @@ proximity_file: "/dev/null" OS_info: "Ubuntu" #OS_info: "CentOS5" #OS_info: "CentOS6" + + +#-----------------# +# Edge Trimming # +#-----------------# +# Edge trimming will examine the first and last X% of markers in a linkage group +# and remove clusters that are N% centimorgans (of the total cM span) away from +# the next marker. You can "skip" trimming by making edge_length really low (e.g. 1) +# and trim_cutoff really high (e.g. 50) + +# Set edge_length to the percent number of markers you would like to examine from either end of the linkage group +# Value can be an integer or decimal, i.e. 15 is the same as 0.15, which both mean "15%" +LA_edge_length: 20 + +# Set trim_cuttoff to the centiMorgan distance cutoff (5 is reasonable) +LA_trim_cutoff: 3 \ No newline at end of file diff --git a/rules/LepAnchor.smk b/rules/LepAnchor.smk index 226c3d9..6a6c666 100644 --- a/rules/LepAnchor.smk +++ b/rules/LepAnchor.smk @@ -9,22 +9,32 @@ proximity = config["proximity_file"] lg = config["lg_count"] lg_range = list(range(1,lg+1)) os_name = config["OS_info"] +edgelen = config["LA_edge_length"] +trimdist = config["LA_trim_cutoff"] rule all: input: - fasta = "10_Anchoring/Anchored.contigs.fa.gz", - scaff = "10_Anchoring/Anchored.scaffolds.fa.gz", - mareydata = "11_MareyMaps/marey.data.gz", - mareymaps = "11_MareyMaps/LepAnchor.mareymaps.pdf" + fasta = "12_Fasta/Anchored.contigs.fa.gz", + scaff = "12_Fasta/Anchored.scaffolds.fa.gz", + fastaonly = "12_Fasta/Anchored.contigs.only.fa.gz", + scaffonly = "12_Fasta/Anchored.scaffolds.only.fa.gz", + mareydata = "13_MareyMaps/data.marey.gz", + mareymaps = "13_MareyMaps/LepAnchor.mareymaps.pdf", + trimmedmareymaps = "15_TrimNewIntervals/LepAnchor.mareymaps.pdf", + trimsummary = "15_TrimNewIntervals/LA.trim.summary.pdf" message: """ Lep-Anchor has finished. Good luck with the rest of your analyses! Output Files: ============= - contig fasta file | 10_Anchoring/Anchored.contigs.fa.gz - scaffold fasta file | 10_Anchoring/Anchored.scaffolds.fa.gz - converted linakge maps | 11_MareyMaps/marey.data.gz - summary MareyMaps | 11_MareyMaps/LepAnchor.mareymaps.pdf + contig fasta file | {input.fasta} + contigs-only fasta | {input.fastaonly} + scaffold fasta file | {input.scaff} + scaffolds-only fasta | {input.scaffonly} + converted linakge maps | {input.mareydata} + untrimmed marey maps | {input.mareymaps} + interval trimming summary | {input.trimsummary} + trimmed marey maps | {input.trimmedmareymaps} """ rule repeatmask: @@ -123,7 +133,7 @@ rule chain_2: rule extract_markers: input: "2_Filtering/data.filtered.lepmap3.gz" - output: "snps.txt" + output: report("snps.txt", category = "Data") message: "Extracting marker information from Lep-Map3 data file {input}" shell: "scripts/extract_markers.sh {input}" @@ -133,7 +143,7 @@ rule generate_intervals: markers = "snps.txt", intervals = expand("7_Intervals/ordered.{x}.intervals", x = range(1, lg + 1)) output: - intervals = "10_Anchoring/lepmap3_intervals.la" + intervals = report("10_Anchoring/lepmap3_intervals.la", category = "Data") message: "Combining {params} Lep-Map3 interval files into single LepAnchor input {output}" params: lg = lg @@ -146,13 +156,13 @@ rule generate_intervals: rule contiglengths: input: geno - output: "10_Anchoring/contigs.length" + output: report("10_Anchoring/contigs.length", category = "Data") message: "Getting contig lengths" shell: "gunzip -fc {input} | awk -f software/LepAnchor/scripts/contigLength.awk > {output}" rule find_haplotypes: input: "9_Chain/chainfile.gz" - output: "10_Anchoring/fullHaplotypes50.txt" + output: report("10_Anchoring/fullHaplotypes50.txt", category = "Logs") message: "Finding full haplotypes (potential chimeric contigs)" shell: """ @@ -166,8 +176,8 @@ rule liftover: intervals = "10_Anchoring/lepmap3_intervals.la", haplos = "10_Anchoring/fullHaplotypes50.txt" output: - lift = "10_Anchoring/liftover.la", - sortedlift = "10_Anchoring/liftover.sorted.la" + lift = report("10_Anchoring/liftover.la", category = "Lifted Intervals"), + sortedlift = report("10_Anchoring/liftover.sorted.la", category = "Lifted Intervals") message: "Running liftoverHaplotypes for the input maps" shell: """ @@ -178,7 +188,7 @@ rule liftover: rule cleanmap: input: "10_Anchoring/liftover.sorted.la" output: "10_Anchoring/map_all.clean" - log: "10_Anchoring/cleamap.log" + log: report("10_Anchoring/cleamap.log", category = "Logs") message: "Running CleanMap" shell: "java -cp software/LepAnchor CleanMap map={input} > {output} 2> {log}" @@ -187,7 +197,7 @@ rule map2bed: cleanmap = "10_Anchoring/map_all.clean", lengths = "10_Anchoring/contigs.length", output: "10_Anchoring/map.bed" - log: "10_Anchoring/map2bed.log" + log: report("10_Anchoring/map2bed.log", category = "Logs") message: "Running Map2Bed" shell: "java -cp software/LepAnchor Map2Bed map={input.cleanmap} contigLength={input.lengths} > {output} 2> {log}" @@ -218,7 +228,7 @@ rule place_orient: output: chrom = "10_Anchoring/orient_1/chr.{lg_range}.la" log: - chrom = "10_Anchoring/orient_1/logs/chr.{lg_range}.la.err" + chrom = report("10_Anchoring/orient_1/logs/chr.{lg_range}.la.err", category = "Anchoring I Logs") params: chrom = "{lg_range}" message: "Running PlaceAndOrientContigs for linkage group {params.chrom}" @@ -264,7 +274,7 @@ rule place_orient2: output: chrom = "10_Anchoring/orient_2/ichr.{lg_range}.la" log: - chrom = "10_Anchoring/orient_2/logs/ichr.{lg_range}.la.err" + chrom = report("10_Anchoring/orient_2/logs/ichr.{lg_range}.la.err", category = "Anchoring II Logs") params: chrom = "{lg_range}" message: "Running a second iteration of PlaceAndOrientContigs for linkage group {params.chrom}" @@ -278,8 +288,8 @@ rule prune: oriented = expand("10_Anchoring/orient_2/ichr.{lgs}.la", lgs = lg_range), bedfile = "10_Anchoring/map_propogated.bed" output: - pruned = "10_Anchoring/orient_2/pruned.la", - cleaned = "10_Anchoring/overlaps_rm.la" + pruned = report("10_Anchoring/orient_2/pruned.la", category = "Logs"), + cleaned = report("10_Anchoring/overlaps_rm.la", category = "Logs") message: "Pruning contig blocks without map support and removing overlaps" params: chrom = lg @@ -296,55 +306,93 @@ rule construct_agp: input: cleaned = "10_Anchoring/overlaps_rm.la" output: - agp = "10_Anchoring/agp/chr.{lg_range}.agp", - scaff_agp = "10_Anchoring/agp_scaffolds/chr.{lg_range}.scaffolds.agp" + agp = report("11_AGP/contigs/chr.{lg_range}.agp", category = "Contig AGP Files"), + scaff_agp = report("11_AGP/scaffolds/chr.{lg_range}.scaffolds.agp", category = "Scaffold AGP Files") message: "Creating AGP files for linkage group {params.chrom}" params: chrom = "{lg_range}" shell: """ - awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp_full2.awk - > 10_Anchoring/agp/chr.{params.chrom}.agp - awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp2.awk - > 10_Anchoring/agp_scaffolds/chr.{params.chrom}.scaffolds.agp + awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp_full2.awk - > {output.agp} + awk -vn={params.chrom} '($5==n)' {input.cleaned} | awk -vprefix="LG" -vlg={params.chrom} -f software/LepAnchor/scripts/makeagp2.awk - > {output.scaff_agp} """ rule unused: input: lengths = "10_Anchoring/contigs.length", haplos = "10_Anchoring/fullHaplotypes50.txt", - agp = expand("10_Anchoring/agp/chr.{lgs}.agp", lgs = lg_range), - scaff_agp = expand("10_Anchoring/agp_scaffolds/chr.{lgs}.scaffolds.agp", lgs = lg_range) + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), output: - txt = "10_Anchoring/not_used_final.txt", - agp = "10_Anchoring/not_used.agp", - final_agp = "10_Anchoring/REF_LA.agp", - scaff_agp = "10_Anchoring/REF_LA_scaffolds.agp" + txt = "11_AGP/not_used_final.txt", + agp = "11_AGP/not_used.agp" message: "Finding unused contigs" shell: """ cut -f 1 {input.lengths} | grep -v -w -F -f <(cut -f 2 {input.haplos};awk '($5!="U"){{print $6}}' {input.agp}) > {output.txt} grep -F -w -f {output.txt} {input.lengths} | awk '{{print $1,1,$2,1,"W",$1,1,$2,"+"}}' > {output.agp} - cat {input.agp} > {output.final_agp} + """ + +rule build_final_agp: + input: + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range), + scaff_agp = expand("11_AGP/scaffolds/chr.{lgs}.scaffolds.agp", lgs = lg_range), + unused = "11_AGP/not_used.agp", + output: + contig_agp = "11_AGP/lepanchor.contigs.only.agp", + scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp", + contig_all_agp = "11_AGP/lepanchor.contigs.all.agp", + scaff_all_agp = "11_AGP/lepanchor.scaffolds.all.agp" + message: "Generating final AGP files" + shell: + """ + cat {input.agp} > {output.contig_agp} cat {input.scaff_agp} > {output.scaff_agp} + cat {input.agp} {input.unused} > {output.contig_all_agp} + cat {input.scaff_agp} {input.unused} > {output.scaff_all_agp} """ -rule build_scaffold_fasta: +rule build_scaffold_only_fasta: input: assembly = geno, - agp = "10_Anchoring/REF_LA.agp" + agp = "11_AGP/lepanchor.contigs.only.agp" output: - fasta = "10_Anchoring/Anchored.scaffolds.fa.gz", + fasta = "12_Fasta/Anchored.scaffolds.only.fa.gz", + message: "Constructing final scaffold-only fasta file {output.fasta}" + shell: + """ + gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} + """ + +rule build_scaffold_contig_fasta: + input: + assembly = geno, + agp = "11_AGP/lepanchor.contigs.all.agp" + output: + fasta = "12_Fasta/Anchored.scaffolds.fa.gz", message: "Constructing final scaffold fasta file {output.fasta}" shell: """ gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.agp} | gzip > {output.fasta} """ +rule build_contig_only_fasta: + input: + assembly = geno, + scaff_agp = "11_AGP/lepanchor.scaffolds.only.agp" + output: + fasta = "12_Fasta/Anchored.contigs.only.fa.gz" + message: "Constructing final contig-only fasta file {output.fasta}" + shell: + """ + gunzip -fc {input.assembly} | awk -f software/LepAnchor/scripts/makefasta.awk - {input.scaff_agp} | gzip > {output.fasta} + """ + rule build_contig_fasta: input: assembly = geno, - scaff_agp = "10_Anchoring/REF_LA_scaffolds.agp" + scaff_agp = "11_AGP/lepanchor.scaffolds.all.agp" output: - fasta = "10_Anchoring/Anchored.contigs.fa.gz" + fasta = "12_Fasta/Anchored.contigs.fa.gz" message: "Constructing final contig fasta file {output.fasta}" shell: """ @@ -354,11 +402,11 @@ rule build_contig_fasta: rule mareymap_data: input: lift = "10_Anchoring/liftover.la", - agp = expand("10_Anchoring/agp/chr.{lgs}.agp", lgs = lg_range) + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) output: - mareydata = "11_MareyMaps/marey.data.gz", - sexavg = "11_MareyMaps/marey.data.sexavg.gz" - log: "11_MareyMaps/missing_scaffolds.txt" + mareydata = "13_MareyMaps/data.marey.gz", + sexavg = "13_MareyMaps/data.marey.sexavg.gz" + log: report("13_MareyMaps/missing_scaffolds.txt", category = "Logs") message: """ Creating Marey map interval data @@ -371,30 +419,70 @@ rule mareymap_data: """ for c in $(seq 1 {params.chrom}) do - awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 10_Anchoring/agp/chr.$c.agp - | awk -vm=1 '(/LG/ && NF>=4){{if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}}' | gzip + awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NF>=4){{if (NF==4) $5=$4;print $1"\t"$2"\t"$3"\t"m"\t"$4"\t"$5}}' | gzip done > {output.mareydata} 2> {log} for c in $(seq 1 {params.chrom}) do - awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 10_Anchoring/agp/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip + awk -vn=$c '($3==n)' {input.lift} | awk -f software/LepAnchor/scripts/liftover.awk 11_AGP/contigs/chr.$c.agp - | awk -vm=1 '(/LG/ && NR>=4){{if (NF>4) s=0.5; else s=1;print $1"\t"$2"\t"$3"\t"m"\t"s*($4+$5)}}' | gzip done > {output.sexavg} 2> /dev/null """ rule mareymaps: input: - data = "11_MareyMaps/marey.data.gz", - sexavg = "11_MareyMaps/marey.data.sexavg.gz", - agp = expand("10_Anchoring/agp/chr.{lgs}.agp", lgs = lg_range) + data = "13_MareyMaps/data.marey.gz", + sexavg = "13_MareyMaps/data.marey.sexavg.gz", + agp = expand("11_AGP/contigs/chr.{lgs}.agp", lgs = lg_range) output: - indiv_plots = expand("11_MareyMaps/LG.{lgs}.mareymap.png", lgs = lg_range), - summary = "11_MareyMaps/LepAnchor.mareymaps.pdf", - sequential = "11_MareyMaps/LepAnchor.sequentialmaps.pdf", - SAsummary = "11_MareyMaps/LepAnchor.sexavg.mareymaps.pdf", - SAsequential = "11_MareyMaps/LepAnchor.sexavg.sequentialmaps.pdf" + indiv_plots = report(expand("13_MareyMaps/LG.{lgs}.mareymap.png", lgs = lg_range), category = "Marey Maps"), + summary = report("13_MareyMaps/LepAnchor.mareymaps.pdf", category = "Marey Maps") , + sequential = report("13_MareyMaps/LepAnchor.sequentialmaps.pdf", category = "Sequential Maps"), + SAsummary = report("13_MareyMaps/LepAnchor.sexavg.mareymaps.pdf", category = "Marey Maps Sex Avg"), + SAsequential = report("13_MareyMaps/LepAnchor.sexavg.sequentialmaps.pdf", category = "Sequential Maps Sex Avg") message: "Creating Marey Maps" shell: """ - Rscript software/LepAnchor/scripts/plot_marey.R {input.data} 10_Anchoring/agp - Rscript scripts/LASummary.r {input.data} + Rscript software/LepAnchor/scripts/plot_marey.R {input.data} 11_AGP/contigs + Rscript scripts/LASummary.r {input.data} true Rscript scripts/LASummarySexAvg.r {input.sexavg} - """ \ No newline at end of file + """ + +rule generate_updated_intervals: + input: "13_MareyMaps/data.marey.gz" + output: "14_NewIntervals/LA.intervals.{lg_range}" + message: "Splitting out LG {params.chrom} from {input}" + params: + chrom = "{lg_range}" + shell: + """ + zgrep "LG{params.chrom}\s" {input} > {output} + """ + +rule trim_newintervals: + input: "14_NewIntervals/LA.intervals.{lg_range}" + output: + outfile = "15_TrimNewIntervals/LA.intervals.{lg_range}.trimmed", + plot = "15_TrimNewIntervals/plots/LA.intervals.{lg_range}.trim.pdf" + message: "Trimming edge clusters for {input}" + params: + edge = edgelen, + dist = trimdist + shell: "Rscript scripts/LATrim.r {input} {params.dist} {params.edge} 15_TrimNewIntervals" + +rule merge_trimplots: + input: expand("15_TrimNewIntervals/plots/LA.intervals.{lg}.trim.pdf", lg = lg_range) + output: "15_TrimNewIntervals/LA.trim.summary.pdf" + message: "Merging trimming plots into {output}" + shell: "convert -density 200 {input} {output}" + + +rule merge_trimmedintervals: + input: expand("15_TrimNewIntervals/LA.intervals.{lg}.trimmed", lg = lg_range) + output: "15_TrimNewIntervals/data.marey.trimmed.gz" + message: "Concatenating trimmed intervals to {output}" + shell: "cat {input} | gzip -c > {output}" + +rule plot_trimmedintervals: + input: "15_TrimNewIntervals/data.marey.trimmed.gz" + output: report("15_TrimNewIntervals/LepAnchor.mareymaps.pdf", category = "Trimmed Marey Maps") + shell: "Rscript scripts/LASummary.r {input}" \ No newline at end of file diff --git a/scripts/LASummary.r b/scripts/LASummary.r index 34ecea7..5d7b440 100755 --- a/scripts/LASummary.r +++ b/scripts/LASummary.r @@ -41,28 +41,30 @@ savedims <- length(unique(allmaps$lg)) * 2.5 ggsave(outfile, width = 8.5, height = savedims/4, units = "in") -allmaps %>% - ggplot(aes(x = marker, y = cM)) + - geom_point(size = 0.6, alpha = 0.5) + - labs( - title = "Relative Marker Positions within Linkge Groups", - subtitle = "The distance of sequential markers from each other in the linkage maps", - x = "Marker Number", - y = "Genetic Position (cM)" - ) + - theme( - legend.position = "top", - legend.margin = margin(0, 0, 0, 0), - legend.box.margin = margin(-5, -5, -5, -5), - legend.title = element_blank(), - legend.text = element_text(size = 13), - axis.text.x = element_text(colour = "black", size = 11), - axis.text.y = element_text(colour = "black", size = 11), - strip.text = element_text(size = 13), - axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), - axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) +if (length(args) > 1){ + allmaps %>% + ggplot(aes(x = marker, y = cM)) + + geom_point(size = 0.6, alpha = 0.5) + + labs( + title = "Relative Marker Positions within Linkage Groups", + subtitle = "The distance of sequential markers from each other in the linkage maps", + x = "Marker Number", + y = "Genetic Position (cM)" ) + - facet_wrap(lg ~ sex, ncol = 2, scales = "free_x") + theme( + legend.position = "top", + legend.margin = margin(0, 0, 0, 0), + legend.box.margin = margin(-5, -5, -5, -5), + legend.title = element_blank(), + legend.text = element_text(size = 13), + axis.text.x = element_text(colour = "black", size = 11), + axis.text.y = element_text(colour = "black", size = 11), + strip.text = element_text(size = 13), + axis.title.x = element_text(size = 13, margin = margin(t = 5, r = 0, b = 0, l = 0)), + axis.title.y = element_text(size = 13, margin = margin(t = 0, r = 5, b = 0, l = 0)) + ) + + facet_wrap(lg ~ sex, ncol = 2, scales = "free_x") -outfile2 <- paste0(dirname(args[1]), "/LepAnchor.sequentialmaps.pdf") -ggsave(outfile2, width = 8.5, height = savedims, units = "in", limitsize = FALSE) \ No newline at end of file + outfile2 <- paste0(dirname(args[1]), "/LepAnchor.sequentialmaps.pdf") + ggsave(outfile2, width = 8.5, height = savedims, units = "in", limitsize = FALSE) +} \ No newline at end of file diff --git a/scripts/LATrim.r b/scripts/LATrim.r new file mode 100755 index 0000000..d43110b --- /dev/null +++ b/scripts/LATrim.r @@ -0,0 +1,156 @@ +#! /usr/bin/env Rscript + +suppressMessages(if (!require("tidyverse")) install.packages("tidyverse")) +suppressMessages(library("tidyverse")) +suppressMessages(if (!require("cowplot")) install.packages("cowplot")) +library(cowplot) + +args <- commandArgs(trailingOnly = TRUE) +# args[1] is the OrderMarkers2 output file +# args[2] is the centimorgan cutoff distance +# args[3] is the % of edge markers to scan +# args[4] is the name of the output folder + +lgfile <- read.delim( + args[1], + header = FALSE, + sep = "\t", + comment.char="#" +) %>% + mutate(Mpass = "PASS", Fpass = "PASS") + +## setup output file names ## +# split the filename by path +filename <- unlist(strsplit(args[1], "/")) +# pop out just the filename +filename <- filename[length(filename)] +lg <- (strsplit(lgfile$V1[1], "LG") %>% unlist())[2] %>% as.numeric() + +#========= output instantiation ========# + +dir.create(args[4], showWarnings = FALSE) +dir.create(paste0(args[4],"/plots"), showWarnings = FALSE) +dir.create(paste0(args[4],"/logs"), showWarnings = FALSE) +dir.create(paste0(args[4],"/QC_raw"), showWarnings = FALSE) +outfile_base <- paste(args[4], filename, sep = "/") +outfile_log_base <- paste(args[4], "logs", filename, sep = "/") +plotfile_base <- paste(args[4], "plots", filename, sep = "/") +plotfile <- paste(plotfile_base, "trim.pdf", sep = ".") +rawfile_base <- paste(args[4], "QC_raw", filename, sep = "/") + +##### Pruning the ends ##### +dist_thresh <- as.numeric(args[2]) +if(dist_thresh >= 1){ + dist_thresh <- dist_thresh * .01 +} +dist_thresh <- abs(max(lgfile$V5) - min(lgfile$V5)) * dist_thresh + +# if the percent threshold is given as an integer, convert it to a decimal +edge_length <- as.numeric(args[3]) +if(edge_length >= 1){ + edge_length <- edge_length * .01 +} + +n_markers <- length(lgfile$V1) + +front_edge <- round(n_markers * edge_length, digits = 0) +back_edge <- round(n_markers - front_edge, digits = 0) + +for (j in 5:6){ # iterate over male (5) and female (6) + # sort on column + if (j == 5){ + lgfile <- arrange(lgfile, V5) + } else { + lgfile <- arrange(lgfile, V6) + } + # trim beginning + for(a in front_edge:2){ #first n% of total markers starting from the front edge, going out + diff <- abs(lgfile[a,j]-lgfile[a-1,j]) # difference between two points + if( diff > dist_thresh ){ # is the difference between the two points > distance argument? + lgfile[(a-1):1, j+2] <- "FAIL" # mark that marker and all markers BEFORE it as FAIL + break() + } + } + # trim end + for(z in back_edge:(n_markers-1)){ #last n% total markers starting from the back edge going out + diff <- abs(lgfile[z+1,j]-lgfile[z,j]) # difference between two points + if( diff > dist_thresh ){ # is the difference between the two points > distance argument? + lgfile[(z+1):n_markers,j+2] <- "FAIL" # mark that marker and all markers AFTER it as FAIL + break() + } + } +} + +# create new table of markers passing QC +cleaned_markers <- (lgfile %>% filter(Mpass == "PASS" & Fpass == "PASS"))[,1:6] + +# isolate bad markers +removed_markers <- (lgfile %>% filter(Mpass == "FAIL" | Fpass == "FAIL"))[,1:6] + +# get simple counts +rm_male <- (lgfile %>% filter(Mpass == "FAIL" & Fpass == "PASS"))$V1 %>% length() +rm_female <- (lgfile %>% filter(Fpass == "FAIL" & Mpass == "PASS"))$V1 %>% length() +rm_both <- (lgfile %>% filter(Mpass == "FAIL" & Fpass == "FAIL"))$V1 %>% length() + +pdf(NULL) + +plot_male <- lgfile %>% arrange(V5) %>% + ggplot(aes(x=seq_along(V5), y = V5, color = Mpass)) + + geom_point() + + geom_vline(xintercept = front_edge, linetype = "dashed", size = 0.2) + + geom_vline(xintercept = back_edge, linetype = "dashed", size = 0.2) + + labs( + title = "", + subtitle = paste0("Male markers trimmed: ", rm_male), + caption = paste0(edge_length*100, "% of edge markers, ", args[2], "% cM threshold= ", dist_thresh), + x = "Marker Number", + y = "Position (cM)", + color = "Pass Filtering" + ) + +plot_female <- lgfile %>% arrange(V6) %>% + ggplot(aes(x=seq_along(V6), y = V6, color = Fpass)) + + geom_point() + + geom_vline(xintercept = front_edge, linetype = "dashed", size = 0.2) + + geom_vline(xintercept = back_edge, linetype = "dashed", size = 0.2) + + labs( + title = paste("Edge Cluster Trimming for LG:", lg), + subtitle = paste0("Female markers trimmed: ", rm_female), + caption = paste0("Markers failing both M+F: ", rm_both), + x = "Marker Number", + y = "Position (cM)", + color = "Pass Filtering", + legend.position = "none" + ) + +plot_grid(plot_female, plot_male, ncol = 2, nrow = 1) + +suppressMessages(ggsave(plotfile, width = 7, height = 3, units = "in")) + +write.table( + cleaned_markers, + file = paste(outfile_base, "trimmed", sep = "."), + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE, +) + +write.table( + lgfile, + file = paste(rawfile_base, "filtered.raw", sep = "."), + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE, +) + +write.table( + removed_markers, + file=paste(outfile_log_base, "removed", sep = "."), + append=FALSE, + sep = "\t", + quote = FALSE, + row.names = FALSE, + col.names = FALSE +) \ No newline at end of file