Example MetaScope Script
Aubrey Odom
Program in Bioinformatics, Boston University, Boston, MAaodom@bu.edu
January 16, 2025
Source:vignettes/vignette_docs/Example_Script.Rmd
Example_Script.Rmd
Introduction
The following is an example script utilizing the Bowtie 2 sub-workflow. The demultiplex step is omitted.
Creation of indices
nThreads <- 8
loc <- "/restricted/projectnb/pathoscope/reflib/2023_refseq_bowtie"
# Download taxonomy database
# This should be done ONCE before running any samples
# The same downloaded database can be used for all samples
download_accessions(file.path(loc, "indices"),
NCBI_accessions_database = TRUE,
NCBI_accessions_name = "accessionTaxa.sql",
silva_taxonomy_database = TRUE,
silva_taxonomy_name = "all_silva_headers.rds",
blast_16S_database = TRUE,
blast_16S_name = "16S_ribosomal_RNA"))
targets <- tolower(c("bacteria"))
filters <- c('homo sapiens', 'mus musculus')
inVec <- c(targets, filters)
# make directories
mk_all_dir <- function(ind) {
ind_ <- stringr::str_replace_all(ind, c(" " = "_"))
if (!dir.exists(file.path(loc, ind_))) dir.create(file.path(loc, ind_))
}
sapply(inVec, mk_all_dir)
# Download reference and representative genomes
sapply(inVec,
function(x) download_refseq(x, reference = TRUE,
representative = TRUE,
compress = TRUE,
patho_out = FALSE,
# Same directory as created above
out_dir = file.path(
loc, stringr::str_replace_all(x, c(" " = "_"))),
accession_path = file.path(loc, "indices", "accessionTaxa.sql")))
# Make indices
for (ind in inVec) {
ind_ <- stringr::str_replace_all(ind, c(" " = "_"))
# Create the reference library index files in the destination directory
if (ind == "fungi") {
mk_bowtie_index(ref_dir = file.path(loc, ind_), lib_dir = file.path(loc, "all_indices"),
lib_name = ind_,
bowtie2_build_options = "--large-index",
threads = nThreads, overwrite = TRUE)
} else {
mk_bowtie_index(ref_dir = file.path(loc, ind_), lib_dir = file.path(loc, "all_indices"),
lib_name = paste0(ind_, "rep"), threads = nThreads, overwrite = TRUE)
}
}
Command-line pre-processing (BASH script)
Note, the variable SGE_TASK_ID
refers to an index
integer set via a parallel processing job submission system for a
computing resource.
threads=8
dataDir=/restricted/projectnb/pathoscope/work/aubrey/metascope_benchmarking/miossec_data
dataDir=/restricted/projectnb/pathoscope/work/aubrey/metascope_benchmarking/miossec_data/species_10/abundance_100000/dominance_1
### MAKE WORKING DIR
workingDir=${SGE_TASK_ID}_tmp_miossec
rm -rf $TMPDIR/$workingDir
mkdir $TMPDIR/$workingDir
# Specify input files
input_line=$(sed "${SGE_TASK_ID}q;d" /restricted/projectnb/pathoscope/work/aubrey/metascope_benchmarking/miossec_data/dat_manifest.txt)
IFS=$'\t'
read -r sampleName readPath1 readPath2 <<< "$input_line"
### TRIM the reads
# CHANGE CROP AND HEADCROP DEPENDING ON MultiQC ANALYSIS
#java -jar ~/pathoscope/code/other/Trimmomatic-0.36/trimmomatic-0.36.jar PE \
# -phred33 -threads $threads $reads1 $reads2 \
# $TMPDIR/$workingDir/reads1.paired.fastq.gz \
# $TMPDIR/$workingDir/reads1.unpaired.fastq.gz \
# $TMPDIR/$workingDir/reads2.paired.fastq.gz \
# $TMPDIR/$workingDir/reads2.unpaired.fastq.gz \
# SLIDINGWINDOW:4:20 LEADING:3 TRAILING:3 MINLEN:36 \
# CROP:225 HEADCROP:30
### Run MetaScope
indexDir="/restricted/projectnb/pathoscope/reflib/2023_refseq_bowtie/all_indices"
expTag=$sampleName
outDir="/restricted/projectnb/pathoscope/work/aubrey/metascope_benchmarking/refseq2023_output"
tmpDir="$TMPDIR/$workingDir/"
# Choose filter and targets to download
target="bacteriarep,fungi,viruses"
filter="human_T2T,mus_musculus"
# This is where samtools and R are loaded if necessary
module load samtools
module load R
# Change "run_MetaScope.R" to name of R script saved elsewhere
Rscript --vanilla --max-ppsize=500000 run_MetaScope.R \
${readPath1} ${readPath2} ${indexDir} ${expTag} ${outDir} ${tmpDir} ${threads} \
${target} ${filter}
rm -rf $TMPDIR/$workingDir
R script
# Take in arguments from bash script
args <- commandArgs(trailingOnly = TRUE)
readPath1 <- args[1]
readPath2 <- args[2]
indexDir <- args[3]
expTag <- args[4]
outDir <- args[5]
tmpDir <- args[6]
threads <- args[7]
targets <- stringr::str_split(args[8], ",")[[1]]
filters <- stringr::str_split(args[9], ",")[[1]]
# Time this!
now <- Sys.time()
# Load MetaScope
library(MetaScope)
# Align to targets
do_this <- function(x) stringr::str_replace_all(x, c(" " = "_"))
targets_ <- do_this(targets)
filters_ <- do_this(filters)
# Identify bt2 params
data(bt2_regular_params)
bt2_params <- bt2_regular_params
target_map <- align_target_bowtie(read1 = readPath1,
read2 = readPath2,
lib_dir = indexDir,
libs = targets_,
align_dir = tmpDir,
align_file = expTag,
overwrite = TRUE,
threads = threads,
bowtie2_options = paste(bt2_params, "-f"),
quiet = FALSE)
# Align to filters
output <- paste(paste0(tmpDir, expTag), "filtered", sep = ".")
final_map <- filter_host_bowtie(reads_bam = target_map,
lib_dir = indexDir,
libs = filters_,
make_bam = FALSE,
output = output,
threads = threads,
overwrite = TRUE,
quiet = FALSE,
bowtie2_options = bt2_params)
# MetaScope ID
metascope_id(final_map, input_type = "csv.gz", aligner = "bowtie2",
accession_path = file.path(loc, "indices", "accessionTaxa.sql"),
num_species_plot = 15,
quiet = FALSE,
out_dir = outDir)
message(capture.output(Sys.time() - now))