// Example RNA-seq Nextflow script from video

#!/usr/bin/env nextflow
// Pipeline parameters — each can be overridden on the command line (e.g. --outdir results2)
// Reference FASTA used to build the salmon index
// NOTE(review): salmon index expects a *transcriptome* FASTA; the name "22.fa" suggests
// a single chromosome — confirm this file contains transcript sequences.
params.ref = "/hpc/faculty/azeezoe/rna_seq/refs/22.fa"
// Glob matching the FASTQ files; _R{1,2} pairs R1/R2 mates for fromFilePairs
params.reads = "/hpc/faculty/azeezoe/rna_seq/reads/*_R{1,2}.fq.gz"
// Directory that publishDir copies results into
params.outdir = "./results"
// true = paired-end data; false = single-end
params.PE = true
 
// Fastqc process, creates zip files used for multiqc
// FastQC quality-control process; the .zip reports are later consumed by MultiQC
process FASTQC {
    // Label each task with its sample id in the execution log
    tag "$sample"

    publishDir "${params.outdir}/fastqc", mode: 'copy'

    // One tuple per sample: sample id, extension(s), read file(s)
    input:
    tuple val(sample), val(extension), path(reads)

    // Only the .zip archives are declared (and hence published) —
    // MultiQC parses these; the .html reports are not captured
    output:
    path("*.zip")

    script:
    // -t: use the CPUs allocated to this task (consistent with salmon quant below)
    """
    fastqc -t ${task.cpus} ${reads}
    """
}
 
// FASTP_PE, fastp processing for paired end files. if params.PE (top) set to false then this process is not called
// FASTP_PE: adapter/quality trimming for paired-end reads.
// Per the workflow design, this process should only be invoked when params.PE is true.
process FASTP_PE {
    publishDir "${params.outdir}/fastp", mode: 'copy'

    // sample: sample id; extension: list of the two mates' file extensions
    // (e.g. ["fq.gz", "fq.gz"]); reads: the two mate files [R1, R2]
    input:
    tuple val(sample), val(extension), path(reads)

    output:
    // Trimmed read pair, re-emitted in the same (sample, extensions, files) shape as the
    // input channel so downstream processes can consume it interchangeably.
    // The glob {${extension[0]},${extension[1]}} matches the two output names built in
    // the script below; Nextflow sorts glob matches, so _1 precedes _2.
    tuple val(sample), val(extension), path("${sample}_{1,2}.trimmed.{${extension[0]},${extension[1]}}"), emit: reads
    // JSON report for MultiQC
    path("${sample}.fastp.json"), emit: json

    script:
    // Splitting the reads and extension lists into individual variables
    def read1 = reads[0]
    def read2 = reads[1]
    def extension1 = extension[0]
    def extension2 = extension[1]

    // -i/-I: input mates; -o/-O: trimmed output mates; -j: JSON report
    """
    fastp -i ${read1} -I ${read2} -o ${sample}_1.trimmed.${extension1} -O ${sample}_2.trimmed.${extension2} -j ${sample}.fastp.json
    """
}
 
// Multiqc allows for easy visualization of fastp and fastqc reports
// MultiQC aggregates the fastp and FastQC reports into a single HTML summary
process MULTIQC{
    publishDir "${params.outdir}/multiqc", mode: 'copy'

    input:
    // All fastp .json reports and all FastQC .zip archives, staged together
    // into this task's work directory
    path(fastp_out)
    path(fastqc_out)

    output:
    // Everything MultiQC writes (the HTML report plus its data directory)
    path('*')

    script:
    // "multiqc ." scans the task work directory, which contains exactly the
    // staged inputs above. Because a process normally receives inputs one
    // item at a time, the caller must use ".collect()" (see the workflow
    // block) so every report is staged in a single task invocation.
    """
    multiqc .
    """

}
 
// Indexing our transcriptome
// Build the salmon index from the reference transcriptome.
// Not published: the index is an intermediate used only by SALMON_QUANT_PE.
process SALMON_IDX {
    input: 
    path(transcriptome)

    output:
    // Index directory consumed by salmon quant
    path("salmon_idx")

    script:
    // --threads: use the CPUs allocated to this task (consistent with SALMON_QUANT_PE)
    """
    salmon index --threads ${task.cpus} -t ${transcriptome} -i salmon_idx
    """
}
 
// Salmon quantification step, requires our trimmed reads and the salmon index
// Salmon quantification for paired-end reads; needs the trimmed reads and the salmon index
process SALMON_QUANT_PE {
    publishDir "${params.outdir}/salmon", mode: 'copy'

    input:
    // Index directory produced by SALMON_IDX
    path(salmon_idx)
    // Trimmed read pair from FASTP_PE (sample id, extensions, [R1, R2])
    tuple val(sample), val(extension), path(reads)

    output:
    // The whole per-sample quant directory (quant.sf, logs, aux info)
    path("*")

    script:
    def read1 = reads[0]
    def read2 = reads[1]

    // -l A: let salmon auto-detect the library type; -1/-2: the two mates
    """
    salmon quant --threads ${task.cpus} -i ${salmon_idx} -l A -1 ${read1} -2 ${read2} -o ${sample}.quant
    """
}
 
 
workflow {
    // Build the reads channel. Handles .fastq, .fq, .fastq.gz, .fq.gz automatically:
    // everything before the first "." is treated as the sample id, everything after
    // (including a possible ".gz") as the extension.
    if ( params.PE ) {
        reads_ch = channel.fromFilePairs( params.reads, checkIfExists: true )
        .map { sample_id, files ->
            def extensions = files.collect { file ->
                def file_name = file.name
                def idx = file_name.indexOf(".")
                file_name.substring(idx + 1)
            }
            tuple(sample_id, extensions, files)
        }
    }
    else {
        reads_ch = channel.fromPath( params.reads, checkIfExists: true )
            .map { file ->
                def file_name = file.name
                def idx = file_name.indexOf(".")
                tuple( file_name.substring(0, idx), file_name.substring(idx + 1), file )
            }
    }

    // Viewing what the reads channel looks like
    reads_ch.view()

    // QC runs for both single- and paired-end data
    fastqc_out = FASTQC(reads_ch)

    // BUG FIX: the original called FASTP_PE and SALMON_QUANT_PE unconditionally,
    // so running with --PE false crashed inside fastp (reads[1] is null) even
    // though FASTP_PE is documented as PE-only. Guard the PE-only section here.
    if ( params.PE ) {
        fastp_out = FASTP_PE(reads_ch)

        // MultiQC needs every report at once, hence .collect() on both inputs
        MULTIQC(fastp_out.json.collect(), fastqc_out.collect())

        salmon_idx = SALMON_IDX(params.ref)
        // Only the trimmed reads (emit: reads) are fed to salmon quant
        SALMON_QUANT_PE(salmon_idx, fastp_out.reads)
    }
    else {
        // Trimming and quantification are paired-end only; in single-end mode
        // summarise the FastQC reports alone (empty list for the fastp input).
        // NOTE(review): add SE fastp/salmon processes to quantify single-end data.
        MULTIQC(channel.value([]), fastqc_out.collect())
    }
}