Simple variant calling pipeline

This example shows a simple variant calling pipeline using container technology.

/*
 * Pipeline parameters
 */

// Primary input
params.reads_bam = "${workflow.projectDir}/data/bam/*.bam"

// Accessory files
params.reference = "${workflow.projectDir}/data/ref/ref.fasta"
params.reference_index = "${workflow.projectDir}/data/ref/ref.fasta.fai"
params.reference_dict = "${workflow.projectDir}/data/ref/ref.dict"
params.calling_intervals = "${workflow.projectDir}/data/ref/intervals.bed"

// Base name for final output file
params.cohort_name = "family_trio"

/*
 * Generate BAM index file
 */
process SAMTOOLS_INDEX {

    container 'community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464'
    conda "bioconda::samtools=1.20"

    input:
        path input_bam

    output:
        tuple path(input_bam), path("${input_bam}.bai")

    """
    samtools index '$input_bam'

    """
}

/*
 * Call variants with GATK HapolotypeCaller in GVCF mode
 */
process GATK_HAPLOTYPECALLER {

    container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
    conda "bioconda::gatk4=4.5.0.0"

    input:
        tuple path(input_bam), path(input_bam_index)
        path ref_fasta
        path ref_index
        path ref_dict
        path interval_list

    output:
        path "${input_bam}.g.vcf"
        path "${input_bam}.g.vcf.idx"

    """
    gatk HaplotypeCaller \
        -R ${ref_fasta} \
        -I ${input_bam} \
        -O ${input_bam}.g.vcf \
        -L ${interval_list} \
        -ERC GVCF
    """
}

/*
 * Consolidate GVCFs and apply joint genotyping analysis
 */
process GATK_JOINTGENOTYPING {

    container "community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867"
    conda "bioconda::gatk4=4.5.0.0"

    input:
        path vcfs
        path idxs
        val cohort_name
        path ref_fasta
        path ref_index
        path ref_dict
        path interval_list

    output:
        path "${cohort_name}.joint.vcf"
        path "${cohort_name}.joint.vcf.idx"

    script:
    def input_vcfs = vcfs.collect { "-V ${it}" }.join(' ')
    """
    gatk GenomicsDBImport \
        ${input_vcfs} \
        --genomicsdb-workspace-path ${cohort_name}_gdb \
        -L ${interval_list}

    gatk GenotypeGVCFs \
        -R ${ref_fasta} \
        -V gendb://${cohort_name}_gdb \
        -O ${cohort_name}.joint.vcf \
        -L ${interval_list}
    """
}

workflow {

    // Create input channel from BAM files
    // We convert it to a tuple with the file name and the file path
    // See https://www.nextflow.io/docs/latest/script.html#getting-file-attributes
    bam_ch = Channel.fromPath(params.reads_bam, checkIfExists: true)

    // Create reference channels using the fromPath channel factory
    // The collect converts from a queue channel to a value channel
    // See https://www.nextflow.io/docs/latest/channel.html#channel-types for details
    ref_ch               = Channel.fromPath(params.reference, checkIfExists: true).collect()
    ref_index_ch         = Channel.fromPath(params.reference_index, checkIfExists: true).collect()
    ref_dict_ch          = Channel.fromPath(params.reference_dict, checkIfExists: true).collect()
    calling_intervals_ch = Channel.fromPath(params.calling_intervals, checkIfExists: true).collect()

    // Create index file for input BAM file
    SAMTOOLS_INDEX(bam_ch)

    // Call variants from the indexed BAM file
    GATK_HAPLOTYPECALLER(
        SAMTOOLS_INDEX.out,
        ref_ch,
        ref_index_ch,
        ref_dict_ch,
        calling_intervals_ch
    )

    all_vcfs = GATK_HAPLOTYPECALLER.out[0].collect()
    all_tbis = GATK_HAPLOTYPECALLER.out[1].collect()

    // Consolidate GVCFs and apply joint genotyping analysis
    GATK_JOINTGENOTYPING(
        all_vcfs,
        all_tbis,
        params.cohort_name,
        ref_ch,
        ref_index_ch,
        ref_dict_ch,
        calling_intervals_ch
    )
}

Synopsis

This example shows a basic variant calling Nextflow pipeline consisting of three processes. The SAMTOOLS_INDEX process creates index files for the input BAM files. The GATK_HAPLOTYPECALLER process takes the index bam files created by the SAMTOOLS_INDEX process and accessory files and creates variant call files. Finally, the GATK_JOINTGENOTYPING process consolidates the variant call files generated by GATK_HAPLOTYPECALLER and applies a joint genotyping analysis.

Try it

This pipeline is available on the seqeralabs/nf-hello-gatk GitHub repository.

An active internet connection and Docker are required for Nextflow to download the pipeline and the necessary Docker images to run the pipeline within containers. The data used by this pipeline is stored on the GitHub repository and will download automatically.

To try this pipeline:

  1. Follow the Nextflow installation guide to install Nextflow.

  2. Follow the Docker installation guide to install Docker.

  3. Launch the pipeline:

    nextflow run seqeralabs/nf-hello-gatk

NOTE: The nf-hello-gatk pipeline will use Docker to manage software dependencies by default.