From 7a2f4287ca73c5978a5b3770a4685534d5214375 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Mon, 18 Jul 2022 13:34:00 +0200
Subject: [PATCH 01/13] Adding possibility to give flowcell information on
 samplesheet

This information is not yet used downstream.
---
 brouillon.nf | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++
 main.nf      |  51 ++++++++++++++-----------
 2 files changed, 135 insertions(+), 21 deletions(-)
 create mode 100644 brouillon.nf

diff --git a/brouillon.nf b/brouillon.nf
new file mode 100644
index 0000000..4b5e080
--- /dev/null
+++ b/brouillon.nf
@@ -0,0 +1,105 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+def getAndCheckHeader() {
+    File file = new File(params.input)
+    assert file.exists() : "${params.input} file not found"
+    def line="";
+    file.withReader { reader ->
+        line = reader.readLine()
+    }
+    def tab = line.split(/,/)
+    def list = ['sample','flowcell','fastq_1','fastq_2', 'assembly']
+    for (i in tab) {
+        if (!list.contains(i)) {
+            exit 1, 'Error 1 while check samplesheet format please enter sample[,flowcell],fastq_1[,fastq_2][,assembly] with header line'
+        }
+    }
+    if (!tab.contains("sample") || !tab.contains("fastq_1")){
+        exit 1, 'Error 1 while check samplesheet format please enter at least sample,fastq_1 with header line'
+    }
+    return tab
+}
+
+
+def returnFile(it) {
+    if (it == null) {
+        return null
+    } else {
+        if (!file(it).exists()) exit 1, "Missing file in CSV file: ${it}, see --help for more information"
+    }
+    return file(it)
+}
+
+def hasExtension(it, extension) {
+    it.toString().toLowerCase().endsWith(extension.toLowerCase())
+}
+
+
+workflow {
+    if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
+
+}
+
+    header = getAndCheckHeader()
+    Channel.from(file(params.input)
+            .splitCsv ( header:true, sep:',' ) )
+        .map { row ->
+            def sample = row.sample
+            def paired = false
+            if (row.fastq_2 != null) {
+                paired = true
+            }
+            if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) {
+                exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint."
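// [Editor's sketch — not part of the patch] The header validation in
// getAndCheckHeader() above reduces to two set checks. A standalone, runnable
// Groovy equivalent, with a hypothetical header value:
def allowed  = ['sample', 'flowcell', 'fastq_1', 'fastq_2', 'assembly']
def required = ['sample', 'fastq_1']
def header   = 'sample,flowcell,fastq_1,fastq_2'.split(/,/)
assert header.every { allowed.contains(it) }     // reject any unknown column
assert required.every { header.contains(it) }    // sample and fastq_1 are mandatory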
+ } + ["sample":row.sample, + "flowcell":row.flowcell, + "fastq_1":returnFile(row.fastq_1), + "fastq_2":returnFile(row.fastq_2), + "paired":paired, + "assembly":returnFile(row.assembly) ] + } + .set { ch_inputs } + + ch_inputs.view() + +ch_inputs.map { item -> if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2,[item.sample, item.flowcell, item.paired, item.assembly]]}}.set { ch_test } + + ch_inputs + .map { item -> + if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2]} + else {[item.sample, item.fastq_1, item.fastq_2 ]}} + .set{ch_reads_sampleIDunique} + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell,item.sample, item.flowcell, item.paired]} + else {[item.sample, item.paired]}} + .set{ch_meta} + + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell, item.assembly ] } + else {[item.sample, item.assembly ]}} + .set { ch_assembly } + +ch_reads_sampleIDunique.view() +ch_meta.view() +ch_assembly.view() + + + // ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true) + // ch_inputs + // .map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] } + // .set { ch_reads } + // ch_inputs + // .map { item -> [ item.sample, item.paired ] } + // .set { ch_paired } + // ch_inputs + // .map { item -> [ item.sample, item.assembly ] } + // .set { ch_assembly } + //ch_reads.view{ it -> "${it}" } + //ch_paired.view{ it -> "${it}" } \ No newline at end of file diff --git a/main.nf b/main.nf index 8c70e2d..3c82a1d 100644 --- a/main.nf +++ b/main.nf @@ -109,22 +109,19 @@ def getAndCheckHeader() { line = reader.readLine() } def tab = line.split(/,/) - if (! ((tab[0] == "sample") && (tab[1] == "fastq_1") )) { - exit 1, 'Error 1 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' - } - if (tab.size() == 3 ){ - if (!((tab[2] == "fastq_2") || (tab[2] == "assembly"))) { - exit 1, 'Error 2 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' - } - } - if (tab.size() == 4) { - if ( ! ((tab[2] == "fastq_2") && (tab[3] == "assembly"))) { - exit 1, 'Error 3 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' + def list = ['sample','flowcell','fastq_1','fastq_2', 'assembly'] + for (i in tab) { + if (!list.contains(i)) { + exit 1, 'Error 1 while check samplesheet format please enter sample[,flowcell],fastq_1[,fastq_2][,assembly] with header line' } + } + if (!tab.contains("sample") || !tab.contains("fastq_1")){ + exit 1, 'Error 1 while check samplesheet format please enter at least sample,fastq_1 with header line' } return tab } + def returnFile(it) { if (it == null) { return null @@ -192,6 +189,7 @@ workflow { exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." 
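// [Editor's sketch — not part of the patch] What splitCsv(header:true) hands to
// the map closure above is one map per row; a pure-Groovy illustration with
// hypothetical values:
def head = ['sample', 'fastq_1', 'fastq_2']
def line = 's1,s1_R1.fastq.gz,s1_R2.fastq.gz'
def row  = [head, line.split(/,/) as List].transpose().collectEntries()
assert row.sample == 's1'
assert row.fastq_2 != null      // the same test used above to set 'paired'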
} ["sample":row.sample, + "flowcell":row.flowcell, "fastq_1":returnFile(row.fastq_1), "fastq_2":returnFile(row.fastq_2), "paired":paired, @@ -200,6 +198,9 @@ workflow { .set { ch_inputs } has_assembly = (file(params.input).splitCsv ( header:true, sep:',' ).assembly[0] != null) + + ch_inputs.map { item -> if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2,[item.sample, item.flowcell, item.paired, item.assembly]]}}.set { ch_test } + //////////// // End check samplesheet //////////// @@ -244,16 +245,24 @@ workflow { ch_v_eggnogmapper = Channel.empty() if ( params.type.toUpperCase() == "SR" ) { - ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true) - ch_inputs - .map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] } - .set { ch_reads } - ch_inputs - .map { item -> [ item.sample, item.paired ] } - .set { ch_paired } - ch_inputs - .map { item -> [ item.sample, item.assembly ] } - .set { ch_assembly } + ch_inputs + .map { item -> + if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2]} + else {[item.sample, item.fastq_1, item.fastq_2 ]}} + .set{ch_reads} + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell,item.sample, item.flowcell, item.paired]} + else {[item.sample, item.paired]}} + .set{ch_paired} + + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell, item.assembly ] } + else {[item.sample, item.assembly ]}} + .set { ch_assembly } //ch_reads.view{ it -> "${it}" } //ch_paired.view{ it -> "${it}" } -- GitLab From b7c7af2f8e3d53d19216edbbf93a400730defe94 Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Mon, 18 Jul 2022 13:34:00 +0200 Subject: [PATCH 02/13] Adding possibility to give flowcell information on samplesheet Not yet dealing with these informations --- brouillon.nf | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++ main.nf | 51 ++++++++++++++----------- 2 files changed, 135 insertions(+), 21 deletions(-) create mode 100644 brouillon.nf diff --git a/brouillon.nf b/brouillon.nf new file mode 100644 index 0000000..4b5e080 --- /dev/null +++ b/brouillon.nf @@ -0,0 +1,105 @@ +#!/usr/bin/env nextflow + +nextflow.enable.dsl = 2 + +def getAndCheckHeader() { + File file = new File(params.input) + assert file.exists() : "${params.input} file not found" + def line=""; + file.withReader { reader -> + line = reader.readLine() + } + def tab = line.split(/,/) + def list = ['sample','flowcell','fastq_1','fastq_2', 'assembly'] + for (i in tab) { + if (!list.contains(i)) { + exit 1, 'Error 1 while check samplesheet format please enter sample[,flowcell],fastq_1[,fastq_2][,assembly] with header line' + } + } + if (!tab.contains("sample") || !tab.contains("fastq_1")){ + exit 1, 'Error 1 while check samplesheet format please enter at least sample,fastq_1 with header line' + } + return tab +} + + +def returnFile(it) { + if (it == null) { + return null + } else { + if (!file(it).exists()) exit 1, "Missing file in CSV file: ${it}, see --help for more information" + } + return file(it) +} + +def hasExtension(it, extension) { + it.toString().toLowerCase().endsWith(extension.toLowerCase()) +} + + +workflow { + if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' 
} + +} + + header = getAndCheckHeader() + Channel.from(file(params.input) + .splitCsv ( header:true, sep:',' ) ) + .map { row -> + def sample = row.sample + def paired = false + if (row.fastq_2 != null) { + paired = true + } + if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) { + exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." + } + ["sample":row.sample, + "flowcell":row.flowcell, + "fastq_1":returnFile(row.fastq_1), + "fastq_2":returnFile(row.fastq_2), + "paired":paired, + "assembly":returnFile(row.assembly) ] + } + .set { ch_inputs } + + ch_inputs.view() + +ch_inputs.map { item -> if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2,[item.sample, item.flowcell, item.paired, item.assembly]]}}.set { ch_test } + + ch_inputs + .map { item -> + if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2]} + else {[item.sample, item.fastq_1, item.fastq_2 ]}} + .set{ch_reads_sampleIDunique} + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell,item.sample, item.flowcell, item.paired]} + else {[item.sample, item.paired]}} + .set{ch_meta} + + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell, item.assembly ] } + else {[item.sample, item.assembly ]}} + .set { ch_assembly } + +ch_reads_sampleIDunique.view() +ch_meta.view() +ch_assembly.view() + + + // ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true) + // ch_inputs + // .map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] } + // .set { ch_reads } + // ch_inputs + // .map { item -> [ item.sample, item.paired ] } + // .set { ch_paired } + // ch_inputs + // .map { item -> [ item.sample, item.assembly ] } + // .set { ch_assembly } + //ch_reads.view{ it -> "${it}" } + //ch_paired.view{ it -> "${it}" } \ No newline at end of file diff --git a/main.nf b/main.nf index 3974a9e..e35609f 100644 --- a/main.nf +++ b/main.nf @@ -109,22 +109,19 @@ def getAndCheckHeader() { line = reader.readLine() } def tab = line.split(/,/) - if (! ((tab[0] == "sample") && (tab[1] == "fastq_1") )) { - exit 1, 'Error 1 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' - } - if (tab.size() == 3 ){ - if (!((tab[2] == "fastq_2") || (tab[2] == "assembly"))) { - exit 1, 'Error 2 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' - } - } - if (tab.size() == 4) { - if ( ! ((tab[2] == "fastq_2") && (tab[3] == "assembly"))) { - exit 1, 'Error 3 while check samplesheet format please enter sample,fastq_1[,fastq_2][,assembly] with header line' + def list = ['sample','flowcell','fastq_1','fastq_2', 'assembly'] + for (i in tab) { + if (!list.contains(i)) { + exit 1, 'Error 1 while check samplesheet format please enter sample[,flowcell],fastq_1[,fastq_2][,assembly] with header line' } + } + if (!tab.contains("sample") || !tab.contains("fastq_1")){ + exit 1, 'Error 1 while check samplesheet format please enter at least sample,fastq_1 with header line' } return tab } + def returnFile(it) { if (it == null) { return null @@ -192,6 +189,7 @@ workflow { exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." 
} ["sample":row.sample, + "flowcell":row.flowcell, "fastq_1":returnFile(row.fastq_1), "fastq_2":returnFile(row.fastq_2), "paired":paired, @@ -200,6 +198,9 @@ workflow { .set { ch_inputs } has_assembly = (file(params.input).splitCsv ( header:true, sep:',' ).assembly[0] != null) + + ch_inputs.map { item -> if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2,[item.sample, item.flowcell, item.paired, item.assembly]]}}.set { ch_test } + //////////// // End check samplesheet //////////// @@ -244,16 +245,24 @@ workflow { ch_v_eggnogmapper = Channel.empty() if ( params.type.toUpperCase() == "SR" ) { - ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true) - ch_inputs - .map { item -> [ item.sample, item.fastq_1, item.fastq_2 ] } - .set { ch_reads } - ch_inputs - .map { item -> [ item.sample, item.paired ] } - .set { ch_paired } - ch_inputs - .map { item -> [ item.sample, item.assembly ] } - .set { ch_assembly } + ch_inputs + .map { item -> + if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2]} + else {[item.sample, item.fastq_1, item.fastq_2 ]}} + .set{ch_reads} + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell,item.sample, item.flowcell, item.paired]} + else {[item.sample, item.paired]}} + .set{ch_paired} + + + ch_inputs + .map { item -> + if (item.flowcell!=null) {[item.sample+"_"+item.flowcell, item.assembly ] } + else {[item.sample, item.assembly ]}} + .set { ch_assembly } //ch_reads.view{ it -> "${it}" } //ch_paired.view{ it -> "${it}" } -- GitLab From 1b28dd85feb7dd8464982593015e30adb8b6830d Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Wed, 20 Jul 2022 14:18:13 +0200 Subject: [PATCH 03/13] use of metadata in read channels --- main.nf | 81 ++++++++++++++++++++----------------- modules/cutadapt.nf | 16 ++++---- modules/fastqc.nf | 59 +++++++-------------------- subworkflows/01_clean_qc.nf | 6 +-- subworkflows/short_reads.nf | 6 +-- 5 files changed, 69 insertions(+), 99 deletions(-) diff --git a/main.nf b/main.nf index e35609f..18f42cb 100644 --- a/main.nf +++ b/main.nf @@ -109,7 +109,7 @@ def getAndCheckHeader() { line = reader.readLine() } def tab = line.split(/,/) - def list = ['sample','flowcell','fastq_1','fastq_2', 'assembly'] + def list = ['sample','flowcell','group','fastq_1','fastq_2', 'assembly'] for (i in tab) { if (!list.contains(i)) { exit 1, 'Error 1 while check samplesheet format please enter sample[,flowcell],fastq_1[,fastq_2][,assembly] with header line' @@ -141,11 +141,6 @@ workflow { // Check mandatory parameters - //////////// - // Start check samplesheet - //////////// - if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } - if (params.type.toUpperCase() == 'SR') { if ( assembly_tool == null || assembly_tool == ''){ @@ -176,31 +171,66 @@ workflow { if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) && !(params.diamond_bank) ) { exit 1, "You must specify --stop_at_structural_annot or specify a diamond bank with --diamond_bank" } + + + //////////// + // Start check samplesheet + //////////// + if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' 
} + header = getAndCheckHeader() Channel.from(file(params.input) .splitCsv ( header:true, sep:',' ) ) .map { row -> def sample = row.sample - def paired = false - if (row.fastq_2 != null) { - paired = true - } if (hasExtension(row.fastq_1, "fastq") || hasExtension(row.fastq_1, "fq") || hasExtension(row.fastq_2, "fastq") || hasExtension(row.fastq_2, "fq")) { exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." } ["sample":row.sample, "flowcell":row.flowcell, + "group":row.group, "fastq_1":returnFile(row.fastq_1), "fastq_2":returnFile(row.fastq_2), - "paired":paired, "assembly":returnFile(row.assembly) ] } .set { ch_inputs } + ch_inputs + .map { item -> + def meta = [:] + meta.id = item.sample + if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell} + if (item.group !=null) {meta.id = meta.id+"_"+item.group} + meta.sample = item.sample + meta.flowcell = item.flowcell + meta.group = item.group + meta.assembly = item.assembly!=null + meta.type = params.type.toUpperCase() + if (meta.type=="SR"){ + return [meta,[item.fastq_1,item.fastq_2]] + } + else if (meta.type=="HIFI"){ + return [meta,[item.fastq_1]] + } + } + .set{ch_reads} - has_assembly = (file(params.input).splitCsv ( header:true, sep:',' ).assembly[0] != null) - - ch_inputs.map { item -> if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2,[item.sample, item.flowcell, item.paired, item.assembly]]}}.set { ch_test } + ch_inputs + .map { item -> + def meta = [:] + meta.id = item.sample + if (item.flowcell!=null) { meta.id = meta.id+"_"+item.flowcell} + if (item.group !=null) {meta.id = meta.id+"_"+item.group} + meta.sample = item.sample + meta.flowcell = item.flowcell + meta.group = item.group + meta.assembly = item.assembly!=null + meta.type = params.type.toUpperCase() + if (meta.type=="SR"){ + return [meta,item.assembly] + } + } + .set { ch_assembly } //////////// // End check samplesheet //////////// @@ -245,35 +275,12 @@ workflow { ch_v_eggnogmapper = Channel.empty() if ( params.type.toUpperCase() == "SR" ) { - ch_inputs - .map { item -> - if (item.flowcell!=null) { [item.sample+"_"+item.flowcell,item.fastq_1, item.fastq_2]} - else {[item.sample, item.fastq_1, item.fastq_2 ]}} - .set{ch_reads} - - ch_inputs - .map { item -> - if (item.flowcell!=null) {[item.sample+"_"+item.flowcell,item.sample, item.flowcell, item.paired]} - else {[item.sample, item.paired]}} - .set{ch_paired} - - - ch_inputs - .map { item -> - if (item.flowcell!=null) {[item.sample+"_"+item.flowcell, item.assembly ] } - else {[item.sample, item.assembly ]}} - .set { ch_assembly } - //ch_reads.view{ it -> "${it}" } - //ch_paired.view{ it -> "${it}" } - SR ( ch_reads, - ch_paired, ch_assembly, ch_host_fasta, ch_host_index, ch_kaiju_db, - has_assembly, assembly_tool ) diff --git a/modules/cutadapt.nf b/modules/cutadapt.nf index 82e6e54..6d5bda0 100644 --- a/modules/cutadapt.nf +++ b/modules/cutadapt.nf @@ -1,26 +1,26 @@ process CUTADAPT { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz' publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/logs", mode: 'copy', pattern: '*_cutadapt.log' input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) val adapter1 val adapter2 output: - tuple val(sampleId), path("*${sampleId}*_R1.fastq.gz"), path("*${sampleId}*_R2.fastq.gz"), emit: reads - path "${sampleId}_cutadapt.log", emit: report + tuple val(meta), 
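// [Editor's sketch — not part of the patch] The id composition used for the new
// meta map above, shown standalone (sample/flowcell/group values hypothetical):
def item = [sample: 's1', flowcell: 'FC1', group: null]
def meta = [:]
meta.id = item.sample
if (item.flowcell != null) { meta.id = meta.id + "_" + item.flowcell }
if (item.group    != null) { meta.id = meta.id + "_" + item.group }
assert meta.id == 's1_FC1'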
path("*${meta.id}*.fastq.gz"), emit: reads + path "${meta.id}_cutadapt.log", emit: report script: if (!params.use_sickle & params.skip_host_filter) { // output are final cleaned paths - output_paths = "-o cleaned_${sampleId}_R1.fastq.gz -p cleaned_${sampleId}_R2.fastq.gz" + output_paths = "-o cleaned_${meta.id}_R1.fastq.gz -p cleaned_${meta.id}_R2.fastq.gz" } else { // tempory paths not saved in publish dir - output_paths = "-o ${sampleId}_cutadapt_R1.fastq.gz -p ${sampleId}_cutadapt_R2.fastq.gz" + output_paths = "-o ${meta.id}_cutadapt_R1.fastq.gz -p ${meta.id}_cutadapt_R2.fastq.gz" } if (!params.use_sickle){ quality_trim = "-q 20,20" @@ -29,6 +29,6 @@ process CUTADAPT { } """ cutadapt -a $adapter1 -A $adapter2 $output_paths -m 36 --trim-n -q 20,20 --max-n 0 \ - --cores=${task.cpus} ${read1} ${read2} > ${sampleId}_cutadapt.log + --cores=${task.cpus} ${reads[0]} ${reads[1]} > ${meta.id}_cutadapt.log """ -} \ No newline at end of file +} diff --git a/modules/fastqc.nf b/modules/fastqc.nf index 2e06c23..f215148 100644 --- a/modules/fastqc.nf +++ b/modules/fastqc.nf @@ -1,55 +1,26 @@ -process FASTQC_RAW { - tag "${sampleId}" +process FASTQC { + tag "${meta.id}" label 'FASTQC' - publishDir "${params.outdir}/01_clean_qc/01_2_qc/fastqc_raw", mode: 'copy' + publishDir "${params.outdir}/01_clean_qc/01_2_qc/", mode: 'copy' input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) output: - path "${sampleId}/*.zip", emit: zip - path "${sampleId}/*.html", emit: html + path "$outdir/${meta.id}/*.zip", emit: zip + path "$outdir/${meta.id}/*.html", emit: html script: + if (reads[0]==~/cleaned.*/){ outdir="fastqc_cleaned" } else { outdir="fastqc_raw" } + if (meta.type=="SR"){ + fastq = "${reads[0]} ${reads[1]}" + option = "--nogroup" + } else if (meta.type=="HIFI"){ + fastq = "${reads}" + option = "" } """ - mkdir ${sampleId} ; fastqc --nogroup --quiet -o ${sampleId} --threads ${task.cpus} ${read1} ${read2} - """ -} - -process FASTQC_CLEANED { - tag "${sampleId}" - label 'FASTQC' - publishDir "${params.outdir}/01_clean_qc/01_2_qc/fastqc_cleaned", mode: 'copy' - - input: - tuple val(sampleId), path(read1), path(read2) - - output: - path "${sampleId}/*.zip", emit: zip - path "${sampleId}/*.html", emit: html - - script: - """ - mkdir ${sampleId}; fastqc --nogroup --quiet -o ${sampleId} --threads ${task.cpus} ${read1} ${read2} - """ -} - -process FASTQC_HIFI { - tag "${sampleId}" - label 'FASTQC' - publishDir "${params.outdir}/${publishdir}", mode: 'copy' - - input: - tuple val(sampleId), path(read), val(publishdir) - - output: - path "${sampleId}/*.zip", emit: zip - path "${sampleId}/*.html", emit: html - - script: - """ - echo ${params.outdir}/${publishdir} - mkdir ${sampleId}; fastqc --quiet -o ${sampleId} --threads ${task.cpus} ${read} + mkdir -p $outdir/${meta.id} + fastqc $option --quiet -o $outdir/${meta.id} --threads ${task.cpus} $fastq """ } diff --git a/subworkflows/01_clean_qc.nf b/subworkflows/01_clean_qc.nf index 764987f..09c614f 100644 --- a/subworkflows/01_clean_qc.nf +++ b/subworkflows/01_clean_qc.nf @@ -1,13 +1,12 @@ include { CUTADAPT } from '../modules/cutadapt' include { SICKLE } from '../modules/sickle' include { HOST_FILTER; HOST_FILTER_HIFI } from '../modules/host_filter' -include { FASTQC_RAW; FASTQC_CLEANED; FASTQC_HIFI; FASTQC_HIFI as FASTQC_HIFI_RAW } from '../modules/fastqc' +include { FASTQC as FASTQC_RAW; FASTQC as FASTQC_CLEANED; FASTQC as FASTQC_HIFI; FASTQC as FASTQC_HIFI_RAW } from '../modules/fastqc' include { KAIJU_AND_MERGE; 
KAIJU_AND_MERGE_FOR_HIFI } from '../modules/kaiju' workflow STEP_01_CLEAN_QC { take: raw_reads - paired host_fasta host_index kaiju_db @@ -25,8 +24,6 @@ workflow STEP_01_CLEAN_QC { ch_cutadapt_report = CUTADAPT.out.report if (params.use_sickle) { - ch_sickle_reads = ch_intermediate_reads.join(paired) - // ch_sickle_reads.view{ it -> "${it}" } SICKLE ( ch_sickle_reads ) @@ -76,7 +73,6 @@ workflow STEP_01_CLEAN_QC { emit: preprocessed_reads = ch_preprocessed_reads - cutadapt_report = ch_cutadapt_report sickle_report = ch_sickle_report before_filter_report = ch_before_filter_report diff --git a/subworkflows/short_reads.nf b/subworkflows/short_reads.nf index b001174..3cc3920 100644 --- a/subworkflows/short_reads.nf +++ b/subworkflows/short_reads.nf @@ -3,16 +3,13 @@ include { STEP_02_ASSEMBLY as S02_ASSEMBLY } from './02_assembly' include { ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' include { QUAST as S04_FILTERED_QUAST } from '../modules/metaquast' - workflow SHORT_READS { take: reads - paired assembly host_fasta host_index kaiju_db - has_assembly assembly_tool main: @@ -34,7 +31,6 @@ workflow SHORT_READS { if ( !params.skip_clean ) { S01_CLEAN_QC ( reads, - paired, host_fasta, host_index, kaiju_db @@ -52,7 +48,7 @@ workflow SHORT_READS { ch_dedup = Channel.empty() if ( !params.stop_at_clean ) { - S02_ASSEMBLY ( ch_preprocessed_reads, assembly, has_assembly, assembly_tool ) + S02_ASSEMBLY ( ch_preprocessed_reads, assembly, assembly_tool ) ch_assembly = S02_ASSEMBLY.out.assembly ch_dedup = S02_ASSEMBLY.out.dedup ch_idxstats = S02_ASSEMBLY.out.idxstats -- GitLab From 4b9d9dde6ad053a7d5d4b74d41d11111091909a1 Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Wed, 20 Jul 2022 14:31:40 +0200 Subject: [PATCH 04/13] Merge fastq to manage flowcell --- modules/merge_fastq.nf | 20 ++++++++++++++++++++ subworkflows/short_reads.nf | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 modules/merge_fastq.nf diff --git a/modules/merge_fastq.nf b/modules/merge_fastq.nf new file mode 100644 index 0000000..4e5fe57 --- /dev/null +++ b/modules/merge_fastq.nf @@ -0,0 +1,20 @@ +process MERGE_FASTQ { + tag "$meta.id" + label 'MERGE_FASTQ' + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + + script: + def readList = reads.collect{ it.toString() } + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? 
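// [Editor's sketch — not part of the patch] The bitwise split at this exact
// step: even indices (ix & 1 == 0) collect R1 files, odd indices collect R2.
// Standalone, with hypothetical file names:
def readList = ['a_R1.fq.gz', 'a_R2.fq.gz', 'b_R1.fq.gz', 'b_R2.fq.gz']
def r1 = [], r2 = []
readList.eachWithIndex { v, ix -> (ix & 1 ? r2 : r1) << v }
assert r1 == ['a_R1.fq.gz', 'b_R1.fq.gz'] && r2 == ['a_R2.fq.gz', 'b_R2.fq.gz']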
read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${meta.sample}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${meta.sample}_2.merged.fastq.gz + """ +} \ No newline at end of file diff --git a/subworkflows/short_reads.nf b/subworkflows/short_reads.nf index 3cc3920..bbac49f 100644 --- a/subworkflows/short_reads.nf +++ b/subworkflows/short_reads.nf @@ -2,6 +2,7 @@ include { STEP_01_CLEAN_QC as S01_CLEAN_QC } from './01_clean_qc' include { STEP_02_ASSEMBLY as S02_ASSEMBLY } from './02_assembly' include { ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' include { QUAST as S04_FILTERED_QUAST } from '../modules/metaquast' +include { MERGE_FASTQ } from '../modules/merge_fastq.nf' workflow SHORT_READS { take: @@ -48,6 +49,42 @@ workflow SHORT_READS { ch_dedup = Channel.empty() if ( !params.stop_at_clean ) { + + ////////////////// + // Manage Flowcell + ////////////////// + ch_reads_tmp = ch_preprocessed_reads + .map { + meta, fastq -> + [ meta.sample, meta, fastq ] + } + .groupTuple(by: [0]) + .branch { + id, meta, fastq -> + single : fastq.size() == 1 + return [meta.flatten(),fastq.flatten() ] + multiple: fastq.size() > 1 + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + assembly:meta.assembly.unique().join(), + type:meta.type.unique().join(),], fastq.flatten() ] + } + + + MERGE_FASTQ ( + ch_reads_tmp.multiple + ) + .reads + .mix(ch_reads_tmp.single) + .set{ch_preprocessed_reads} + + ch_preprocessed_reads.view() + ///////////////////// + //End manage Flowcell + ///////////////////// + S02_ASSEMBLY ( ch_preprocessed_reads, assembly, assembly_tool ) ch_assembly = S02_ASSEMBLY.out.assembly ch_dedup = S02_ASSEMBLY.out.dedup -- GitLab From 6e3f8a19b846efff03a46401cf181c334d3c7f72 Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Thu, 21 Jul 2022 16:44:59 +0200 Subject: [PATCH 05/13] Use metadata: step_01_clean --- modules/host_filter.nf | 52 ++++++++++++++++++------------------- modules/sickle.nf | 20 +++++++------- subworkflows/01_clean_qc.nf | 2 +- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/modules/host_filter.nf b/modules/host_filter.nf index f7d0cb3..6da4d1b 100644 --- a/modules/host_filter.nf +++ b/modules/host_filter.nf @@ -1,5 +1,5 @@ process HOST_FILTER { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz' publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: '*.bam' @@ -9,32 +9,32 @@ process HOST_FILTER { else null} input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) path fasta path index output: - tuple val(sampleId), path("cleaned_${sampleId}_R1.fastq.gz"), path("cleaned_${sampleId}_R2.fastq.gz"), emit: reads - path "host_filter_flagstat/${sampleId}.host_filter.flagstat", emit: hf_report - path "${sampleId}.no_filter.flagstat", emit: nf_report + tuple val(meta), path("cleaned_${meta.id}*.fastq.gz"), emit: reads + path "host_filter_flagstat/${meta.id}.host_filter.flagstat", emit: hf_report + path "${meta.id}.no_filter.flagstat", emit: nf_report script: """ - bwa-mem2 mem -t ${task.cpus} ${fasta} ${read1} ${read2} > ${sampleId}.bam - samtools view -bhS -f 12 ${sampleId}.bam > ${sampleId}.without_host.bam + bwa-mem2 mem -t ${task.cpus} ${fasta} ${reads[0]} ${reads[1]} > ${meta.id}.bam + samtools view -bhS -f 12 ${meta.id}.bam > ${meta.id}.without_host.bam mkdir 
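// [Editor's sketch — not part of the patch] The flowcell management added to
// short_reads.nf above follows a group-then-branch pattern; a reduced,
// self-contained Nextflow version (needs a workflow context, contents hypothetical):
Channel.of( [[id: 's1_FC1', sample: 's1'], ['s1_FC1_R1.fq.gz', 's1_FC1_R2.fq.gz']],
            [[id: 's1_FC2', sample: 's1'], ['s1_FC2_R1.fq.gz', 's1_FC2_R2.fq.gz']],
            [[id: 's2_FC1', sample: 's2'], ['s2_FC1_R1.fq.gz', 's2_FC1_R2.fq.gz']] )
    .map { meta, fastq -> [ meta.sample, meta, fastq ] }
    .groupTuple(by: [0])
    .branch { id, meta, fastq ->
        single  : fastq.size() == 1     // one flowcell: passed through as-is
        multiple: fastq.size() > 1      // several flowcells: sent to MERGE_FASTQ
    }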
host_filter_flagstat - samtools flagstat ${sampleId}.bam > ${sampleId}.no_filter.flagstat - samtools flagstat ${sampleId}.without_host.bam >> host_filter_flagstat/${sampleId}.host_filter.flagstat - samtools sort -n -o ${sampleId}.without_host_sort.bam ${sampleId}.without_host.bam - samtools fastq -N -1 cleaned_${sampleId}_R1.fastq.gz -2 cleaned_${sampleId}_R2.fastq.gz ${sampleId}.without_host_sort.bam - rm ${sampleId}.bam - rm ${sampleId}.without_host.bam - rm ${sampleId}.without_host_sort.bam + samtools flagstat ${meta.id}.bam > ${meta.id}.no_filter.flagstat + samtools flagstat ${meta.id}.without_host.bam >> host_filter_flagstat/${meta.id}.host_filter.flagstat + samtools sort -n -o ${meta.id}.without_host_sort.bam ${meta.id}.without_host.bam + samtools fastq -N -1 cleaned_${meta.id}_R1.fastq.gz -2 cleaned_${meta.id}_R2.fastq.gz ${meta.id}.without_host_sort.bam + rm ${meta.id}.bam + rm ${meta.id}.without_host.bam + rm ${meta.id}.without_host_sort.bam """ } process HOST_FILTER_HIFI { - tag "${sampleId}" + tag "${meta.id}" label "MINIMAP2" publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz' @@ -45,27 +45,27 @@ process HOST_FILTER_HIFI { else null} input: - tuple val(sampleId), path(reads) + tuple val(meta.id), path(reads) path fasta output: - tuple val(sampleId), path("cleaned_${sampleId}.fastq.gz"), emit: reads - path "host_filter_flagstat/${sampleId}.host_filter.flagstat", emit: hf_report - path "${sampleId}.no_filter.flagstat", emit: nf_report + tuple val(meta.id), path("cleaned_${meta.id}.fastq.gz"), emit: reads + path "host_filter_flagstat/${meta.id}.host_filter.flagstat", emit: hf_report + path "${meta.id}.no_filter.flagstat", emit: nf_report script: """ - minimap2 -ax asm20 -t ${task.cpus} ${fasta} ${reads} | samtools sort -@ ${task.cpus} -o ${sampleId}.bam + minimap2 -ax asm20 -t ${task.cpus} ${fasta} ${reads} | samtools sort -@ ${task.cpus} -o ${meta.id}.bam - samtools view -@ ${task.cpus} -bh -f 4 ${sampleId}.bam > ${sampleId}.without_host.bam + samtools view -@ ${task.cpus} -bh -f 4 ${meta.id}.bam > ${meta.id}.without_host.bam mkdir host_filter_flagstat - samtools flagstat ${sampleId}.bam -@ ${task.cpus} > ${sampleId}.no_filter.flagstat - samtools flagstat ${sampleId}.without_host.bam -@ ${task.cpus} > host_filter_flagstat/${sampleId}.host_filter.flagstat - samtools fastq -@ ${task.cpus} ${sampleId}.without_host.bam | gzip > cleaned_${sampleId}.fastq.gz + samtools flagstat ${meta.id}.bam -@ ${task.cpus} > ${meta.id}.no_filter.flagstat + samtools flagstat ${meta.id}.without_host.bam -@ ${task.cpus} > host_filter_flagstat/${meta.id}.host_filter.flagstat + samtools fastq -@ ${task.cpus} ${meta.id}.without_host.bam | gzip > cleaned_${meta.id}.fastq.gz - rm ${sampleId}.bam - rm ${sampleId}.without_host.bam + rm ${meta.id}.bam + rm ${meta.id}.without_host.bam """ } diff --git a/modules/sickle.nf b/modules/sickle.nf index d4d010b..4eaf9eb 100644 --- a/modules/sickle.nf +++ b/modules/sickle.nf @@ -1,31 +1,31 @@ process SICKLE { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/", mode: 'copy', pattern: 'cleaned_*.fastq.gz' publishDir "${params.outdir}/01_clean_qc/01_1_cleaned_reads/logs", mode: 'copy', pattern: '*_sickle.log' input: - tuple val(sampleId), path(read1), path(read2), val(paired) + tuple val(meta), path(reads) output: - tuple val(sampleId), path("*${sampleId}*_R1.fastq.gz"), path("*${sampleId}*_R2.fastq.gz"), emit: reads - path "${sampleId}_single_sickle.fastq.gz", emit: 
single
-    path "${sampleId}_sickle.log", emit: report
+    tuple val(meta), path("*${meta.id}*.fastq.gz"), emit: reads
+    path "${meta.id}_single_sickle.fastq.gz", emit: single
+    path "${meta.id}_sickle.log", emit: report
 
     script:
-    mode = paired ? 'pe' : 'se'
+
     if (params.skip_host_filter) {
         // output are final cleaned files
-        options = "-o cleaned_${sampleId}_R1.fastq.gz -p cleaned_${sampleId}_R2.fastq.gz"
+        options = "-o cleaned_${meta.id}_R1.fastq.gz -p cleaned_${meta.id}_R2.fastq.gz"
     } else {
         //tempory files not saved in publish dir
-        options = "-o ${sampleId}_sickle_R1.fastq.gz -p ${sampleId}_sickle_R2.fastq.gz"
+        options = "-o ${meta.id}_sickle_R1.fastq.gz -p ${meta.id}_sickle_R2.fastq.gz"
     }
     options += " -t " + params.quality_type
     """
-    sickle ${mode} -f ${read1} -r ${read2} $options \
-    -s ${sampleId}_single_sickle.fastq.gz -g > ${sampleId}_sickle.log
+    sickle 'pe' -f ${reads[0]} -r ${reads[1]} $options \
+    -s ${meta.id}_single_sickle.fastq.gz -g > ${meta.id}_sickle.log
     """
 }
\ No newline at end of file
diff --git a/subworkflows/01_clean_qc.nf b/subworkflows/01_clean_qc.nf
index 09c614f..c7dfd94 100644
--- a/subworkflows/01_clean_qc.nf
+++ b/subworkflows/01_clean_qc.nf
@@ -25,7 +25,7 @@ workflow STEP_01_CLEAN_QC {
 
     if (params.use_sickle) {
         SICKLE (
-            ch_sickle_reads
+            ch_intermediate_reads
         )
         ch_intermediate_reads = SICKLE.out.reads
         ch_sickle_report = SICKLE.out.report
-- 
GitLab


From 0ffa3e59b0f5968ddcc512068f95010bbbf73ca3 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Thu, 21 Jul 2022 16:46:55 +0200
Subject: [PATCH 06/13] Use metadata: step02_assembly and step_03 filtering

Rework the filtering subworkflow so that it no longer contains any process
definition; create the module filtering_cpm.nf holding the filtering
processes.
---
 main.nf                        |  3 ++
 modules/assembly.nf            | 56 +++++++++++++++----------
 modules/filtering_cpm.nf       | 45 +++++++++++++++++++++++
 modules/metaquast.nf           | 14 ++++-----
 modules/reads_deduplication.nf | 38 +++++++++----------
 subworkflows/02_assembly.nf    | 11 +++----
 subworkflows/03_filtering.nf   | 55 ++++++---------------------
 subworkflows/hifi_reads.nf     |  2 +-
 subworkflows/short_reads.nf    | 18 ++++-----
 9 files changed, 126 insertions(+), 116 deletions(-)
 create mode 100644 modules/filtering_cpm.nf

diff --git a/main.nf b/main.nf
index 18f42cb..c998c18 100644
--- a/main.nf
+++ b/main.nf
@@ -231,6 +231,8 @@ workflow {
             }
         }
         .set { ch_assembly }
+    has_assembly = (file(params.input).splitCsv ( header:true, sep:',' ).assembly[0] != null)
+
     ////////////
     // End check samplesheet
     ////////////
diff --git a/modules/assembly.nf b/modules/assembly.nf
index 0f4ec71..22d962a 100644
--- a/modules/assembly.nf
+++ b/modules/assembly.nf
@@ -1,23 +1,23 @@
 process METASPADES {
-    tag "${sampleId}"
+    tag "${meta.id}"
     publishDir "${params.outdir}/02_assembly", mode: 'copy'
     label 'ASSEMBLY_SR'
 
     input:
-    tuple val(sampleId), path(read1), path(read2)
+    tuple val(meta), path(reads)
 
     output:
-    tuple val(sampleId), path("metaspades/${sampleId}.contigs.fa"), emit: assembly
-    tuple val(sampleId), path("metaspades/${sampleId}.log"), path("metaspades/${sampleId}.params.txt"), emit: report
+    tuple val(meta), path("metaspades/${meta.id}.contigs.fa"), emit: assembly
+    tuple val(meta.id), path("metaspades/${meta.id}.log"), path("metaspades/${meta.id}.params.txt"), emit: report
 
     script:
     (_,mem,unit) = (task.memory =~ /(\d+) ([A-Z]B)/)[0]
     if ( unit =~ /GB/ ) {
     """
-    metaspades.py
-t ${task.cpus} -m $mem -1 ${read1} -2 ${read2} -o metaspades - mv metaspades/scaffolds.fasta metaspades/${sampleId}.contigs.fa - mv metaspades/spades.log metaspades/${sampleId}.log - mv metaspades/params.txt metaspades/${sampleId}.params.txt + metaspades.py -t ${task.cpus} -m $mem -1 ${reads[0]} -2 ${reads[1]} -o metaspades + mv metaspades/scaffolds.fasta metaspades/${meta.id}.contigs.fa + mv metaspades/spades.log metaspades/${meta.id}.log + mv metaspades/params.txt metaspades/${meta.id}.params.txt """ } else { error "Memory setting for the ASSEMBLY process is in $unit, it must be in GB (check config files) " @@ -26,72 +26,72 @@ process METASPADES { process MEGAHIT { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy' label 'ASSEMBLY_SR' input: - tuple val(sampleId), path(read1), path(read2) + tuple val(meta), path(reads) output: - tuple val(sampleId), path("megahit/${sampleId}.contigs.fa"), emit: assembly - tuple val(sampleId), path("megahit/${sampleId}.log"), path("megahit/${sampleId}.params.txt"), emit: report + tuple val(meta), path("megahit/${meta.id}.contigs.fa"), emit: assembly + tuple val(meta.id), path("megahit/${meta.id}.log"), path("megahit/${meta.id}.params.txt"), emit: report script: """ - megahit -t ${task.cpus} -1 ${read1} -2 ${read2} -o megahit --out-prefix "${sampleId}" - mv megahit/options.json megahit/${sampleId}.params.txt + megahit -t ${task.cpus} -1 ${reads[0]} -2 ${reads[1]} -o megahit --out-prefix "${meta.id}" + mv megahit/options.json megahit/${meta.id}.params.txt rm -r megahit/intermediate_contigs """ } process HIFIASM_META { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy' label 'ASSEMBLY_HIFI' input: - tuple val(sampleId), path(reads) + tuple val(meta), path(reads) output: - tuple val(sampleId), path("hifiasm-meta/${sampleId}.contigs.fa"), emit: assembly - tuple val(sampleId), path("hifiasm-meta/${sampleId}.log"), path("hifiasm-meta/${sampleId}.params.txt"), emit: report + tuple val(meta), path("hifiasm-meta/${meta.id}.contigs.fa"), emit: assembly + tuple val(meta.id), path("hifiasm-meta/${meta.id}.log"), path("hifiasm-meta/${meta.id}.params.txt"), emit: report script: """ mkdir hifiasm-meta - hifiasm_meta -t ${task.cpus} -o ${sampleId} $reads 2> hifiasm-meta/${sampleId}.log + hifiasm_meta -t ${task.cpus} -o ${meta.id} $reads 2> hifiasm-meta/${meta.id}.log # gfa to fasta format - awk '/^S/{print ">"\$2"\\n"\$3}' ${sampleId}.p_ctg.gfa | fold > hifiasm-meta/${sampleId}.contigs.fa + awk '/^S/{print ">"\$2"\\n"\$3}' ${meta.id}.p_ctg.gfa | fold > hifiasm-meta/${meta.id}.contigs.fa - mv ${sampleId}.cmd hifiasm-meta/${sampleId}.params.txt + mv ${meta.id}.cmd hifiasm-meta/${meta.id}.params.txt """ } process METAFLYE { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy' label 'ASSEMBLY_HIFI' input: - tuple val(sampleId), path(reads) + tuple val(meta), path(reads) output: - tuple val(sampleId), path("metaflye/${sampleId}.contigs.fa"), emit: assembly - tuple val(sampleId), path("metaflye/${sampleId}.log"), path("metaflye/${sampleId}.params.json"), emit: report + tuple val(meta), path("metaflye/${meta.id}.contigs.fa"), emit: assembly + tuple val(meta.id), path("metaflye/${meta.id}.log"), path("metaflye/${meta.id}.params.json"), emit: report script: """ mkdir metaflye - flye --pacbio-hifi $reads -o 'metaflye' --meta -t ${task.cpus} 2> metaflye/${sampleId}.log + flye --pacbio-hifi $reads -o 'metaflye' --meta -t ${task.cpus} 2> 
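// [Editor's sketch — not part of the patch] The task.memory parsing used by
// METASPADES above, standalone (the '16 GB' value is hypothetical):
def memory = '16 GB'
def (full, mem, unit) = (memory =~ /(\d+) ([A-Z]B)/)[0]
assert mem == '16' && unit == 'GB'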
metaflye/${meta.id}.log - mv metaflye/assembly.fasta metaflye/${sampleId}.contigs.fa - mv metaflye/params.json metaflye/${sampleId}.params.json + mv metaflye/assembly.fasta metaflye/${meta.id}.contigs.fa + mv metaflye/params.json metaflye/${meta.id}.params.json """ } diff --git a/modules/filtering_cpm.nf b/modules/filtering_cpm.nf new file mode 100644 index 0000000..951a201 --- /dev/null +++ b/modules/filtering_cpm.nf @@ -0,0 +1,45 @@ +process CHUNK_ASSEMBLY_FILTER { + label 'ASSEMBLY_FILTER' + + input: + tuple val(meta), path(assembly_file), path(idxstats) + val min_cpm + + output: + tuple val(meta), path("${chunk_name}_select_cpm${min_cpm}.fasta"), emit: chunk_selected + tuple val(meta), path("${chunk_name}_discard_cpm${min_cpm}.fasta"), emit: chunk_discarded + + script: + chunk_name = assembly_file.baseName + """ + Filter_contig_per_cpm.py -i ${idxstats} -f ${assembly_file} -c ${min_cpm} -s ${chunk_name}_select_cpm${min_cpm}.fasta -d ${chunk_name}_discard_cpm${min_cpm}.fasta + """ +} + +process MERGE_ASSEMBLY_FILTER { + label 'ASSEMBLY_FILTER' + + tag "${meta.id}" + publishDir "${params.outdir}/03_filtering/", mode: 'copy' + + input: + tuple val(meta), path(select_fasta) + tuple val(meta), path(discard_fasta) + val min_cpm + + output: + tuple val(meta), path("${meta.id}_select_contigs_cpm${min_cpm}.fasta"), emit: merged_selected + tuple val(meta), path("${meta.id}_discard_contigs_cpm${min_cpm}.fasta"), emit: merged_discarded + + shell: + ''' + echo !{select_fasta} | sed "s/ /\\n/g" | sort > select_list + echo !{discard_fasta} | sed "s/ /\\n/g" | sort > discard_list + + for i in `cat select_list` ; do cat $i >> !{meta.id}_select_contigs_cpm!{min_cpm}.fasta ; done + for j in `cat discard_list` ; do cat $j >> !{meta.id}_discard_contigs_cpm!{min_cpm}.fasta ; done + + rm select_list + rm discard_list + ''' +} \ No newline at end of file diff --git a/modules/metaquast.nf b/modules/metaquast.nf index 5fbae96..0fa75f3 100644 --- a/modules/metaquast.nf +++ b/modules/metaquast.nf @@ -1,15 +1,15 @@ process QUAST { - tag "${sampleId}" + tag "${meta.id}" label 'QUAST' publishDir "${params.outdir}", mode: 'copy' input: - tuple val(sampleId), path(assembly) + tuple val(meta), path(assembly) val step output: - path "${outdirModule}/${sampleId}/*", emit: all - path "${{outdirModule}}/${sampleId}/report.tsv", emit: report + path "${outdirModule}/${meta.id}/*", emit: all + path "${{outdirModule}}/${meta.id}/report.tsv", emit: report script: @@ -22,8 +22,8 @@ process QUAST { } } """ - mkdir -p $outdirModule/${sampleId}/ - touch $outdirModule/${sampleId}/report.tsv - metaquast.py --threads ${task.cpus} --rna-finding --max-ref-number 0 --min-contig 0 ${assembly} -o $outdirModule/${sampleId} --labels ${sampleId} + mkdir -p $outdirModule/${meta.id}/ + touch $outdirModule/${meta.id}/report.tsv + metaquast.py --threads ${task.cpus} --rna-finding --max-ref-number 0 --min-contig 0 ${assembly} -o $outdirModule/${meta.id} --labels ${meta.id} """ } diff --git a/modules/reads_deduplication.nf b/modules/reads_deduplication.nf index 0d907ad..de2fdc7 100644 --- a/modules/reads_deduplication.nf +++ b/modules/reads_deduplication.nf @@ -1,34 +1,34 @@ process READS_DEDUPLICATION { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/02_assembly", mode: 'copy', pattern: '*.fastq.gz' publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.idxstats' publishDir "${params.outdir}/02_assembly/logs", mode: 'copy', pattern: '*.flagstat' input: - tuple val(sampleId), path(assembly), path(read1), 
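// [Editor's sketch — not part of the patch] One way the chunk/merge pair in
// filtering_cpm.nf can be fed: scatter the assembly with splitFasta, filter
// each chunk, then gather per sample. The operator options, chunk size and
// channel name here are assumptions, not taken from this patch:
assembly_and_idxstats
    .splitFasta( by: 100000, file: true, elem: 1 )   // one emission per chunk
    .set { ch_assembly_chunks }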
path(read2) + tuple val(meta), path(assembly), path(reads) output: - tuple val(sampleId), path("${sampleId}_R1_dedup.fastq.gz"), path("${sampleId}_R2_dedup.fastq.gz"), emit: dedup - tuple val(sampleId), path("${sampleId}.count_reads_on_contigs.idxstats"), emit: idxstats - path "${sampleId}.count_reads_on_contigs.flagstat", emit: flagstat + tuple val(meta), path("${meta.id}_R1_dedup.fastq.gz"), path("${meta.id}_R2_dedup.fastq.gz"), emit: dedup + tuple val(meta), path("${meta.id}.count_reads_on_contigs.idxstats"), emit: idxstats + path "${meta.id}.count_reads_on_contigs.flagstat", emit: flagstat script: """ mkdir logs bwa-mem2 index ${assembly} -p ${assembly} - bwa-mem2 mem ${assembly} ${read1} ${read2} | samtools view -bS - | samtools sort -n -o ${sampleId}.sort.bam - - samtools fixmate -m ${sampleId}.sort.bam ${sampleId}.fixmate.bam - samtools sort -o ${sampleId}.fixmate.positionsort.bam ${sampleId}.fixmate.bam - samtools markdup -r -S -s -f ${sampleId}.stats ${sampleId}.fixmate.positionsort.bam ${sampleId}.filtered.bam - samtools index ${sampleId}.filtered.bam - samtools idxstats ${sampleId}.filtered.bam > ${sampleId}.count_reads_on_contigs.idxstats - samtools flagstat ${sampleId}.filtered.bam > ${sampleId}.count_reads_on_contigs.flagstat - samtools sort -n -o ${sampleId}.filtered.sort.bam ${sampleId}.filtered.bam - samtools fastq -N -1 ${sampleId}_R1_dedup.fastq.gz -2 ${sampleId}_R2_dedup.fastq.gz ${sampleId}.filtered.sort.bam - rm ${sampleId}.sort.bam - rm ${sampleId}.fixmate.bam - rm ${sampleId}.fixmate.positionsort.bam - rm ${sampleId}.filtered.bam - rm ${sampleId}.filtered.sort.bam + bwa-mem2 mem ${assembly} ${reads[0]} ${reads[1]} | samtools view -bS - | samtools sort -n -o ${meta.id}.sort.bam - + samtools fixmate -m ${meta.id}.sort.bam ${meta.id}.fixmate.bam + samtools sort -o ${meta.id}.fixmate.positionsort.bam ${meta.id}.fixmate.bam + samtools markdup -r -S -s -f ${meta.id}.stats ${meta.id}.fixmate.positionsort.bam ${meta.id}.filtered.bam + samtools index ${meta.id}.filtered.bam + samtools idxstats ${meta.id}.filtered.bam > ${meta.id}.count_reads_on_contigs.idxstats + samtools flagstat ${meta.id}.filtered.bam > ${meta.id}.count_reads_on_contigs.flagstat + samtools sort -n -o ${meta.id}.filtered.sort.bam ${meta.id}.filtered.bam + samtools fastq -N -1 ${meta.id}_R1_dedup.fastq.gz -2 ${meta.id}_R2_dedup.fastq.gz ${meta.id}.filtered.sort.bam + rm ${meta.id}.sort.bam + rm ${meta.id}.fixmate.bam + rm ${meta.id}.fixmate.positionsort.bam + rm ${meta.id}.filtered.bam + rm ${meta.id}.filtered.sort.bam """ } \ No newline at end of file diff --git a/subworkflows/02_assembly.nf b/subworkflows/02_assembly.nf index 8658d58..355ebc3 100644 --- a/subworkflows/02_assembly.nf +++ b/subworkflows/02_assembly.nf @@ -16,21 +16,18 @@ workflow STEP_02_ASSEMBLY { if(assembly_tool == 'metaspades') { METASPADES(preprocessed_reads) ch_assembly = METASPADES.out.assembly - } + } else if(assembly_tool == 'megahit') { - MEGAHIT(preprocessed_reads) - ch_assembly = MEGAHIT.out.assembly - } + MEGAHIT(preprocessed_reads) + ch_assembly = MEGAHIT.out.assembly + } else { exit 1, "Invalid short read assembly parameter: ${assembly_tool}" } } - - // ch_filtered = Channel.value(false) ASSEMBLY_QUAST( ch_assembly, 'ASSEMBLY' ) - ch_assembly_report = ASSEMBLY_QUAST.out.report ch_assembly_and_preprocessed = ch_assembly.join(preprocessed_reads, remainder: true) diff --git a/subworkflows/03_filtering.nf b/subworkflows/03_filtering.nf index 5d04839..5c6d81e 100644 --- a/subworkflows/03_filtering.nf +++ 
b/subworkflows/03_filtering.nf @@ -1,50 +1,8 @@ -process CHUNK_ASSEMBLY_FILTER { - label 'ASSEMBLY_FILTER' - - input: - tuple val(sampleId), path(assembly_file), path(idxstats) - val min_cpm +include { CHUNK_ASSEMBLY_FILTER} from '../modules/filtering_cpm.nf' +include { MERGE_ASSEMBLY_FILTER} from '../modules/filtering_cpm.nf' +include { QUAST as FILTERED_QUAST } from '../modules/metaquast' - output: - tuple val(sampleId), path("${chunk_name}_select_cpm${min_cpm}.fasta"), emit: chunk_selected - tuple val(sampleId), path("${chunk_name}_discard_cpm${min_cpm}.fasta"), emit: chunk_discarded - - script: - chunk_name = assembly_file.baseName - """ - Filter_contig_per_cpm.py -i ${idxstats} -f ${assembly_file} -c ${min_cpm} -s ${chunk_name}_select_cpm${min_cpm}.fasta -d ${chunk_name}_discard_cpm${min_cpm}.fasta - """ -} - -process MERGE_ASSEMBLY_FILTER { - label 'ASSEMBLY_FILTER' - - tag "${sampleId}" - publishDir "${params.outdir}/03_filtering/", mode: 'copy' - - input: - tuple val(sampleId), path(select_fasta) - tuple val(sampleId), path(discard_fasta) - val min_cpm - - output: - tuple val(sampleId), path("${sampleId}_select_contigs_cpm${min_cpm}.fasta"), emit: merged_selected - tuple val(sampleId), path("${sampleId}_discard_contigs_cpm${min_cpm}.fasta"), emit: merged_discarded - - shell: - ''' - echo !{select_fasta} | sed "s/ /\\n/g" | sort > select_list - echo !{discard_fasta} | sed "s/ /\\n/g" | sort > discard_list - - for i in `cat select_list` ; do cat $i >> !{sampleId}_select_contigs_cpm!{min_cpm}.fasta ; done - for j in `cat discard_list` ; do cat $j >> !{sampleId}_discard_contigs_cpm!{min_cpm}.fasta ; done - - rm select_list - rm discard_list - ''' -} - -workflow ASSEMBLY_FILTER { +workflow STEP_03_ASSEMBLY_FILTER { take: assembly_and_idxstats min_cpm @@ -72,6 +30,11 @@ workflow ASSEMBLY_FILTER { ) ch_merged_selected = MERGE_ASSEMBLY_FILTER.out.merged_selected + FILTERED_QUAST( ch_merged_selected, 'FILTERING' ) + ch_filtered_report = FILTERED_QUAST.out.report + + emit: selected = ch_merged_selected + report = ch_filtered_report } \ No newline at end of file diff --git a/subworkflows/hifi_reads.nf b/subworkflows/hifi_reads.nf index d35441c..e9215b3 100644 --- a/subworkflows/hifi_reads.nf +++ b/subworkflows/hifi_reads.nf @@ -1,5 +1,5 @@ -include { ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' +include { STEP_03_ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' include { STEP_01_CLEAN_QC_HIFI} from './01_clean_qc' include { STEP_02_ASSEMBLY_HIFI as S02_ASSEMBLY } from './02_assembly' include { MINIMAP2_FILTERING } from '../modules/read_alignment' diff --git a/subworkflows/short_reads.nf b/subworkflows/short_reads.nf index bbac49f..8d1b128 100644 --- a/subworkflows/short_reads.nf +++ b/subworkflows/short_reads.nf @@ -1,7 +1,6 @@ include { STEP_01_CLEAN_QC as S01_CLEAN_QC } from './01_clean_qc' include { STEP_02_ASSEMBLY as S02_ASSEMBLY } from './02_assembly' -include { ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' -include { QUAST as S04_FILTERED_QUAST } from '../modules/metaquast' +include { STEP_03_ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering' include { MERGE_FASTQ } from '../modules/merge_fastq.nf' workflow SHORT_READS { @@ -11,6 +10,7 @@ workflow SHORT_READS { host_fasta host_index kaiju_db + has_assembly assembly_tool main: @@ -62,7 +62,12 @@ workflow SHORT_READS { .branch { id, meta, fastq -> single : fastq.size() == 1 - return [meta.flatten(),fastq.flatten() ] + return [[id:meta.sample.unique().join(), + sample:meta.sample.unique().join(), + 
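// [Editor's sketch — not part of the patch] The merged-meta construction at
// this step: Groovy GPath over a list of maps collects each field across
// flowcells. Standalone, with hypothetical metas for one sample:
def metas = [ [sample: 's1', flowcell: 'FC1', group: 'g1'],
              [sample: 's1', flowcell: 'FC2', group: 'g1'] ]
assert metas.sample.unique().join() == 's1'
assert metas.flowcell.join('_')     == 'FC1_FC2'
assert metas.group.unique().join()  == 'g1'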
flowcell:meta.flowcell.join("_"), + group:meta.group.unique().join(), + assembly:meta.assembly.unique().join(), + type:meta.type.unique().join(),], fastq.flatten() ] multiple: fastq.size() > 1 return [[id:meta.sample.unique().join(), sample:meta.sample.unique().join(), @@ -80,12 +85,11 @@ workflow SHORT_READS { .mix(ch_reads_tmp.single) .set{ch_preprocessed_reads} - ch_preprocessed_reads.view() ///////////////////// //End manage Flowcell ///////////////////// - S02_ASSEMBLY ( ch_preprocessed_reads, assembly, assembly_tool ) + S02_ASSEMBLY ( ch_preprocessed_reads, assembly, has_assembly, assembly_tool ) ch_assembly = S02_ASSEMBLY.out.assembly ch_dedup = S02_ASSEMBLY.out.dedup ch_idxstats = S02_ASSEMBLY.out.idxstats @@ -109,9 +113,7 @@ workflow SHORT_READS { ch_min_contigs_cpm ) ch_assembly = S03_FILTERING.out.selected - - S04_FILTERED_QUAST( ch_assembly, 'FILTERING' ) - ch_filtered_report = S04_FILTERED_QUAST.out.report + ch_filtered_report = S03_FILTERING.out.report } emit: -- GitLab From 506cc34dd00dac963aa497a98a76ff6af63bc32f Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Fri, 22 Jul 2022 11:01:55 +0200 Subject: [PATCH 07/13] Use metadata: step 04 prokka and step 05 alignment --- main.nf | 3 ++ modules/diamond.nf | 16 ++++----- modules/prokka.nf | 40 ++++++++++----------- modules/read_alignment.nf | 66 +++++++++++++++++----------------- modules/reads_deduplication.nf | 2 +- subworkflows/00_databases.nf | 3 +- subworkflows/05_alignment.nf | 4 +-- subworkflows/shared.nf | 3 +- 8 files changed, 71 insertions(+), 66 deletions(-) diff --git a/main.nf b/main.nf index c998c18..d29e7f2 100644 --- a/main.nf +++ b/main.nf @@ -243,6 +243,7 @@ workflow { ch_kaiju_db = Channel.empty() ch_eggnog_db = Channel.empty() ch_taxonomy = Channel.empty() + ch_diamond = Channel.empty() DATABASES () ch_host_fasta = DATABASES.out.host_fasta @@ -250,6 +251,7 @@ workflow { ch_kaiju_db = DATABASES.out.kaiju_db ch_eggnog_db = DATABASES.out.eggnog ch_taxonomy = DATABASES.out.taxonomy + ch_diamond = DATABASES.out.diamond ch_multiqc_config = Channel.empty() @@ -345,6 +347,7 @@ workflow { SH ( ch_reads, ch_assembly, + ch_diamond, ch_eggnog_db, ch_taxonomy ) diff --git a/modules/diamond.nf b/modules/diamond.nf index 13b4523..3081655 100644 --- a/modules/diamond.nf +++ b/modules/diamond.nf @@ -1,22 +1,22 @@ process DIAMOND { - publishDir "${params.outdir}/05_alignment/05_2_database_alignment/$sampleId", mode: 'copy' - tag "${sampleId}" + publishDir "${params.outdir}/05_alignment/05_2_database_alignment/$meta.id", mode: 'copy' + tag "${meta.id}" input: - tuple val(sampleId), path(faa) - val diamond_bank + tuple val(meta), path(faa) + path diamond_bank output: - tuple val(sampleId), path("${sampleId}_aln_diamond.m8"), emit: m8 + tuple val(meta), path("${meta.id}_aln_diamond.m8"), emit: m8 script: fmt="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen stitle" fmt_tab=fmt.replaceAll(" ","\t") """ echo "$fmt_tab" > head.m8 - diamond blastp -p ${task.cpus} -d ${diamond_bank} -q ${faa} -o ${sampleId}_aln_diamond.nohead.m8 -f 6 $fmt - cat head.m8 ${sampleId}_aln_diamond.nohead.m8 > ${sampleId}_aln_diamond.m8 - rm ${sampleId}_aln_diamond.nohead.m8 + diamond blastp -p ${task.cpus} -d ${diamond_bank} -q ${faa} -o ${meta.id}_aln_diamond.nohead.m8 -f 6 $fmt + cat head.m8 ${meta.id}_aln_diamond.nohead.m8 > ${meta.id}_aln_diamond.m8 + rm ${meta.id}_aln_diamond.nohead.m8 rm head.m8 """ } diff --git a/modules/prokka.nf b/modules/prokka.nf index 984e972..d6d0ca4 100644 
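// [Editor's sketch — not part of the patch] The header trick in the DIAMOND
// process above: one format string drives both the '-f 6' field list and the
// prepended header line. Standalone:
def fmt     = 'qseqid sseqid pident length evalue bitscore'
def fmt_tab = fmt.replaceAll(' ', '\t')
assert fmt_tab.split('\t')[2] == 'pident'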
--- a/modules/prokka.nf +++ b/modules/prokka.nf @@ -1,45 +1,45 @@ process PROKKA { - tag "${sampleId}" + tag "${meta.id}" input: - tuple val(sampleId), file(assembly_file) + tuple val(meta), file(assembly_file) output: - tuple val(sampleId), path("PROKKA_${sampleId}"), emit: prokka_results - path "PROKKA_${sampleId}/${sampleId}.txt", emit: report + tuple val(meta), path("PROKKA_${meta.id}"), emit: prokka_results + path "PROKKA_${meta.id}/${meta.id}.txt", emit: report script: """ - prokka --metagenome --noanno --rawproduct --outdir PROKKA_${sampleId} --prefix ${sampleId} ${assembly_file} --centre X --compliant --cpus ${task.cpus} - rm PROKKA_${sampleId}/*.gbk - gt gff3validator PROKKA_${sampleId}/${sampleId}.gff + prokka --metagenome --noanno --rawproduct --outdir PROKKA_${meta.id} --prefix ${meta.id} ${assembly_file} --centre X --compliant --cpus ${task.cpus} + rm PROKKA_${meta.id}/*.gbk + gt gff3validator PROKKA_${meta.id}/${meta.id}.gff """ } process RENAME_CONTIGS_AND_GENES { - tag "${sampleId}" + tag "${meta.id}" publishDir "${params.outdir}/04_structural_annot", mode: 'copy' label 'PYTHON' input: - tuple val(sampleId), path(prokka_results) + tuple val(meta), path(prokka_results) output: - tuple val(sampleId), path("${sampleId}.annotated.fna"), emit: fna - tuple val(sampleId), path("${sampleId}.annotated.ffn"), emit: ffn - tuple val(sampleId), path("${sampleId}.annotated.faa"), emit: faa - tuple val(sampleId), path("${sampleId}.annotated.gff"), emit: gff - tuple val(sampleId), path("${sampleId}_prot.len"), emit: prot_length + tuple val(meta), path("${meta.id}.annotated.fna"), emit: fna + tuple val(meta), path("${meta.id}.annotated.ffn"), emit: ffn + tuple val(meta), path("${meta.id}.annotated.faa"), emit: faa + tuple val(meta), path("${meta.id}.annotated.gff"), emit: gff + tuple val(meta), path("${meta.id}_prot.len"), emit: prot_length script: """ - grep "^gnl" ${prokka_results}/${sampleId}.gff > ${sampleId}_only_gnl.gff + grep "^gnl" ${prokka_results}/${meta.id}.gff > ${meta.id}_only_gnl.gff - Rename_contigs_and_genes.py -f ${sampleId}_only_gnl.gff -faa ${prokka_results}/${sampleId}.faa \ - -ffn ${prokka_results}/${sampleId}.ffn -fna ${prokka_results}/${sampleId}.fna \ - -p ${sampleId} -oGFF ${sampleId}.annotated.gff -oFAA ${sampleId}.annotated.faa \ - -oFFN ${sampleId}.annotated.ffn -oFNA ${sampleId}.annotated.fna + Rename_contigs_and_genes.py -f ${meta.id}_only_gnl.gff -faa ${prokka_results}/${meta.id}.faa \ + -ffn ${prokka_results}/${meta.id}.ffn -fna ${prokka_results}/${meta.id}.fna \ + -p ${meta.id} -oGFF ${meta.id}.annotated.gff -oFAA ${meta.id}.annotated.faa \ + -oFFN ${meta.id}.annotated.ffn -oFNA ${meta.id}.annotated.fna - samtools faidx ${sampleId}.annotated.faa; cut -f 1,2 ${sampleId}.annotated.faa.fai > ${sampleId}_prot.len + samtools faidx ${meta.id}.annotated.faa; cut -f 1,2 ${meta.id}.annotated.faa.fai > ${meta.id}_prot.len """ } diff --git a/modules/read_alignment.nf b/modules/read_alignment.nf index f2c922a..76b42fe 100644 --- a/modules/read_alignment.nf +++ b/modules/read_alignment.nf @@ -1,80 +1,80 @@ process BWA_MEM { - tag "${sampleId}" - publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/${sampleId}", mode: 'copy' + tag "${meta.id}" + publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/${meta.id}", mode: 'copy' input: - tuple val(sampleId), path(fna), path(read1), path(read2) + tuple val(meta), path(fna), path(reads) output: - tuple val(sampleId), path("${sampleId}.sort.bam"), path("${sampleId}.sort.bam.bai"), emit: 
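// [Editor's sketch — not part of the patch] Why DIAMOND's bank input became a
// 'path' fed from a value channel (see 00_databases.nf further down): the file
// is staged once and reused by every task. Sketch with a hypothetical bank path:
ch_diamond = Channel.value( file('databases/bank.dmnd') )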
bam - tuple val(sampleId), path("${sampleId}_coverage.tsv"), emit: sam_coverage - path "${sampleId}*" + tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam + tuple val(meta.id), path("${meta.id}_coverage.tsv"), emit: sam_coverage + path "${meta.id}*" script: """ bwa-mem2 index ${fna} -p ${fna} - bwa-mem2 mem -t ${task.cpus} ${fna} ${read1} ${read2} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} - -o ${sampleId}.sort.bam - samtools index -@ ${task.cpus} ${sampleId}.sort.bam + bwa-mem2 mem -t ${task.cpus} ${fna} ${reads[0]} ${reads[1]} | samtools view -@ ${task.cpus} -bS - | samtools sort -@ ${task.cpus} - -o ${meta.id}.sort.bam + samtools index -@ ${task.cpus} ${meta.id}.sort.bam - samtools flagstat -@ ${task.cpus} ${sampleId}.sort.bam > ${sampleId}.flagstat - samtools coverage ${sampleId}.sort.bam > ${sampleId}_coverage.tsv + samtools flagstat -@ ${task.cpus} ${meta.id}.sort.bam > ${meta.id}.flagstat + samtools coverage ${meta.id}.sort.bam > ${meta.id}_coverage.tsv - samtools idxstats ${sampleId}.sort.bam > ${sampleId}.sort.bam.idxstats + samtools idxstats ${meta.id}.sort.bam > ${meta.id}.sort.bam.idxstats """ } process MINIMAP2 { - tag "${sampleId}" + tag "${meta.id}" label 'MINIMAP2' - publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/$sampleId", mode: 'copy' + publishDir "${params.outdir}/05_alignment/05_1_reads_alignment_on_contigs/$meta.id", mode: 'copy' input: - tuple val(sampleId), path(fna_prokka), path(reads) + tuple val(meta), path(fna_prokka), path(reads) output: - tuple val(sampleId), path("${sampleId}.sort.bam"), path("${sampleId}.sort.bam.bai"), emit: bam - tuple val(sampleId), path("${sampleId}_coverage.tsv"), emit: sam_coverage - path "${sampleId}*" + tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam + tuple val(meta.id), path("${meta.id}_coverage.tsv"), emit: sam_coverage + path "${meta.id}*" script: """ # align reads to contigs, keep only primary aln and sort resulting bam - minimap2 -t ${task.cpus} -ax asm20 $fna_prokka $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${sampleId}.sort.bam + minimap2 -t ${task.cpus} -ax asm20 $fna_prokka $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${meta.id}.sort.bam - samtools index ${sampleId}.sort.bam -@ ${task.cpus} - samtools flagstat -@ ${task.cpus} ${sampleId}.sort.bam > ${sampleId}.flagstat - samtools coverage ${sampleId}.sort.bam > ${sampleId}_coverage.tsv + samtools index ${meta.id}.sort.bam -@ ${task.cpus} + samtools flagstat -@ ${task.cpus} ${meta.id}.sort.bam > ${meta.id}.flagstat + samtools coverage ${meta.id}.sort.bam > ${meta.id}_coverage.tsv - samtools idxstats ${sampleId}.sort.bam > ${sampleId}.sort.bam.idxstats + samtools idxstats ${meta.id}.sort.bam > ${meta.id}.sort.bam.idxstats """ } process MINIMAP2_FILTERING { - tag "${sampleId}" + tag "${meta.id}" label 'MINIMAP2' publishDir "${params.outdir}/02_assembly/logs/", mode: 'copy' input: - tuple val(sampleId), path(assembly), path(reads) + tuple val(meta), path(assembly), path(reads) output: - tuple val(sampleId), path("${sampleId}.idxstats"), emit: sam_idxstat - path "${sampleId}.flagstat", emit: sam_flagstat - path "${sampleId}*" + tuple val(meta.id), path("${meta.id}.idxstats"), emit: sam_idxstat + path "${meta.id}.flagstat", emit: sam_flagstat + path "${meta.id}*" script: """ # align reads to contigs, keep only primary aln and sort resulting bam - minimap2 -t 
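// [Editor's sketch — not part of the patch] The SAM-flag arithmetic used at
// this step (and in the earlier -f 12 host filter), written out:
assert 2304 == 256 + 2048   // -F 2304 drops secondary + supplementary alignments
assert 12   == 4 + 8        // -f 12 keeps pairs with both mates unmapped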
${task.cpus} -ax asm20 $assembly $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${sampleId}.sort.bam + minimap2 -t ${task.cpus} -ax asm20 $assembly $reads | samtools view -@ ${task.cpus} -b -F 2304 | samtools sort -@ ${task.cpus} -o ${meta.id}.sort.bam - samtools index ${sampleId}.sort.bam -@ ${task.cpus} - samtools flagstat -@ ${task.cpus} ${sampleId}.sort.bam > ${sampleId}.flagstat - samtools coverage ${sampleId}.sort.bam > ${sampleId}_coverage.tsv + samtools index ${meta.id}.sort.bam -@ ${task.cpus} + samtools flagstat -@ ${task.cpus} ${meta.id}.sort.bam > ${meta.id}.flagstat + samtools coverage ${meta.id}.sort.bam > ${meta.id}_coverage.tsv - samtools idxstats ${sampleId}.sort.bam > ${sampleId}.idxstats + samtools idxstats ${meta.id}.sort.bam > ${meta.id}.idxstats - rm ${sampleId}.sort.bam* + rm ${meta.id}.sort.bam* """ } diff --git a/modules/reads_deduplication.nf b/modules/reads_deduplication.nf index de2fdc7..83d2b56 100644 --- a/modules/reads_deduplication.nf +++ b/modules/reads_deduplication.nf @@ -8,7 +8,7 @@ process READS_DEDUPLICATION { tuple val(meta), path(assembly), path(reads) output: - tuple val(meta), path("${meta.id}_R1_dedup.fastq.gz"), path("${meta.id}_R2_dedup.fastq.gz"), emit: dedup + tuple val(meta), path("${meta.id}*_dedup.fastq.gz"), emit: dedup tuple val(meta), path("${meta.id}.count_reads_on_contigs.idxstats"), emit: idxstats path "${meta.id}.count_reads_on_contigs.flagstat", emit: flagstat diff --git a/subworkflows/00_databases.nf b/subworkflows/00_databases.nf index 78e362e..ca8eadd 100644 --- a/subworkflows/00_databases.nf +++ b/subworkflows/00_databases.nf @@ -85,7 +85,7 @@ workflow DATABASES { ch_diamond = Channel.empty() if ( !(params.stop_at_clean) && !(params.stop_at_assembly) && !(params.stop_at_filtering) && !(params.stop_at_structural_annot) ) { - ch_diamond = Channel.fromPath(file(params.diamond_bank)) + ch_diamond =Channel.value(file(params.diamond_bank)) } GET_DB_VERSIONS( @@ -103,6 +103,7 @@ workflow DATABASES { kaiju_db = ch_kaiju_db eggnog = ch_eggnog taxonomy = ch_taxonomy.first() + diamond = ch_diamond } process INDEX_HOST { diff --git a/subworkflows/05_alignment.nf b/subworkflows/05_alignment.nf index 4bd23e1..05d2763 100644 --- a/subworkflows/05_alignment.nf +++ b/subworkflows/05_alignment.nf @@ -5,6 +5,7 @@ workflow STEP_05_ALIGNMENT { take: contigs_and_reads prokka_faa + diamond main: if (params.type == 'SR') { @@ -18,10 +19,9 @@ workflow STEP_05_ALIGNMENT { ch_bam = MINIMAP2.out.bam ch_sam_coverage = MINIMAP2.out.sam_coverage } - DIAMOND ( prokka_faa, - params.diamond_bank + diamond ) ch_m8 = DIAMOND.out.m8 diff --git a/subworkflows/shared.nf b/subworkflows/shared.nf index 1678a11..16a1393 100644 --- a/subworkflows/shared.nf +++ b/subworkflows/shared.nf @@ -7,6 +7,7 @@ workflow SHARED { take: reads assembly + diamond eggnog_db taxonomy @@ -37,7 +38,7 @@ workflow SHARED { ch_m8 = Channel.empty() ch_sam_coverage = Channel.empty() if ( !params.stop_at_clean && !params.stop_at_assembly && !params.stop_at_filtering && !params.stop_at_structural_annot ) { - S05_ALIGNMENT ( ch_contigs_and_reads, ch_prokka_faa ) + S05_ALIGNMENT ( ch_contigs_and_reads, ch_prokka_faa, diamond) ch_bam = S05_ALIGNMENT.out.bam ch_m8 = S05_ALIGNMENT.out.m8 ch_sam_coverage = S05_ALIGNMENT.out.sam_coverage -- GitLab From 59152ec8099be7b7095874a07f9317cf36d5408d Mon Sep 17 00:00:00 2001 From: Maina Vienne <maina.vienne@inrae.fr> Date: Fri, 22 Jul 2022 11:37:32 +0200 Subject: [PATCH 08/13] Use metadata: step 06 funct annot and step 07 
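A note on the diamond_bank change above: Channel.fromPath builds a queue channel, which is exhausted after feeding a single task, so every sample after the first would leave DIAMOND without its bank. Channel.value builds a value channel that can be read once per incoming sample. A minimal sketch of the difference, with invented process and file names (and assuming the dummy files exist):

    nextflow.enable.dsl = 2

    process ALIGN {
        input:
        tuple val(id), path(faa)
        path bank                  // bound once, reused for every sample

        output:
        stdout

        script:
        """
        echo "aligning ${id} against ${bank}"
        """
    }

    workflow {
        samples = Channel.of(['s1', file('s1.faa')], ['s2', file('s2.faa')])
        bank    = Channel.value(file('bank.dmnd'))  // value channel: both samples run
        // with Channel.fromPath('bank.dmnd') instead, only one sample would be aligned
        ALIGN(samples, bank)
    }

Exposing the bank as a workflow-level `diamond` emit, as the patch does, also keeps the file() resolution in one place instead of re-reading params.diamond_bank inside the subworkflow.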
From 59152ec8099be7b7095874a07f9317cf36d5408d Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Fri, 22 Jul 2022 11:37:32 +0200
Subject: [PATCH 08/13] Use metadata: step 06 funct annot and step 07 affi taxo

---
 main.nf                              |  2 ++
 modules/assign_taxonomy.nf           | 34 ++++++++++++++--------------
 modules/best_hits.nf                 |  6 ++---
 modules/cd_hit.nf                    | 14 ++++++------
 modules/eggnog_mapper.nf             | 10 ++++----
 modules/feature_counts.nf            | 16 ++++++-------
 modules/read_alignment.nf            |  2 +-
 subworkflows/06_functionnal_annot.nf | 10 ++++----
 subworkflows/07_taxonomic_affi.nf    |  6 ++---
 9 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/main.nf b/main.nf
index d29e7f2..a4e51fe 100644
--- a/main.nf
+++ b/main.nf
@@ -279,6 +279,8 @@ workflow {
     ch_v_eggnogmapper = Channel.empty()
 
     if ( params.type.toUpperCase() == "SR" ) {
+        ch_multiqc_config = file(params.sr_multiqc_config, checkIfExists: true)
+
         SR (
             ch_reads,
             ch_assembly,
diff --git a/modules/assign_taxonomy.nf b/modules/assign_taxonomy.nf
index b4b414d..b7b8403 100644
--- a/modules/assign_taxonomy.nf
+++ b/modules/assign_taxonomy.nf
@@ -1,25 +1,25 @@
 process ASSIGN_TAXONOMY {
-    tag "${sampleId}"
-    publishDir "${params.outdir}/07_taxo_affi/${sampleId}", mode: 'copy'
+    tag "${meta.id}"
+    publishDir "${params.outdir}/07_taxo_affi/${meta.id}", mode: 'copy'
     label 'PYTHON'
 
     input:
     tuple path(accession2taxid), path(new_taxdump)
-    tuple val(sampleId), path(m8), path(sam_coverage), path(prot_len)
+    tuple val(meta), path(m8), path(sam_coverage), path(prot_len)
 
     output:
-    tuple val(sampleId), path("${sampleId}.percontig.tsv"), emit: t_percontig
-    tuple val(sampleId), path("${sampleId}.pergene.tsv"), emit: t_pergene
-    tuple val(sampleId), path("${sampleId}.warn.tsv"), emit: t_warn
-    tuple val(sampleId), path("graphs"), emit: t_graphs
-    path "${sampleId}_quantif_percontig.tsv", emit: q_all
-    path "${sampleId}_quantif_percontig_by_superkingdom.tsv", emit: q_superkingdom
-    path "${sampleId}_quantif_percontig_by_phylum.tsv", emit: q_phylum
-    path "${sampleId}_quantif_percontig_by_order.tsv", emit: q_order
-    path "${sampleId}_quantif_percontig_by_class.tsv", emit: q_class
-    path "${sampleId}_quantif_percontig_by_family.tsv", emit: q_family
-    path "${sampleId}_quantif_percontig_by_genus.tsv", emit: q_genus
-    path "${sampleId}_quantif_percontig_by_species.tsv", emit: q_species
+    tuple val(meta.id), path("${meta.id}.percontig.tsv"), emit: t_percontig
+    tuple val(meta.id), path("${meta.id}.pergene.tsv"), emit: t_pergene
+    tuple val(meta.id), path("${meta.id}.warn.tsv"), emit: t_warn
+    tuple val(meta.id), path("graphs"), emit: t_graphs
+    path "${meta.id}_quantif_percontig.tsv", emit: q_all
+    path "${meta.id}_quantif_percontig_by_superkingdom.tsv", emit: q_superkingdom
+    path "${meta.id}_quantif_percontig_by_phylum.tsv", emit: q_phylum
+    path "${meta.id}_quantif_percontig_by_order.tsv", emit: q_order
+    path "${meta.id}_quantif_percontig_by_class.tsv", emit: q_class
+    path "${meta.id}_quantif_percontig_by_family.tsv", emit: q_family
+    path "${meta.id}_quantif_percontig_by_genus.tsv", emit: q_genus
+    path "${meta.id}_quantif_percontig_by_species.tsv", emit: q_species
     path "top_taxons_per_contig.tsv", emit: top_taxon_file
 
     script:
@@ -38,9 +38,9 @@ process ASSIGN_TAXONOMY {
     aln2taxaffi.py -a ${accession2taxid} --taxonomy \$new_taxdump_var \
-    -o ${sampleId} -b ${m8} --keep_only_best_aln \
+    -o ${meta.id} -b ${m8} --keep_only_best_aln \
     --query_length_file ${prot_len} -v --write_top_taxons
-    merge_contig_quantif_perlineage.py -c ${sampleId}.percontig.tsv -s ${sam_coverage} -o ${sampleId}_quantif_percontig
+    merge_contig_quantif_perlineage.py -c ${meta.id}.percontig.tsv -s ${sam_coverage} -o ${meta.id}_quantif_percontig
 
     new_taxdump_original=$new_taxdump
     if [ "\${new_taxdump_original#*.}" == "tar.gz" ]
diff --git a/modules/best_hits.nf b/modules/best_hits.nf
index 91660cc..7620953 100644
--- a/modules/best_hits.nf
+++ b/modules/best_hits.nf
@@ -2,13 +2,13 @@ process BEST_HITS {
     publishDir "${params.outdir}/06_func_annot/06_3_functional_annotation", mode: 'copy'
 
     input:
-    tuple val(sampleId), path(m8)
+    tuple val(meta), path(m8)
 
     output:
-    path "${sampleId}.best_hit", emit: best_hits
+    path "${meta.id}.best_hit", emit: best_hits
 
     script:
     """
-    filter_diamond_hits.py -o ${sampleId}.best_hit ${m8}
+    filter_diamond_hits.py -o ${meta.id}.best_hit ${m8}
     """
 }
\ No newline at end of file
diff --git a/modules/cd_hit.nf b/modules/cd_hit.nf
index 921fe4e..af6913c 100644
--- a/modules/cd_hit.nf
+++ b/modules/cd_hit.nf
@@ -1,20 +1,20 @@
 process INDIVIDUAL_CD_HIT {
-    tag "${sampleId}"
+    tag "${meta.id}"
     publishDir "${params.outdir}/06_func_annot/06_1_clustering", mode: 'copy'
     label 'CD_HIT'
 
     input:
-    tuple val(sampleId), path(ffn)
+    tuple val(meta), path(ffn)
     val pct_id
 
     output:
-    path("${sampleId}.cd-hit-est.${pct_id}.fasta"), emit: clstr_fasta
-    path("${sampleId}.cd-hit-est.${pct_id}.table_cluster_contigs.txt"), emit: clstr_table
+    path("${meta.id}.cd-hit-est.${pct_id}.fasta"), emit: clstr_fasta
+    path("${meta.id}.cd-hit-est.${pct_id}.table_cluster_contigs.txt"), emit: clstr_table
 
     script:
     """
-    cd-hit-est -c ${pct_id} -i ${ffn} -o ${sampleId}.cd-hit-est.${pct_id}.fasta -T ${task.cpus} -M ${task.mem} -d 150
-    cat ${sampleId}.cd-hit-est.${pct_id}.fasta.clstr | cd_hit_produce_table_clstr.py > ${sampleId}.cd-hit-est.${pct_id}.table_cluster_contigs.txt
+    cd-hit-est -c ${pct_id} -i ${ffn} -o ${meta.id}.cd-hit-est.${pct_id}.fasta -T ${task.cpus} -M ${task.mem} -d 150
+    cat ${meta.id}.cd-hit-est.${pct_id}.fasta.clstr | cd_hit_produce_table_clstr.py > ${meta.id}.cd-hit-est.${pct_id}.table_cluster_contigs.txt
     """
 }
@@ -48,7 +48,7 @@ process GLOBAL_CD_HIT {
 
 workflow CD_HIT {
   take:
-ch_assembly // channel: [ val(sampleid), path(assemblyfasta) ]
+ch_assembly // channel: [ val(meta.id), path(assemblyfasta) ]
 ch_percentage_identity // channel: val
 
   main:
diff --git a/modules/eggnog_mapper.nf b/modules/eggnog_mapper.nf
index cee277e..c3521ee 100644
--- a/modules/eggnog_mapper.nf
+++ b/modules/eggnog_mapper.nf
@@ -1,20 +1,20 @@
 process EGGNOG_MAPPER {
     publishDir "${params.outdir}/06_func_annot/06_3_functional_annotation", mode: 'copy'
-    tag "${sampleId}"
+    tag "${meta.id}"
     label 'EGGNOG'
 
     input:
-    tuple val(sampleId), path(faa)
+    tuple val(meta), path(faa)
     path db
 
     output:
-    path "${sampleId}_diamond_one2one.emapper.seed_orthologs", emit: seed
-    path "${sampleId}_diamond_one2one.emapper.annotations", emit: annot
+    path "${meta.id}_diamond_one2one.emapper.seed_orthologs", emit: seed
+    path "${meta.id}_diamond_one2one.emapper.annotations", emit: annot
     path 'v_eggnogmapper.txt', emit: version
 
     script:
     """
-    /eggnog-mapper-2.0.4-rf1/emapper.py -i ${faa} --output ${sampleId}_diamond_one2one -m diamond --cpu ${task.cpus} --data_dir ${db} --target_orthologs one2one
+    /eggnog-mapper-2.0.4-rf1/emapper.py -i ${faa} --output ${meta.id}_diamond_one2one -m diamond --cpu ${task.cpus} --data_dir ${db} --target_orthologs one2one
     /eggnog-mapper-2.0.4-rf1/emapper.py -v &> v_eggnogmapper.txt
     """
 }
\ No newline at end of file
diff --git a/modules/feature_counts.nf b/modules/feature_counts.nf
index 9afb6d8..96366f7 100644
--- a/modules/feature_counts.nf
+++ b/modules/feature_counts.nf
@@ -1,20 +1,20 @@
 // Quantification of reads on each gene in each sample.
 process FEATURE_COUNTS {
-    tag "${sampleId}"
+    tag "${meta.id}"
     label 'QUANTIFICATION'
     publishDir "${params.outdir}/06_func_annot/06_2_quantification", mode: 'copy'
 
     input:
-    tuple val(sampleId), file(gff_prokka), file(bam), file(bam_index)
+    tuple val(meta), file(gff_prokka), file(bam), file(bam_index)
 
     output:
-    path "${sampleId}.featureCounts.tsv", emit: count_table
-    path "${sampleId}.featureCounts.tsv.summary", emit: summary
-    path "${sampleId}.featureCounts.stdout"
+    path "${meta.id}.featureCounts.tsv", emit: count_table
+    path "${meta.id}.featureCounts.tsv.summary", emit: summary
+    path "${meta.id}.featureCounts.stdout"
 
     script:
     """
-    featureCounts -T ${task.cpus} -p -O -t gene -g ID -a ${gff_prokka} -o ${sampleId}.featureCounts.tsv ${bam} &> ${sampleId}.featureCounts.stdout
+    featureCounts -T ${task.cpus} -p -O -t gene -g ID -a ${gff_prokka} -o ${meta.id}.featureCounts.tsv ${bam} &> ${meta.id}.featureCounts.stdout
     """
 }
@@ -42,8 +42,8 @@ process QUANTIFICATION_TABLE {
 
 workflow QUANTIFICATION {
   take:
-    ch_gff // channel: [ val(sampleid), path(gff) ]
-    ch_bam // channel: [ val(sampleid), path(bam), path(bam_index) ]
+    ch_gff // channel: [ val(meta), path(gff) ]
+    ch_bam // channel: [ val(meta), path(bam), path(bam_index) ]
     ch_individual_clstr_table
     ch_global_clstr_table
 
diff --git a/modules/read_alignment.nf b/modules/read_alignment.nf
index 76b42fe..49bcfd1 100644
--- a/modules/read_alignment.nf
+++ b/modules/read_alignment.nf
@@ -7,7 +7,7 @@ process BWA_MEM {
 
     output:
     tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam
-    tuple val(meta.id), path("${meta.id}_coverage.tsv"), emit: sam_coverage
+    tuple val(meta), path("${meta.id}_coverage.tsv"), emit: sam_coverage
     path "${meta.id}*"
 
diff --git a/subworkflows/06_functionnal_annot.nf b/subworkflows/06_functionnal_annot.nf
index 4d205e2..621dcc4 100644
--- a/subworkflows/06_functionnal_annot.nf
+++ b/subworkflows/06_functionnal_annot.nf
@@ -10,11 +10,11 @@ include { FUNCTIONAL_ANNOT_TABLE } from '../modules/functional_annot_table'
 
 workflow STEP_06_FUNC_ANNOT {
   take:
-    ffn // channel: [ val(sampleid), path(ffn) ]
-    faa // channel: [ val(sampleid), path(faa) ]
-    gff // channel: [ val(sampleid), path(gff) ]
-    bam // channel: [ val(sampleid), path(bam), path(bam_index) ]
-    m8 // channel: [ val(sampleId), path(diamond_file) ]
+    ffn // channel: [ val(meta), path(ffn) ]
+    faa // channel: [ val(meta), path(faa) ]
+    gff // channel: [ val(meta), path(gff) ]
+    bam // channel: [ val(meta), path(bam), path(bam_index) ]
+    m8 // channel: [ val(meta), path(diamond_file) ]
     eggnog_db
 
   main:
diff --git a/subworkflows/07_taxonomic_affi.nf b/subworkflows/07_taxonomic_affi.nf
index cbcc534..9382704 100644
--- a/subworkflows/07_taxonomic_affi.nf
+++ b/subworkflows/07_taxonomic_affi.nf
@@ -4,9 +4,9 @@ include { QUANTIF_AND_TAXONOMIC_TABLE_CONTIGS } from '../modules/quantif_and_tax
 workflow STEP_07_TAXO_AFFI {
   take:
     taxonomy
-    diamond_result // channel: [ val(sampleId), path(diamond_file) ]
-    sam_coverage // channel: [ val(sampleId), path(samtools coverage) ]
-    prot_length // channel: [ val(sampleId), path(prot_length) ]
+    diamond_result // channel: [ val(meta), path(diamond_file) ]
+    sam_coverage // channel: [ val(meta), path(samtools coverage) ]
+    prot_length // channel: [ val(meta), path(prot_length) ]
 
   main:
     ch_assign_taxo_input = diamond_result.join(sam_coverage, remainder: true)
                                          .join(prot_length, remainder: true)
-- 
GitLab
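Patch 08 continues the move from a bare sampleId string to an nf-core-style meta map, so extra fields such as flowcell can travel with the data. One detail worth spelling out, since the series itself trips on it (the val(meta.id) leftover is fixed here for BWA_MEM, and in the next patch for MINIMAP2 and MINIMAP2_FILTERING): channels that are later combined with join must all be keyed on the same object. Two Groovy maps with equal entries compare equal, so the full meta map is a safe key, but a channel keyed on the meta.id string never matches one keyed on the map. A sketch with invented data:

    nextflow.enable.dsl = 2

    workflow {
        ch_coverage = Channel.of([[id: 's1', sample: 's1'], file('s1_coverage.tsv')])
        ch_protlen  = Channel.of([[id: 's1', sample: 's1'], file('s1_prot.len')])

        // join matches on the first tuple element; equal maps compare equal,
        // so keying every output on val(meta) lets STEP_07 pair its inputs
        ch_coverage.join(ch_protlen).view()

        // had one process emitted val(meta.id) instead, its key would be the
        // String 's1', which never equals the map key, and the join above
        // would silently emit nothing
    }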
From bad7f9f6b0a3456c341a714845d6a83e55718e40 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Fri, 22 Jul 2022 15:11:47 +0200
Subject: [PATCH 09/13] Use metadata: HiFi

---
 main.nf                     |  5 -----
 modules/host_filter.nf      |  4 ++--
 modules/read_alignment.nf   |  4 ++--
 subworkflows/01_clean_qc.nf | 13 +++----------
 subworkflows/hifi_reads.nf  | 12 +-----------
 5 files changed, 8 insertions(+), 30 deletions(-)

diff --git a/main.nf b/main.nf
index a4e51fe..a7efe0c 100644
--- a/main.nf
+++ b/main.nf
@@ -309,11 +309,6 @@ workflow {
     else if ( params.type.toUpperCase() == "HIFI" ) {
         ch_multiqc_config = file(params.hifi_multiqc_config, checkIfExists: true)
 
-        ch_inputs.map { item -> [ item.sample, item.assembly ] } // [sample, assembly]
-                 .set { ch_assembly }
-
-        ch_inputs.map { item -> [ item.sample, item.fastq_1 ] } // [sample, reads]
-                 .set { ch_reads }
 
         HIFI_READS (
             ch_reads,
diff --git a/modules/host_filter.nf b/modules/host_filter.nf
index 6da4d1b..5577387 100644
--- a/modules/host_filter.nf
+++ b/modules/host_filter.nf
@@ -45,11 +45,11 @@ process HOST_FILTER_HIFI {
         else null}
 
     input:
-    tuple val(meta.id), path(reads)
+    tuple val(meta), path(reads)
     path fasta
 
     output:
-    tuple val(meta.id), path("cleaned_${meta.id}.fastq.gz"), emit: reads
+    tuple val(meta), path("cleaned_${meta.id}.fastq.gz"), emit: reads
     path "host_filter_flagstat/${meta.id}.host_filter.flagstat", emit: hf_report
     path "${meta.id}.no_filter.flagstat", emit: nf_report
 
diff --git a/modules/read_alignment.nf b/modules/read_alignment.nf
index 49bcfd1..bcb34fd 100644
--- a/modules/read_alignment.nf
+++ b/modules/read_alignment.nf
@@ -34,7 +34,7 @@ process MINIMAP2 {
 
     output:
     tuple val(meta), path("${meta.id}.sort.bam"), path("${meta.id}.sort.bam.bai"), emit: bam
-    tuple val(meta.id), path("${meta.id}_coverage.tsv"), emit: sam_coverage
+    tuple val(meta), path("${meta.id}_coverage.tsv"), emit: sam_coverage
     path "${meta.id}*"
 
     script:
@@ -60,7 +60,7 @@ process MINIMAP2_FILTERING {
     tuple val(meta), path(assembly), path(reads)
 
     output:
-    tuple val(meta.id), path("${meta.id}.idxstats"), emit: sam_idxstat
+    tuple val(meta), path("${meta.id}.idxstats"), emit: sam_idxstat
     path "${meta.id}.flagstat", emit: sam_flagstat
     path "${meta.id}*"
 
diff --git a/subworkflows/01_clean_qc.nf b/subworkflows/01_clean_qc.nf
index c7dfd94..044d532 100644
--- a/subworkflows/01_clean_qc.nf
+++ b/subworkflows/01_clean_qc.nf
@@ -90,11 +90,8 @@ workflow STEP_01_CLEAN_QC_HIFI {
     kaiju_db
 
   main:
-    outdir_raw = '01_clean_qc/01_2_qc/fastqc_raw'
-    raw_reads.map {it -> [it[0], it[1], outdir_raw]}
-             .set { ch_raw_reads_qc }
-
-    FASTQC_HIFI_RAW(ch_raw_reads_qc)
+
+    FASTQC_HIFI_RAW(raw_reads)
     ch_fastqc_raw_report = FASTQC_HIFI_RAW.out.zip
 
     if (!params.skip_host_filter) {
@@ -106,11 +103,7 @@ workflow STEP_01_CLEAN_QC_HIFI {
         ch_before_filter_report = HOST_FILTER_HIFI.out.nf_report
         ch_after_filter_report = HOST_FILTER_HIFI.out.hf_report
 
-        outdir_cleaned = '01_clean_qc/01_2_qc/fastqc_cleaned'
-        ch_preprocessed_reads.map {it -> [it[0], it[1], outdir_cleaned]}
-                             .set { ch_preprocessed_reads_qc }
-        FASTQC_HIFI(ch_preprocessed_reads_qc)
+        FASTQC_HIFI(ch_preprocessed_reads)
         ch_fastqc_clean_report = FASTQC_HIFI.out.zip
     }
 
diff --git a/subworkflows/hifi_reads.nf b/subworkflows/hifi_reads.nf
index e9215b3..2fe7f6f 100644
--- a/subworkflows/hifi_reads.nf
+++ b/subworkflows/hifi_reads.nf
@@ -3,7 +3,6 @@ include { STEP_03_ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering'
 include { STEP_01_CLEAN_QC_HIFI} from './01_clean_qc'
 include { STEP_02_ASSEMBLY_HIFI as S02_ASSEMBLY } from './02_assembly'
 include { MINIMAP2_FILTERING } from '../modules/read_alignment'
-include { QUAST as S04_FILTERED_QUAST } from '../modules/metaquast'
 
 workflow HIFI_READS {
   take:
@@ -56,9 +55,6 @@ workflow HIFI_READS {
     }
 
-    // // stat on assemblies
-    // ASSEMBLY_QUAST( ch_assembly, 'ASSEMBLY' )
-    // ch_quast_report = ASSEMBLY_QUAST.out.report
 
     // FILTERING
     if ( !params.skip_filtering && !params.stop_at_clean && !params.stop_at_assembly) {
@@ -87,11 +83,7 @@ workflow HIFI_READS {
         )
         ch_assembly = S03_FILTERING.out.selected
-
-        S04_FILTERED_QUAST( ch_assembly,'FILTERING' )
-
-        ch_assembly_filtered_report = S04_FILTERED_QUAST.out.report
-
+        ch_assembly_filtered_report = S03_FILTERING.out.report
     }
 
@@ -110,7 +102,5 @@ workflow HIFI_READS {
     assembly_filtered_report = ch_assembly_filtered_report
     kaiju_report = ch_kaiju_report
-
-
 }
-- 
GitLab
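Patch 10 below gives Kaiju the input shape that patch 07 already gave bwa-mem2: the two FASTQs of a pair arrive as a single path(reads) list and are addressed as reads[0]/reads[1] inside the script block. A stripped-down sketch of the convention, with invented sample data (the workflow only runs if the dummy FASTQ paths exist):

    nextflow.enable.dsl = 2

    process SHOW_PAIR {
        tag "${meta.id}"

        input:
        tuple val(meta), path(reads)

        output:
        stdout

        script:
        // for paired-end data, reads is a two-element list: R1 then R2
        """
        echo "${meta.id}: R1=${reads[0]} R2=${reads[1]}"
        """
    }

    workflow {
        Channel.of([[id: 's1'], [file('s1_R1.fastq.gz'), file('s1_R2.fastq.gz')]])
            .set { ch_reads }
        SHOW_PAIR(ch_reads)
    }

Keeping the pair in one list means single-end and paired-end samples can share a channel shape, at the cost of positional indexing in every script.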
From 7cfe386f4b451e83d46138da79a3b19809454110 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Fri, 22 Jul 2022 15:31:43 +0200
Subject: [PATCH 10/13] Use metadata: kaiju

---
 modules/kaiju.nf | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/modules/kaiju.nf b/modules/kaiju.nf
index c5db60d..2a0dadd 100644
--- a/modules/kaiju.nf
+++ b/modules/kaiju.nf
@@ -1,18 +1,18 @@
 taxon_levels = "phylum class order family genus species"
 
 process KAIJU {
-    tag "${sampleId}"
+    tag "${meta.id}"
     label "KAIJU"
     publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*_kaiju.out.gz'
 
     input:
-    tuple val(sampleId), path(read1), path(read2)
+    tuple val(meta), path(reads)
     tuple path(nodes), path(fmi), path(names)
 
     output:
-    path "${sampleId}_kaiju.out.gz", emit: kaiju_result
-    path "${sampleId}.krona", emit: krona_tab_file
+    path "${meta.id}_kaiju.out.gz", emit: kaiju_result
+    path "${meta.id}.krona", emit: krona_tab_file
     path "*.summary_*", emit: k_all
     path "*.summary_species", emit: k_species
     path "*.summary_genus", emit: k_genus
@@ -23,34 +23,34 @@ process KAIJU {
     script:
     """
-    kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${read1} -j ${read2} -o ${sampleId}_kaiju_MEM_verbose.out -a mem -v
+    kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${reads[0]} -j ${reads[1]} -o ${meta.id}_kaiju_MEM_verbose.out -a mem -v
 
-    kaiju2krona -t ${nodes} -n ${names} -i ${sampleId}_kaiju_MEM_verbose.out -o ${sampleId}.krona -u
+    kaiju2krona -t ${nodes} -n ${names} -i ${meta.id}_kaiju_MEM_verbose.out -o ${meta.id}.krona -u
 
     for i in ${taxon_levels} ; do
-        kaiju2table -t ${nodes} -n ${names} -r \$i -o ${sampleId}_kaiju_MEM.out.summary_\$i ${sampleId}_kaiju_MEM_verbose.out
+        kaiju2table -t ${nodes} -n ${names} -r \$i -o ${meta.id}_kaiju_MEM.out.summary_\$i ${meta.id}_kaiju_MEM_verbose.out
     done
 
-    grep -v U ${sampleId}_kaiju_MEM_verbose.out | gzip > ${sampleId}_kaiju.out.gz
+    grep -v U ${meta.id}_kaiju_MEM_verbose.out | gzip > ${meta.id}_kaiju.out.gz
 
-    rm ${sampleId}_kaiju_MEM_verbose.out
+    rm ${meta.id}_kaiju_MEM_verbose.out
     """
 }
 
 process KAIJU_HIFI {
-    tag "${sampleId}"
+    tag "${meta.id}"
     label "KAIJU"
     publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*_kaiju.out.gz'
 
     input:
-    tuple val(sampleId), path(reads)
+    tuple val(meta.id), path(reads)
     tuple path(nodes), path(fmi), path(names)
 
     output:
-    path "${sampleId}_kaiju.out.gz", emit: kaiju_result
-    path "${sampleId}.krona", emit: krona_tab_file
+    path "${meta.id}_kaiju.out.gz", emit: kaiju_result
+    path "${meta.id}.krona", emit: krona_tab_file
     path "*.summary_*", emit: k_all
     path "*.summary_species", emit: k_species
     path "*.summary_genus", emit: k_genus
@@ -61,17 +61,17 @@ process KAIJU_HIFI {
     script:
     """
-    kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${reads} -o ${sampleId}_kaiju_MEM_verbose.out -a mem -v
+    kaiju -z ${task.cpus} -t ${nodes} -f ${fmi} -i ${reads} -o ${meta.id}_kaiju_MEM_verbose.out -a mem -v
 
-    kaiju2krona -t ${nodes} -n ${names} -i ${sampleId}_kaiju_MEM_verbose.out -o ${sampleId}.krona -u
+    kaiju2krona -t ${nodes} -n ${names} -i ${meta.id}_kaiju_MEM_verbose.out -o ${meta.id}.krona -u
 
     for i in ${taxon_levels} ; do
-        kaiju2table -t ${nodes} -n ${names} -r \$i -o ${sampleId}_kaiju_MEM.out.summary_\$i ${sampleId}_kaiju_MEM_verbose.out
+        kaiju2table -t ${nodes} -n ${names} -r \$i -o ${meta.id}_kaiju_MEM.out.summary_\$i ${meta.id}_kaiju_MEM_verbose.out
     done
 
-    grep -v U ${sampleId}_kaiju_MEM_verbose.out | gzip > ${sampleId}_kaiju.out.gz
-    rm ${sampleId}_kaiju_MEM_verbose.out
+    grep -v U ${meta.id}_kaiju_MEM_verbose.out | gzip > ${meta.id}_kaiju.out.gz
+    rm ${meta.id}_kaiju_MEM_verbose.out
     """
 }
-- 
GitLab

From 99dcaed5cd8b1ee33b8ad9bf7030643d4c287267 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Mon, 25 Jul 2022 15:36:01 +0200
Subject: [PATCH 11/13] merge fastq hifi and fix kaiju

---
 modules/kaiju.nf           |  2 +-
 subworkflows/hifi_reads.nf | 39 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/modules/kaiju.nf b/modules/kaiju.nf
index 2a0dadd..a21dd81 100644
--- a/modules/kaiju.nf
+++ b/modules/kaiju.nf
@@ -45,7 +45,7 @@ process KAIJU_HIFI {
     publishDir "${params.outdir}/01_clean_qc/01_3_taxonomic_affiliation_reads", mode: 'copy', pattern: '*_kaiju.out.gz'
 
     input:
-    tuple val(meta.id), path(reads)
+    tuple val(meta), path(reads)
     tuple path(nodes), path(fmi), path(names)
 
     output:
diff --git a/subworkflows/hifi_reads.nf b/subworkflows/hifi_reads.nf
index 2fe7f6f..118409b 100644
--- a/subworkflows/hifi_reads.nf
+++ b/subworkflows/hifi_reads.nf
@@ -3,6 +3,7 @@ include { STEP_03_ASSEMBLY_FILTER as S03_FILTERING } from './03_filtering'
 include { STEP_01_CLEAN_QC_HIFI} from './01_clean_qc'
 include { STEP_02_ASSEMBLY_HIFI as S02_ASSEMBLY } from './02_assembly'
 include { MINIMAP2_FILTERING } from '../modules/read_alignment'
+include { MERGE_FASTQ } from '../modules/merge_fastq.nf'
 
 workflow HIFI_READS {
   take:
@@ -49,6 +50,44 @@ workflow HIFI_READS {
 
     // ASSEMBLY
     if (!params.stop_at_clean ) {
+        //////////////////
+        // Manage Flowcell
+        //////////////////
+        ch_reads_tmp = ch_preprocessed_reads
+            .map {
+                meta, fastq ->
+                    [ meta.sample, meta, fastq ]
+            }
+            .groupTuple(by: [0])
+            .branch {
+                id, meta, fastq ->
+                    single : fastq.size() == 1
+                        return [[id:meta.sample.unique().join(),
+                                 sample:meta.sample.unique().join(),
+                                 flowcell:meta.flowcell.join("_"),
+                                 group:meta.group.unique().join(),
+                                 assembly:meta.assembly.unique().join(),
+                                 type:meta.type.unique().join(),], fastq.flatten() ]
+                    multiple: fastq.size() > 1
+                        return [[id:meta.sample.unique().join(),
+                                 sample:meta.sample.unique().join(),
+                                 flowcell:meta.flowcell.join("_"),
+                                 group:meta.group.unique().join(),
+                                 assembly:meta.assembly.unique().join(),
+                                 type:meta.type.unique().join(),], fastq.flatten() ]
+            }
+
+        MERGE_FASTQ (
+            ch_reads_tmp.multiple
+        )
+        .reads
+        .mix(ch_reads_tmp.single)
+        .set{ch_preprocessed_reads}
+
+        /////////////////////
+        // End manage Flowcell
+        /////////////////////
         S02_ASSEMBLY ( ch_preprocessed_reads, ch_assembly, has_assembly, assembly_tool )
         ch_assembly = S02_ASSEMBLY.out.assembly
         ch_assembly_report = S02_ASSEMBLY.out.assembly_report
-- 
GitLab
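The flowcell handling introduced in patch 11 is the payoff of the meta refactor: reads are regrouped per sample, samples sequenced on several flowcells are routed through MERGE_FASTQ, and single-flowcell samples pass straight through. Note that both branch arms currently rebuild the same merged meta, so the branch only decides routing. The regroup-and-route pattern in isolation, with invented samples (runnable as-is, since view does not stage files):

    nextflow.enable.dsl = 2

    workflow {
        Channel.of(
            [[sample: 's1', flowcell: 'fcA'], file('s1_fcA.fastq.gz')],
            [[sample: 's1', flowcell: 'fcB'], file('s1_fcB.fastq.gz')],
            [[sample: 's2', flowcell: 'fcA'], file('s2_fcA.fastq.gz')]
        )
        .map { meta, fastq -> [ meta.sample, meta, fastq ] }  // key by sample
        .groupTuple(by: [0])                                  // one item per sample
        .branch { id, metas, fastqs ->
            single:   fastqs.size() == 1                      // one flowcell: pass through
            multiple: fastqs.size() > 1                       // several flowcells: merge
        }
        .set { ch_by_sample }

        ch_by_sample.multiple.view { id, metas, fastqs -> "merge ${id}: ${fastqs}" }
        ch_by_sample.single.view   { id, metas, fastqs -> "keep ${id} as is" }
    }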
From e92affd2459bba33965e179fb526b6121b7c8560 Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Mon, 25 Jul 2022 16:07:07 +0200
Subject: [PATCH 12/13] publish merged fastq in results folder

---
 modules/merge_fastq.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/merge_fastq.nf b/modules/merge_fastq.nf
index 4e5fe57..c81857f 100644
--- a/modules/merge_fastq.nf
+++ b/modules/merge_fastq.nf
@@ -1,6 +1,7 @@
 process MERGE_FASTQ {
     tag "$meta.id"
-    label 'MERGE_FASTQ'
+    publishDir "${params.outdir}/02_assembly/merged_fastq", mode: 'copy'
+    label 'MERGE_FASTQ'
 
     input:
     tuple val(meta), path(reads)
-- 
GitLab

From 70c62c1c92369ec221337818befbcc52108702ec Mon Sep 17 00:00:00 2001
From: Maina Vienne <maina.vienne@inrae.fr>
Date: Mon, 25 Jul 2022 17:07:29 +0200
Subject: [PATCH 13/13] fix merged fastq

---
 modules/merge_fastq.nf | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/modules/merge_fastq.nf b/modules/merge_fastq.nf
index c81857f..888e67e 100644
--- a/modules/merge_fastq.nf
+++ b/modules/merge_fastq.nf
@@ -15,7 +15,9 @@ process MERGE_FASTQ {
     def read2 = []
     readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v }
     """
-    cat ${read1.join(' ')} > ${meta.sample}_1.merged.fastq.gz
-    cat ${read2.join(' ')} > ${meta.sample}_2.merged.fastq.gz
+    zcat ${read1.join(' ')} > ${meta.sample}_1.merged.fastq
+    zcat ${read2.join(' ')} > ${meta.sample}_2.merged.fastq
+    gzip ${meta.sample}_1.merged.fastq
+    gzip ${meta.sample}_2.merged.fastq
     """
 }
\ No newline at end of file
-- 
GitLab
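On the final fix: cat applied to several .gz files produces a multi-member gzip stream, which is valid per the gzip format but is occasionally mishandled by tools that expect a single member, hence the switch to decompress-and-recompress. Two further observations, offered as a possible variant rather than the project's code: the intermediate uncompressed FASTQ consumes scratch space that a pipe would avoid, and the even/odd split of readList assumes the staged file list strictly alternates R1/R2. A hedged sketch of the piped form:

    process MERGE_FASTQ_ALT {
        tag "$meta.id"

        input:
        tuple val(meta), path(reads)

        output:
        tuple val(meta), path("*.merged.fastq.gz"), emit: reads

        script:
        def read1 = []
        def read2 = []
        // assumes the incoming list strictly alternates R1, R2, R1, R2, ...
        reads.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v }
        """
        zcat ${read1.join(' ')} | gzip > ${meta.sample}_1.merged.fastq.gz
        zcat ${read2.join(' ')} | gzip > ${meta.sample}_2.merged.fastq.gz
        """
    }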