diff --git a/docs/source/output.md b/docs/source/output.md index 9c7562213f6e8cd698c461bd416377a3d05c97d7..fc4ee22c9ec069b11c0f10ca060a3e202682fc3f 100644 --- a/docs/source/output.md +++ b/docs/source/output.md @@ -267,7 +267,8 @@ If you want to make further analysis about intra-population genetic diversity (m | File | Description | | ----------------------- | --------------------------------------- | -| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. | +| `gtdbtk.bac120.summary.tsv` | Taxonomic classifications of bacteria provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. | +| `gtdbtk.ar53.summary.tsv` | Taxonomic classifications of archaea provided by GTDB-Tk. One line = one bin id (1st column, `user_genome`), <br /> its taxonomical classification based on the closest reference genome from the GTDB-Tk database (2nd column, `classification`), <br /> the accession number of the closest reference genome (3rd column, `closest_genome_reference`). <br /> Please see GTDB-Tk documentation [here](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html) for information on additional columns. | #### 4. 
08_4_mapping_on_final_bins diff --git a/modules/gtdbtk.nf b/modules/gtdbtk.nf index 8ec0172490a564001520b620aaef65e45f81680e..f820a006d28e1639b117f39b1a2ecb5049384b14 100644 --- a/modules/gtdbtk.nf +++ b/modules/gtdbtk.nf @@ -8,8 +8,9 @@ process GTDBTK { val mash_db output: - path "gtdbtk.bac120.summary.tsv*", emit: gtdbtk_affiliations_predictions - path "v_gtdbtk.txt", emit: v_gtdbtk + path "gtdbtk.bac120.summary.tsv*" , emit : gtdbtk_affiliations_predictions_bact + path "gtdbtk.ar53.summary.tsv*" , emit : gtdbtk_affiliations_predictions_arch + path "v_gtdbtk.txt" ,emit : v_gtdbtk script: """ @@ -18,5 +19,14 @@ process GTDBTK { gtdbtk classify_wf --genome_dir $bins_drep -x fa --out_dir ./ --mash_db $mash_db --pplacer_cpus ${task.cpus} --cpus ${task.cpus} echo \$(gtdbtk -h 2>&1) &> v_gtdbtk.txt + + if [ ! -f ./gtdbtk.bac120.summary.tsv ]; then + touch ./gtdbtk.bac120.summary.tsv + fi + + if [ ! -f ./gtdbtk.ar53.summary.tsv ]; then + touch ./gtdbtk.ar53.summary.tsv + fi + """ } \ No newline at end of file diff --git a/modules/sum_up_bins_informations.nf b/modules/sum_up_bins_informations.nf index 042d2dabbd71bad304e7d347889a999e35c691b2..e649b8249fb9a3260b84c6f5cafd4566dd830bbd 100644 --- a/modules/sum_up_bins_informations.nf +++ b/modules/sum_up_bins_informations.nf @@ -4,30 +4,47 @@ process GENOMES_ABUNDANCES_PER_SAMPLE { input: path coverage_files path flagstats_files - val(bins_folder) + val bins_folder path genomes_informations - path affiliations_predictions + path affiliations_predictions_arch + path affiliations_predictions_bact path heatmap_header_mqc path table_header_mqc output: - path "genomes_abundances.tsv" , emit: genomes_abundances - tuple path("stats/genomes_abundances_mqc.tsv"), path("stats/genomes_checkm_mqc.json"), path("stats/bins_general_stats_mqc.tsv"), emit: report + path "genomes_abundances_arch.tsv" , emit: genomes_abundances_arch + path "genomes_abundances_bact.tsv" , emit: genomes_abundances_bact + tuple 
path("stats/genomes_abundances_mqc_arch.tsv"), path("stats/genomes_checkm_mqc_arch.json"), path("stats/bins_general_stats_mqc_arch.tsv"), emit: report_arch + tuple path("stats/genomes_abundances_mqc_bact.tsv"), path("stats/genomes_checkm_mqc_bact.json"), path("stats/bins_general_stats_mqc_bact.tsv"), emit: report_bact script: """ mkdir -p stats - bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \ - --list_of_flagstats_files ${flagstats_files} --affiliations_predictions ${affiliations_predictions} \ - --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \ - --output_file genomes_abundances.tsv --report_file stats/genomes_abundances_mqc.tsv \ - --checkm_file stats/genomes_checkm_mqc.json --table_file stats/bins_general_stats_mqc.tsv - cat ${table_header_mqc} > stats/tmp.txt && cat stats/bins_general_stats_mqc.tsv >> stats/tmp.txt \ - && mv stats/tmp.txt stats/bins_general_stats_mqc.tsv - - cat ${heatmap_header_mqc} > stats/tmp.txt && cat stats/genomes_abundances_mqc.tsv >> stats/tmp.txt \ - && mv stats/tmp.txt stats/genomes_abundances_mqc.tsv + type=("bact" "arch") + + for i in "\${type[@]}"; + do + if [ "\$i" == "bact" ]; then + var_name=${affiliations_predictions_bact} + else + var_name=${affiliations_predictions_arch} + fi + + bins_per_sample_summarize.py --list_of_coverage_files ${coverage_files} \\ + --list_of_flagstats_files ${flagstats_files} --affiliations_predictions \$var_name \\ + --bins_folder ${bins_folder} --genomes_informations ${genomes_informations} \\ + --output_file genomes_abundances_\${i}.tsv --report_file stats/genomes_abundances_mqc_\${i}.tsv \\ + --checkm_file stats/genomes_checkm_mqc_\${i}.json --table_file stats/bins_general_stats_mqc_\${i}.tsv + + cat ${table_header_mqc} > stats/tmp_\${i}.txt && cat stats/bins_general_stats_mqc_\${i}.tsv >> stats/tmp_\${i}.txt \\ + && mv stats/tmp_\${i}.txt stats/bins_general_stats_mqc_\${i}.tsv + + cat ${heatmap_header_mqc} > stats/tmp_\${i}.txt && cat 
stats/genomes_abundances_mqc_\${i}.tsv >> stats/tmp_\${i}.txt \\ + && mv stats/tmp_\${i}.txt stats/genomes_abundances_mqc_\${i}.tsv + + done + """ } diff --git a/subworkflows/08_binning.nf b/subworkflows/08_binning.nf index 4573dc22ff7e48301ed99c641ed8c976fb826a11..6bcc6734b65a42f46ea95fce6ceb4b33f9eac494 100644 --- a/subworkflows/08_binning.nf +++ b/subworkflows/08_binning.nf @@ -171,7 +171,8 @@ workflow STEP_08_BINNING { ch_bins_drep = Channel.empty() ch_bam_bins = Channel.empty() ch_reads_fna = Channel.empty() - ch_gtdbtk_affi = Channel.empty() + ch_gtdbtk_affi_arch = Channel.empty() + ch_gtdbtk_affi_bact = Channel.empty() ch_drep_stats = Channel.empty() ch_bins_assembly = ch_bins_set.multiple.join(ch_assembly) @@ -225,7 +226,8 @@ workflow STEP_08_BINNING { GTDBTK(ch_bins_drep, gtdbtk_db, mash_db) ch_gtdbtk_v = GTDBTK.out.v_gtdbtk - ch_gtdbtk_affi = GTDBTK.out.gtdbtk_affiliations_predictions + ch_gtdbtk_affi_arch = GTDBTK.out.gtdbtk_affiliations_predictions_arch + ch_gtdbtk_affi_bact = GTDBTK.out.gtdbtk_affiliations_predictions_bact ///////////////////////////// ////GENOMES ABUNDANCES @@ -254,7 +256,7 @@ workflow STEP_08_BINNING { ch_collect_flagstats = GET_ALIGNMENT_METRICS.out.sam_flagstat.collect() GENOMES_ABUNDANCES_PER_SAMPLE(ch_collect_coverages, ch_collect_flagstats, - ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi , + ch_bins_drep, ch_drep_stats , ch_gtdbtk_affi_arch , ch_gtdbtk_affi_bact, ch_heatmap_header_multiqc, ch_table_header_multiqc) ch_bins_abundances_report = GENOMES_ABUNDANCES_PER_SAMPLE.out.report