Skip to content
Snippets Groups Projects
Commit bcbd5705 authored by Alexis Mergez's avatar Alexis Mergez
Browse files

Added comments

parent 301e5ab1
No related branches found
No related tags found
1 merge request!11v1.6
Pipeline #178650 passed with warnings
"""
Input statistics script for Pan1c workflow
Given a fasta input made for pggb, computes stats regarding the complexity of the sequences
Given a fasta input made for pggb (data/chrInputs), it computes statistics.
@author: alexis.mergez@inrae.fr
@version: 1.0
......@@ -47,48 +47,73 @@ arg_parser.add_argument(
)
args = arg_parser.parse_args()
## Toolbox
## Main script
"""
Sequence dictionnary :
- key : (Chromosome name (from filename), sequence id)
- value : sequence
"""
seqDict = {}
# Parsing fasta files
for filename in args.fastafiles:
# Getting chromosome name from fasta filename
chrName = os.path.basename(filename).split(".fa.gz")[0]
# Reading .fa.gz file and adding records to seqDict
# Reading bgzip fasta file and adding records to seqDict
with gzip.open(filename, "rt") as handle:
# Parsing fasta sequences using SeqIO
sequences = SeqIO.parse(handle, "fasta")
# Adding sequence to sequence dictionnary
for record in sequences:
seqDict[(chrName, record.id)] = record.seq
# Reading available chromosomes for seqDict keys
chromList = np.unique([x for x, y in seqDict.keys()])
"""
Data dictionnary :
- key : (pangenome name, chromosome id)
- value : statistics dictionnary
"""
aggregatedStats = {}
# Iterating over available chromosomes
for chrom in chromList:
"""
Statistics dictionnary (temporary):
- key : statistic name
- value : statistic value
"""
_stats = {
"input.#N": 0,
"input.total.length": 0
}
# Storing length and computed N percentage for each sequences in dedicated lists
_lengths, _Nper = [], []
# Iterating over sequences for the chrom fasta file in sequence dictionnary
for (chrName, seqid), seq in seqDict.items():
if chrName == chrom:
# Counting number of Ns
# Counting the number of Ns
_stats["input.#N"] += seq.count("N")
# Counting total bases
# Counting the total number bases
_stats["input.total.length"] += len(seq)
# Saving length and N percentage
# Saving the length and the N percentage
_lengths.append(len(seq))
_Nper.append((seq.count("N")/len(seq)))
# Adding stats
_stats["input.mean.N%"] = np.mean(_Nper)*100
_stats["input.mean.length"] = np.mean(_lengths)
_stats["input.#sequences"] = len(_lengths)
#Computing L50
# Computing L50 for each sequence of chrom
_lengths = sorted(_lengths, reverse = True)
halfTotal = _stats["input.total.length"]/2
cumulativeLength, L50 = 0, 0
......@@ -101,13 +126,17 @@ for chrom in chromList:
_stats["input.L50"] = L50
# Adding stats to according chromosome in the data dictionnary
aggregatedStats[(args.panname,chrom)] = _stats
if args.debug: print(aggregatedStats)
# Converting data dictionnary to pandas dataframe
df = pd.DataFrame.from_dict(aggregatedStats, orient='index')
df.reset_index(inplace=True)
df.rename(columns={df.columns[0]: 'pangenome.name', df.columns[1]: 'chrom.id'}, inplace = True)
# Saving to TSV
df.to_csv(
args.output,
sep='\t',
......
......@@ -24,51 +24,58 @@ while getopts "r:q:a:t:d:o:" option; do
esac
done
## Main script
# Reading query argument and creating an array containing the path to query fasta files
IFS=' ' read -r -a temp <<< "$qry"
#echo "${qryList[@]}"
asmList=("$ref")
## Sorting the array to put the reference in first
# Sorting the array to put the reference in first
for item in "${temp[@]}"; do
if [[ $item != "$ref" ]]; then
asmList+=($item)
fi
done
#asmList=("${ref}" "${qryList[@]}")
echo "${asmList[@]}"
# Array to store the created syri files
syriFileList=()
# Iterating 2 by 2 with overlap, over the array of fasta files
for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
# Setting filepaths for later
bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam"
syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out"
# Adding the output syri file to the array
syriFileList+=($syriFile)
## Minimap2 genome vs genome alignment
# Minimap2 genome vs genome alignment
apptainer run --app minimap2 $appdir/PanGeTools.sif \
-ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads | \
apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads - > $wrkdir/$bamFile
echo "${asmList[i]}"
## Syri on previous alignment
# Syri on previous alignment
apptainer run $appdir/pan1c-env.sif \
syri -c $wrkdir/$bamFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F B \
--nc $threads \
--dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)."
done
## Creating genomes.txt
# Creating genomes.txt for plotsr. It is used to give simple names to fasta files in the final figure
# Each line contains 2 columns : fasta filepath and its simpler name
echo -e "#files\tname" > $wrkdir/genomes.txt
for asm in "${asmList[@]}"; do
echo -e "${asm}\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt
done
## Generating the list of syri files for the plotsr command
# Generating the plotsr command
command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H 16 -W 9 -d 600 "
# Adding syri files to the command as each needs to be specified using "--sr" argument
for file in "${syriFileList[@]}"; do
command+="--sr $wrkdir/$file "
done
# Running plotsr
apptainer run $appdir/pan1c-env.sif plotsr \
$command
......@@ -30,24 +30,38 @@ arg_parser.add_argument(
)
args = arg_parser.parse_args()
## Main script
"""
Tags dictionnary :
- key : Main tool / apptainer image
- value : dictionnary of tags
"""
tags = {}
## Pan1c-workflow section
### Pan1c-workflow section
tags["Pan1c"] = {}
# Using git to get the version of the Pan1c workflow
_output = subprocess.run(
["git", "describe", "--tags"],
capture_output=True,
text=True,
).stdout[:-1]
# Getting the pggb commands used in the workflow from the config file
# ToDo : Get the command used from the pggb command logs !
with open(args.config, 'r') as handle:
pggbCmd = [line[:-1] for line in handle.readlines() if "pggb.params" in line][0].split(': ')[-1]
# Adding tags
tags["Pan1c"]["pan1c.version"] = _output
tags["Pan1c"]["pan1c.home"] = "https://forgemia.inra.fr/alexis.mergez/pan1c"
tags["Pan1c"]["pan1c.pggb.args"] = pggbCmd
## PanGeTools section
### PanGeTools section
tags["pangetools"] = {}
# Reading the apps versions from the apptainer tags
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/PanGeTools.sif"],
capture_output=True,
......@@ -58,13 +72,15 @@ labels = _output['data']['attributes']['labels']
tags["pangetools"]["image.version"] = labels['Version']
tags["pangetools"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionnary
for key in labels.keys():
if ".Version" in key:
tags["pangetools"][key.lower()] = labels[key]
## PGGB image section
### PGGB image section
tags["pggb"] = {}
# Reading the apps versions from the apptainer tags
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pggb.sif"],
capture_output=True,
......@@ -75,13 +91,15 @@ labels = _output['data']['attributes']['labels']
tags["pggb"]["image.version"] = labels['Version']
tags["pggb"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionnary
for key in labels.keys():
if ".Version" in key:
tags["pggb"][key.lower()] = labels[key]
## Pan1c-Env section
### Pan1c-Env section
tags["pan1c-env"] = {}
# Reading the apps versions from the apptainer tags
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-env.sif"],
capture_output=True,
......@@ -92,6 +110,7 @@ labels = _output['data']['attributes']['labels']
tags["pan1c-env"]["image.version"] = labels['Version']
tags["pan1c-env"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionnary
for key in labels.keys():
if ".Version" in key:
tags["pan1c-env"][key.lower()] = labels[key]
......@@ -99,6 +118,7 @@ for key in labels.keys():
## Pan1c-Box section
tags["pan1c-box"] = {}
# Reading the apps versions from the apptainer tags
_output = subprocess.run(
["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-box.sif"],
capture_output=True,
......@@ -109,11 +129,12 @@ labels = _output['data']['attributes']['labels']
tags["pan1c-box"]["image.version"] = labels['Version']
tags["pan1c-box"]["image.home"] = labels['about.home']
# Adding app versions to the tag dictionnary
for key in labels.keys():
if ".Version" in key:
tags["pan1c-box"][key.lower()] = labels[key]
## Exporting tags
## Exporting tags to stdout
print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
print("#\tTool versions and commands\n#")
for first_elem in tags.keys():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment