Added comments

bcbd5705 · Alexis Mergez · 301e5ab1 · bcbd5705 · bcbd5705 · bcbd5705
Commit bcbd5705 authored 1 year ago by Alexis Mergez
--- a/scripts/chrInputStats.py
+++ b/scripts/chrInputStats.py
 """
 Input statistics script for Pan1c workflow

-Given a fasta input made for pggb, computes stats regarding the complexity of the sequences
+Given a fasta input made for pggb (data/chrInputs), it computes statistics. 

 @author: alexis.mergez@inrae.fr
 @version: 1.0 
@@ -47,48 +47,73 @@ arg_parser.add_argument(
    )
 args = arg_parser.parse_args()

-## Toolbox
-
 ## Main script
+"""
+Sequence dictionnary : 
+    - key : (Chromosome name (from filename), sequence id)
+    - value : sequence 
+"""
 seqDict = {}

 # Parsing fasta files
 for filename in args.fastafiles:
    
+    # Getting chromosome name from fasta filename
    chrName = os.path.basename(filename).split(".fa.gz")[0]

-    # Reading .fa.gz file and adding records to seqDict
+    # Reading bgzip fasta file and adding records to seqDict
    with gzip.open(filename, "rt") as handle:
+
+        # Parsing fasta sequences using SeqIO
        sequences = SeqIO.parse(handle, "fasta")
        
+        # Adding sequence to sequence dictionnary
        for record in sequences:
            seqDict[(chrName, record.id)] = record.seq

+# Reading available chromosomes for seqDict keys
 chromList = np.unique([x for x, y in seqDict.keys()])

+"""
+Data dictionnary :
+    - key : (pangenome name, chromosome id)
+    - value : statistics dictionnary
+"""
 aggregatedStats = {}

+# Iterating over available chromosomes
 for chrom in chromList:
+    """
+    Statistics dictionnary (temporary):
+        - key : statistic name
+        - value : statistic value
+    """
    _stats = {
        "input.#N": 0,
        "input.total.length": 0
        }
+    
+    # Storing length and computed N percentage for each sequences in dedicated lists
    _lengths, _Nper = [], []
+
+    # Iterating over sequences for the chrom fasta file in sequence dictionnary
    for (chrName, seqid), seq in seqDict.items():
+
        if chrName == chrom:
-            # Counting number of Ns
+            # Counting the number of Ns
            _stats["input.#N"] += seq.count("N")
-            # Counting total bases
+            # Counting the total number bases
            _stats["input.total.length"] += len(seq)
-            # Saving length and N percentage
+            # Saving the length and the N percentage
            _lengths.append(len(seq))
            _Nper.append((seq.count("N")/len(seq)))

+    # Adding stats
    _stats["input.mean.N%"] = np.mean(_Nper)*100
    _stats["input.mean.length"] = np.mean(_lengths)
    _stats["input.#sequences"] = len(_lengths)

-    #Computing L50
+    # Computing L50 for each sequence of chrom
    _lengths = sorted(_lengths, reverse = True)
    halfTotal = _stats["input.total.length"]/2
    cumulativeLength, L50 = 0, 0
@@ -101,13 +126,17 @@ for chrom in chromList:
    
    _stats["input.L50"] = L50

+    # Adding stats to according chromosome in the data dictionnary
    aggregatedStats[(args.panname,chrom)] = _stats

 if args.debug: print(aggregatedStats)    

+# Converting data dictionnary to pandas dataframe
 df = pd.DataFrame.from_dict(aggregatedStats, orient='index')
 df.reset_index(inplace=True)
 df.rename(columns={df.columns[0]: 'pangenome.name', df.columns[1]: 'chrom.id'}, inplace = True)
+
+# Saving to TSV
 df.to_csv(
    args.output,
    sep='\t',

--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -24,51 +24,58 @@ while getopts "r:q:a:t:d:o:" option; do
    esac
 done

+## Main script
+# Reading query argument and creating an array containing the path to query fasta files
 IFS=' ' read -r -a temp <<< "$qry"
-#echo "${qryList[@]}"

 asmList=("$ref")
-## Sorting the array to put the reference in first
+# Sorting the array to put the reference in first
 for item in "${temp[@]}"; do
    if [[ $item != "$ref" ]]; then
        asmList+=($item)
    fi
 done

-#asmList=("${ref}" "${qryList[@]}")
-echo "${asmList[@]}"
+# Array to store the created syri files 
 syriFileList=()

+# Iterating 2 by 2 with overlap, over the array of fasta files
 for ((i = 0; i < ${#asmList[@]} - 1; i++)); do

+    # Setting filepaths for later
    bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam"
    syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out"
+    
+    # Adding the output syri file to the array
    syriFileList+=($syriFile)

-    ## Minimap2 genome vs genome alignment
+    # Minimap2 genome vs genome alignment
    apptainer run --app minimap2 $appdir/PanGeTools.sif \
        -ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads | \
        apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads - > $wrkdir/$bamFile

-    echo "${asmList[i]}"
-    ## Syri on previous alignment
+    # Syri on previous alignment
    apptainer run $appdir/pan1c-env.sif \
        syri -c $wrkdir/$bamFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F B \
        --nc $threads \
        --dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)."    
 done

-## Creating genomes.txt
+# Creating genomes.txt for plotsr. It is used to give simple names to fasta files in the final figure
+# Each line contains 2 columns : fasta filepath and its simpler name
 echo -e "#files\tname" > $wrkdir/genomes.txt
 for asm in "${asmList[@]}"; do
    echo -e "${asm}\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt
 done

-## Generating the list of syri files for the plotsr command
+# Generating the plotsr command
 command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H 16 -W 9 -d 600 "
+
+# Adding syri files to the command as each needs to be specified using "--sr" argument 
 for file in "${syriFileList[@]}"; do
    command+="--sr $wrkdir/$file "
 done

+# Running plotsr
 apptainer run $appdir/pan1c-env.sif plotsr \
    $command
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -30,24 +30,38 @@ arg_parser.add_argument(
    )
 args = arg_parser.parse_args()

+## Main script
+"""
+Tags dictionnary :
+    - key : Main tool / apptainer image
+    - value : dictionnary of tags
+"""
 tags = {}

-## Pan1c-workflow section
+### Pan1c-workflow section
 tags["Pan1c"] = {}
+
+# Using git to get the version of the Pan1c workflow 
 _output = subprocess.run(
    ["git", "describe", "--tags"],
    capture_output=True,
    text=True,
 ).stdout[:-1]
+
+# Getting the pggb commands used in the workflow from the config file
+# ToDo : Get the command used from the pggb command logs !
 with open(args.config, 'r') as handle:
    pggbCmd = [line[:-1] for line in handle.readlines() if "pggb.params" in line][0].split(': ')[-1]

+# Adding tags
 tags["Pan1c"]["pan1c.version"] = _output
 tags["Pan1c"]["pan1c.home"] = "https://forgemia.inra.fr/alexis.mergez/pan1c"
 tags["Pan1c"]["pan1c.pggb.args"] = pggbCmd

-## PanGeTools section
+### PanGeTools section
 tags["pangetools"] = {}
+
+# Reading the apps versions from the apptainer tags
 _output = subprocess.run(
    ["apptainer", "inspect", "-j", f"{args.appdir}/PanGeTools.sif"],
    capture_output=True, 
@@ -58,13 +72,15 @@ labels = _output['data']['attributes']['labels']
 tags["pangetools"]["image.version"] = labels['Version']
 tags["pangetools"]["image.home"] = labels['about.home']

+# Adding app versions to the tag dictionnary
 for key in labels.keys():
    if ".Version" in key:
        tags["pangetools"][key.lower()] = labels[key]

-## PGGB image section
+### PGGB image section
 tags["pggb"] = {}

+# Reading the apps versions from the apptainer tags
 _output = subprocess.run(
    ["apptainer", "inspect", "-j", f"{args.appdir}/pggb.sif"],
    capture_output=True, 
@@ -75,13 +91,15 @@ labels = _output['data']['attributes']['labels']
 tags["pggb"]["image.version"] = labels['Version']
 tags["pggb"]["image.home"] = labels['about.home']

+# Adding app versions to the tag dictionnary
 for key in labels.keys():
    if ".Version" in key:
        tags["pggb"][key.lower()] = labels[key]

-## Pan1c-Env section
+### Pan1c-Env section
 tags["pan1c-env"] = {}

+# Reading the apps versions from the apptainer tags
 _output = subprocess.run(
    ["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-env.sif"],
    capture_output=True, 
@@ -92,6 +110,7 @@ labels = _output['data']['attributes']['labels']
 tags["pan1c-env"]["image.version"] = labels['Version']
 tags["pan1c-env"]["image.home"] = labels['about.home']

+# Adding app versions to the tag dictionnary
 for key in labels.keys():
    if ".Version" in key:
        tags["pan1c-env"][key.lower()] = labels[key]
@@ -99,6 +118,7 @@ for key in labels.keys():
 ## Pan1c-Box section
 tags["pan1c-box"] = {}

+# Reading the apps versions from the apptainer tags
 _output = subprocess.run(
    ["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-box.sif"],
    capture_output=True, 
@@ -109,11 +129,12 @@ labels = _output['data']['attributes']['labels']
 tags["pan1c-box"]["image.version"] = labels['Version']
 tags["pan1c-box"]["image.home"] = labels['about.home']

+# Adding app versions to the tag dictionnary
 for key in labels.keys():
    if ".Version" in key:
        tags["pan1c-box"][key.lower()] = labels[key]

-## Exporting tags
+## Exporting tags to stdout
 print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
 print("#\tTool versions and commands\n#")
 for first_elem in tags.keys():