Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Pan1c
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Alexis Mergez
Pan1c
Commits
bcbd5705
Commit
bcbd5705
authored
1 year ago
by
Alexis Mergez
Browse files
Options
Downloads
Patches
Plain Diff
Added comments
parent
301e5ab1
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!11
v1.6
Pipeline
#178650
passed with warnings
1 year ago
Changes
3
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
scripts/chrInputStats.py
+37
-8
37 additions, 8 deletions
scripts/chrInputStats.py
scripts/getSyriFigs.sh
+16
-9
16 additions, 9 deletions
scripts/getSyriFigs.sh
scripts/getTags.py
+26
-5
26 additions, 5 deletions
scripts/getTags.py
with
79 additions
and
22 deletions
scripts/chrInputStats.py
+
37
−
8
View file @
bcbd5705
"""
Input statistics script for Pan1c workflow
Given a fasta input made for pggb
, computes stats regarding the complexity of the sequences
Given a fasta input made for pggb
(data/chrInputs), it computes statistics.
@author: alexis.mergez@inrae.fr
@version: 1.0
...
...
@@ -47,48 +47,73 @@ arg_parser.add_argument(
)
args
=
arg_parser
.
parse_args
()
## Toolbox
## Main script
"""
Sequence dictionnary :
- key : (Chromosome name (from filename), sequence id)
- value : sequence
"""
seqDict
=
{}
# Parsing fasta files
for
filename
in
args
.
fastafiles
:
# Getting chromosome name from fasta filename
chrName
=
os
.
path
.
basename
(
filename
).
split
(
"
.fa.gz
"
)[
0
]
# Reading
.fa.gz
file and adding records to seqDict
# Reading
bgzip fasta
file and adding records to seqDict
with
gzip
.
open
(
filename
,
"
rt
"
)
as
handle
:
# Parsing fasta sequences using SeqIO
sequences
=
SeqIO
.
parse
(
handle
,
"
fasta
"
)
# Adding sequence to sequence dictionnary
for
record
in
sequences
:
seqDict
[(
chrName
,
record
.
id
)]
=
record
.
seq
# Reading available chromosomes for seqDict keys
chromList
=
np
.
unique
([
x
for
x
,
y
in
seqDict
.
keys
()])
"""
Data dictionnary :
- key : (pangenome name, chromosome id)
- value : statistics dictionnary
"""
aggregatedStats
=
{}
# Iterating over available chromosomes
for
chrom
in
chromList
:
"""
Statistics dictionnary (temporary):
- key : statistic name
- value : statistic value
"""
_stats
=
{
"
input.#N
"
:
0
,
"
input.total.length
"
:
0
}
# Storing length and computed N percentage for each sequences in dedicated lists
_lengths
,
_Nper
=
[],
[]
# Iterating over sequences for the chrom fasta file in sequence dictionnary
for
(
chrName
,
seqid
),
seq
in
seqDict
.
items
():
if
chrName
==
chrom
:
# Counting number of Ns
# Counting
the
number of Ns
_stats
[
"
input.#N
"
]
+=
seq
.
count
(
"
N
"
)
# Counting t
otal
bases
# Counting t
he total number
bases
_stats
[
"
input.total.length
"
]
+=
len
(
seq
)
# Saving length and N percentage
# Saving
the
length and
the
N percentage
_lengths
.
append
(
len
(
seq
))
_Nper
.
append
((
seq
.
count
(
"
N
"
)
/
len
(
seq
)))
# Adding stats
_stats
[
"
input.mean.N%
"
]
=
np
.
mean
(
_Nper
)
*
100
_stats
[
"
input.mean.length
"
]
=
np
.
mean
(
_lengths
)
_stats
[
"
input.#sequences
"
]
=
len
(
_lengths
)
#Computing L50
#
Computing L50
for each sequence of chrom
_lengths
=
sorted
(
_lengths
,
reverse
=
True
)
halfTotal
=
_stats
[
"
input.total.length
"
]
/
2
cumulativeLength
,
L50
=
0
,
0
...
...
@@ -101,13 +126,17 @@ for chrom in chromList:
_stats
[
"
input.L50
"
]
=
L50
# Adding stats to according chromosome in the data dictionnary
aggregatedStats
[(
args
.
panname
,
chrom
)]
=
_stats
if
args
.
debug
:
print
(
aggregatedStats
)
# Converting data dictionnary to pandas dataframe
df
=
pd
.
DataFrame
.
from_dict
(
aggregatedStats
,
orient
=
'
index
'
)
df
.
reset_index
(
inplace
=
True
)
df
.
rename
(
columns
=
{
df
.
columns
[
0
]:
'
pangenome.name
'
,
df
.
columns
[
1
]:
'
chrom.id
'
},
inplace
=
True
)
# Saving to TSV
df
.
to_csv
(
args
.
output
,
sep
=
'
\t
'
,
...
...
This diff is collapsed.
Click to expand it.
scripts/getSyriFigs.sh
+
16
−
9
View file @
bcbd5705
...
...
@@ -24,51 +24,58 @@ while getopts "r:q:a:t:d:o:" option; do
esac
done
## Main script
# Reading query argument and creating an array containing the path to query fasta files
IFS
=
' '
read
-r
-a
temp
<<<
"
$qry
"
#echo "${qryList[@]}"
asmList
=(
"
$ref
"
)
#
# Sorting the array to put the reference in first
# Sorting the array to put the reference in first
for
item
in
"
${
temp
[@]
}
"
;
do
if
[[
$item
!=
"
$ref
"
]]
;
then
asmList+
=(
$item
)
fi
done
#asmList=("${ref}" "${qryList[@]}")
echo
"
${
asmList
[@]
}
"
# Array to store the created syri files
syriFileList
=()
# Iterating 2 by 2 with overlap, over the array of fasta files
for
((
i
=
0
;
i <
${#
asmList
[@]
}
- 1
;
i++
))
;
do
# Setting filepaths for later
bamFile
=
"
$(
basename
${
asmList
[i]
}
.fa.gz
)
_
$(
basename
${
asmList
[i + 1]
}
.fa.gz
)
.bam"
syriFile
=
"
$(
basename
${
asmList
[i]
}
.fa.gz
)
_
$(
basename
${
asmList
[i + 1]
}
.fa.gz
)
.syri.out"
# Adding the output syri file to the array
syriFileList+
=(
$syriFile
)
#
# Minimap2 genome vs genome alignment
# Minimap2 genome vs genome alignment
apptainer run
--app
minimap2
$appdir
/PanGeTools.sif
\
-ax
asm5
--eqx
${
asmList
[i]
}
${
asmList
[i + 1]
}
-t
$threads
|
\
apptainer run
--app
samtools
$appdir
/PanGeTools.sif
sort
-O
BAM -@
$threads
-
>
$wrkdir
/
$bamFile
echo
"
${
asmList
[i]
}
"
## Syri on previous alignment
# Syri on previous alignment
apptainer run
$appdir
/pan1c-env.sif
\
syri
-c
$wrkdir
/
$bamFile
-r
${
asmList
[i]
}
-q
${
asmList
[i + 1]
}
-k
-F
B
\
--nc
$threads
\
--dir
$wrkdir
--prefix
"
$(
basename
${
asmList
[i]
}
.fa.gz
)
_
$(
basename
${
asmList
[i + 1]
}
.fa.gz
)
."
done
## Creating genomes.txt
# Creating genomes.txt for plotsr. It is used to give simple names to fasta files in the final figure
# Each line contains 2 columns : fasta filepath and its simpler name
echo
-e
"#files
\t
name"
>
$wrkdir
/genomes.txt
for
asm
in
"
${
asmList
[@]
}
"
;
do
echo
-e
"
${
asm
}
\t
$(
basename
$asm
.fa.gz |
cut
-d
'.'
-f1
)
"
>>
$wrkdir
/genomes.txt
done
#
# Generating the
list of syri files for the
plotsr command
# Generating the plotsr command
command
=
"--genomes
$wrkdir
/genomes.txt -o
$wrkdir
/
$output
-f 12 -H 16 -W 9 -d 600 "
# Adding syri files to the command as each needs to be specified using "--sr" argument
for
file
in
"
${
syriFileList
[@]
}
"
;
do
command
+
=
"--sr
$wrkdir
/
$file
"
done
# Running plotsr
apptainer run
$appdir
/pan1c-env.sif plotsr
\
$command
This diff is collapsed.
Click to expand it.
scripts/getTags.py
+
26
−
5
View file @
bcbd5705
...
...
@@ -30,24 +30,38 @@ arg_parser.add_argument(
)
args
=
arg_parser
.
parse_args
()
## Main script
"""
Tags dictionnary :
- key : Main tool / apptainer image
- value : dictionnary of tags
"""
tags
=
{}
## Pan1c-workflow section
##
#
Pan1c-workflow section
tags
[
"
Pan1c
"
]
=
{}
# Using git to get the version of the Pan1c workflow
_output
=
subprocess
.
run
(
[
"
git
"
,
"
describe
"
,
"
--tags
"
],
capture_output
=
True
,
text
=
True
,
).
stdout
[:
-
1
]
# Getting the pggb commands used in the workflow from the config file
# ToDo : Get the command used from the pggb command logs !
with
open
(
args
.
config
,
'
r
'
)
as
handle
:
pggbCmd
=
[
line
[:
-
1
]
for
line
in
handle
.
readlines
()
if
"
pggb.params
"
in
line
][
0
].
split
(
'
:
'
)[
-
1
]
# Adding tags
tags
[
"
Pan1c
"
][
"
pan1c.version
"
]
=
_output
tags
[
"
Pan1c
"
][
"
pan1c.home
"
]
=
"
https://forgemia.inra.fr/alexis.mergez/pan1c
"
tags
[
"
Pan1c
"
][
"
pan1c.pggb.args
"
]
=
pggbCmd
## PanGeTools section
##
#
PanGeTools section
tags
[
"
pangetools
"
]
=
{}
# Reading the apps versions from the apptainer tags
_output
=
subprocess
.
run
(
[
"
apptainer
"
,
"
inspect
"
,
"
-j
"
,
f
"
{
args
.
appdir
}
/PanGeTools.sif
"
],
capture_output
=
True
,
...
...
@@ -58,13 +72,15 @@ labels = _output['data']['attributes']['labels']
tags
[
"
pangetools
"
][
"
image.version
"
]
=
labels
[
'
Version
'
]
tags
[
"
pangetools
"
][
"
image.home
"
]
=
labels
[
'
about.home
'
]
# Adding app versions to the tag dictionnary
for
key
in
labels
.
keys
():
if
"
.Version
"
in
key
:
tags
[
"
pangetools
"
][
key
.
lower
()]
=
labels
[
key
]
## PGGB image section
##
#
PGGB image section
tags
[
"
pggb
"
]
=
{}
# Reading the apps versions from the apptainer tags
_output
=
subprocess
.
run
(
[
"
apptainer
"
,
"
inspect
"
,
"
-j
"
,
f
"
{
args
.
appdir
}
/pggb.sif
"
],
capture_output
=
True
,
...
...
@@ -75,13 +91,15 @@ labels = _output['data']['attributes']['labels']
tags
[
"
pggb
"
][
"
image.version
"
]
=
labels
[
'
Version
'
]
tags
[
"
pggb
"
][
"
image.home
"
]
=
labels
[
'
about.home
'
]
# Adding app versions to the tag dictionnary
for
key
in
labels
.
keys
():
if
"
.Version
"
in
key
:
tags
[
"
pggb
"
][
key
.
lower
()]
=
labels
[
key
]
## Pan1c-Env section
##
#
Pan1c-Env section
tags
[
"
pan1c-env
"
]
=
{}
# Reading the apps versions from the apptainer tags
_output
=
subprocess
.
run
(
[
"
apptainer
"
,
"
inspect
"
,
"
-j
"
,
f
"
{
args
.
appdir
}
/pan1c-env.sif
"
],
capture_output
=
True
,
...
...
@@ -92,6 +110,7 @@ labels = _output['data']['attributes']['labels']
tags
[
"
pan1c-env
"
][
"
image.version
"
]
=
labels
[
'
Version
'
]
tags
[
"
pan1c-env
"
][
"
image.home
"
]
=
labels
[
'
about.home
'
]
# Adding app versions to the tag dictionnary
for
key
in
labels
.
keys
():
if
"
.Version
"
in
key
:
tags
[
"
pan1c-env
"
][
key
.
lower
()]
=
labels
[
key
]
...
...
@@ -99,6 +118,7 @@ for key in labels.keys():
## Pan1c-Box section
tags
[
"
pan1c-box
"
]
=
{}
# Reading the apps versions from the apptainer tags
_output
=
subprocess
.
run
(
[
"
apptainer
"
,
"
inspect
"
,
"
-j
"
,
f
"
{
args
.
appdir
}
/pan1c-box.sif
"
],
capture_output
=
True
,
...
...
@@ -109,11 +129,12 @@ labels = _output['data']['attributes']['labels']
tags
[
"
pan1c-box
"
][
"
image.version
"
]
=
labels
[
'
Version
'
]
tags
[
"
pan1c-box
"
][
"
image.home
"
]
=
labels
[
'
about.home
'
]
# Adding app versions to the tag dictionnary
for
key
in
labels
.
keys
():
if
"
.Version
"
in
key
:
tags
[
"
pan1c-box
"
][
key
.
lower
()]
=
labels
[
key
]
## Exporting tags
## Exporting tags
to stdout
print
(
"
#
\t
This graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)
\n
#
"
)
print
(
"
#
\t
Tool versions and commands
\n
#
"
)
for
first_elem
in
tags
.
keys
():
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment