Gentoo Archives: gentoo-commits

From: Martin Mokrejs <mmokrejs@×××××××××××××××.cz>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/sci:master commit in: sci-biology/SEECER/files/, sci-biology/SEECER/
Date: Tue, 21 Nov 2017 16:11:40
Message-Id: 1511280688.aab87747b89b64107677056a3d4874d8f5ee7bbf.mmokrejs@gentoo
1 commit: aab87747b89b64107677056a3d4874d8f5ee7bbf
2 Author: Martin Mokrejš <mmokrejs <AT> fold <DOT> natur <DOT> cuni <DOT> cz>
3 AuthorDate: Tue Nov 21 16:11:28 2017 +0000
4 Commit: Martin Mokrejs <mmokrejs <AT> fold <DOT> natur <DOT> cuni <DOT> cz>
5 CommitDate: Tue Nov 21 16:11:28 2017 +0000
6 URL: https://gitweb.gentoo.org/proj/sci.git/commit/?id=aab87747
7
8 sci-biology/SEECER: execute jellyfish1 instead of jellyfish
9
10 Also, I wrote a few cleanup patches to expose the THREADS
11 variable and clean up the code.
12
13 Package-Manager: Portage-2.3.14, Repoman-2.3.6
14
15 sci-biology/SEECER/SEECER-0.1.3-r2.ebuild | 7 ++-
16 .../SEECER/files/rename_jellyfish_binary.patch | 11 ++++
17 sci-biology/SEECER/files/run_jellyfish.sh.patch | 72 ++++++++++++++++++++++
18 sci-biology/SEECER/files/run_seecer.sh.patch | 42 +++++++++++++
19 4 files changed, 130 insertions(+), 2 deletions(-)
20
21 diff --git a/sci-biology/SEECER/SEECER-0.1.3-r2.ebuild b/sci-biology/SEECER/SEECER-0.1.3-r2.ebuild
22 index 60862d8c8..0b7ec3bfa 100644
23 --- a/sci-biology/SEECER/SEECER-0.1.3-r2.ebuild
24 +++ b/sci-biology/SEECER/SEECER-0.1.3-r2.ebuild
25 @@ -5,7 +5,7 @@ EAPI=6
26
27 inherit eutils
28
29 -DESCRIPTION="SEquencing Error Corrector for RNA-Seq reads"
30 +DESCRIPTION="SEquence Error Corrector for RNA-Seq reads"
31 HOMEPAGE="http://sb.cs.cmu.edu/seecer/"
32 SRC_URI="
33 http://sb.cs.cmu.edu/seecer/downloads/"${P}".tar.gz
34 @@ -22,12 +22,15 @@ DEPEND="
35 sci-libs/gsl:0=
36 sci-biology/seqan:0="
37 RDEPEND="${DEPEND}
38 - =sci-biology/jellyfish-1.1.11"
39 + =sci-biology/jellyfish-1.1.11-r1"
40
41 S="${S}"/SEECER
42
43 PATCHES=(
44 "${FILESDIR}"/remove-hardcoded-paths.patch
45 + "${FILESDIR}"/run_seecer.sh.patch
46 + "${FILESDIR}"/run_jellyfish.sh.patch
47 + "${FILESDIR}"/rename_jellyfish_binary.patch
48 )
49
50 src_prepare(){
51
52 diff --git a/sci-biology/SEECER/files/rename_jellyfish_binary.patch b/sci-biology/SEECER/files/rename_jellyfish_binary.patch
53 new file mode 100644
54 index 000000000..c6548cee1
55 --- /dev/null
56 +++ b/sci-biology/SEECER/files/rename_jellyfish_binary.patch
57 @@ -0,0 +1,11 @@
58 +--- SEECER/bin/run_seecer.sh.ori 2017-11-21 16:56:28.808767468 +0100
59 ++++ SEECER/bin/run_seecer.sh 2017-11-21 16:57:07.469835728 +0100
60 +@@ -26,7 +26,7 @@
61 +
62 +
63 + BINDIR='' #this can be hardcoded to /absolute/path/to/SEECER/bin/
64 +-JF="jellyfish" #this may be hardcoded to /absolute/path/to/jellyfish/bin/
65 ++JF="jellyfish1" #this may be hardcoded to /absolute/path/to/jellyfish/bin/jellyfish
66 +
67 + K=17
68 + SEECER_PARAMS=""
69
70 diff --git a/sci-biology/SEECER/files/run_jellyfish.sh.patch b/sci-biology/SEECER/files/run_jellyfish.sh.patch
71 new file mode 100644
72 index 000000000..7631f5a4c
73 --- /dev/null
74 +++ b/sci-biology/SEECER/files/run_jellyfish.sh.patch
75 @@ -0,0 +1,72 @@
76 +--- SEECER-0.1.3/bin/run_jellyfish.sh.ori 2017-11-21 16:41:54.164599838 +0100
77 ++++ SEECER-0.1.3/bin/run_jellyfish.sh 2017-11-21 16:46:28.022166903 +0100
78 +@@ -1,18 +1,45 @@
79 + #!/bin/bash
80 ++
81 ++# Usage: run_jellyfish.sh jellyfish_binpath tempfile_prefix kmersize mincount tmpdir infile1 [infile2] threads
82 + JF=$1
83 + LCOUNT=$4
84 + TMPDIR=$5
85 ++THREADS=${8:-32}
86 +
87 + if [ -z "$JF" ]; then
88 + echo "No path to jellyfish binary provided, exiting.";
89 + exit 255;
90 + fi
91 +
92 ++# Usage: jellyfish count [options] file:path+
93 ++#
94 ++# Count k-mers or qmers in fasta or fastq files
95 ++#
96 ++# Options (default value in (), *required):
97 ++# -m, --mer-len=uint32 *Length of mer
98 ++# -s, --size=uint64 *Hash size
99 ++# -t, --threads=uint32 Number of threads (1)
100 ++# -o, --output=string Output prefix (mer_counts)
101 ++# -c, --counter-len=Length in bits Length of counting field (7)
102 ++# --out-counter-len=Length in bytes Length of counter field in output (4)
103 ++# -C, --both-strands Count both strand, canonical representation (false)
104 ++# -p, --reprobes=uint32 Maximum number of reprobes (62)
105 ++# -r, --raw Write raw database (false)
106 ++# -q, --quake Quake compatibility mode (false)
107 ++# --quality-start=uint32 Starting ASCII for quality values (64)
108 ++# --min-quality=uint32 Minimum quality. A base with lesser quality becomes an N (0)
109 ++# -L, --lower-count=uint64 Don't output k-mer with count < lower-count
110 ++# -U, --upper-count=uint64 Don't output k-mer with count > upper-count
111 ++# --invalid-char=warn|ignore|error How to treat invalid characters. The char is changed to a N. (warn)
112 ++# --matrix=Matrix file Hash function binary matrix
113 ++# --timing=Timing file Print timing information
114 ++# --stats=Stats file Print stats
115 ++#
116 + if [ "$#" -eq "4" ];
117 + then
118 +-$JF count -m $3 -o $TMPDIR/jf_tmp -c 3 -s 10000000 -t 32 --both-strands $6 || exit 255
119 ++$JF count -m $3 -o $TMPDIR/jf_tmp -c 3 -s 10000000 -t $THREADS --both-strands $6 || exit 255
120 + else
121 +-$JF count -m $3 -o $TMPDIR/jf_tmp -c 3 -s 10000000 -t 32 --both-strands $6 $7 || exit 255
122 ++$JF count -m $3 -o $TMPDIR/jf_tmp -c 3 -s 10000000 -t $THREADS --both-strands $6 $7 || exit 255
123 + fi;
124 +
125 + # merge
126 +@@ -25,5 +52,21 @@
127 + rm $TMPDIR/jf_tmp_*
128 + fi
129 +
130 ++#
131 ++# Usage: jellyfish dump [options] db:path
132 ++#
133 ++# Dump k-mer counts
134 ++#
135 ++# By default, dump in a fasta format where the header is the count and
136 ++# the sequence is the sequence of the k-mer. The column format is a 2
137 ++# column output: k-mer count.
138 ++#
139 ++# Options (default value in (), *required):
140 ++# -c, --column Column format (false)
141 ++# -t, --tab Tab separator (false)
142 ++# -L, --lower-count=uint64 Don't output k-mer with count < lower-count
143 ++# -U, --upper-count=uint64 Don't output k-mer with count > upper-count
144 ++# -o, --output=string Output file
145 ++#
146 + $JF dump --lower-count=$LCOUNT -o $2 -c $TMPDIR/jf_merged_$3 || exit 255
147 + rm $TMPDIR/jf_merged_$3
148
149 diff --git a/sci-biology/SEECER/files/run_seecer.sh.patch b/sci-biology/SEECER/files/run_seecer.sh.patch
150 new file mode 100644
151 index 000000000..a20c7917f
152 --- /dev/null
153 +++ b/sci-biology/SEECER/files/run_seecer.sh.patch
154 @@ -0,0 +1,42 @@
155 +--- SEECER/bin/run_seecer.sh.old 2013-10-02 18:55:24.000000000 +0200
156 ++++ SEECER/bin/run_seecer.sh 2017-11-21 16:24:24.065584149 +0100
157 +@@ -33,6 +33,7 @@
158 + SeecerStep=1
159 + LCOUNT=3
160 + TMPDIR=''
161 ++THREADS=32
162 +
163 + usage=$(cat << EOF
164 + # This script runs the SEECER pipeline of 4 steps:
165 +@@ -54,11 +55,12 @@
166 + -j <v> : specify the location of JELLYFISH binary (default = $JF).
167 + -p <v> : specify extra SEECER parameters (default = '').
168 + -s <v> : specify the starting step ( default = 1). Values = 1,2,3,4.
169 ++ -c <v> : number of threads (default = 32).
170 + -h : help message
171 + EOF
172 + );
173 +
174 +-while getopts ":j:p:k:s:t:h" opt; do
175 ++while getopts ":j:p:k:s:t:c:h" opt; do
176 + case $opt in
177 + t)
178 + TMPDIR=$OPTARG
179 +@@ -75,6 +77,8 @@
180 + s)
181 + SeecerStep=$OPTARG
182 + ;;
183 ++ c)
184 ++ THREADS=$OPTARG
185 + \?)
186 + echo "Invalid option: -$OPTARG" >&2
187 + echo "$usage"
188 +@@ -170,7 +177,7 @@
189 + then
190 + echo "++ Step 2: Running JELLYFISH to count kmers ..."
191 + echo
192 +- bash "${BINDIR}"run_jellyfish.sh $JF $TMPDIR/counts_${K}_${LCOUNT} $K $LCOUNT $TMPDIR $Read1_N $Read2_N || exit 255
193 ++ bash "${BINDIR}"run_jellyfish.sh $JF $TMPDIR/counts_${K}_${LCOUNT} $K $LCOUNT $TMPDIR $Read1_N $Read2_N $THREADS || exit 255
194 + fi;
195 +
196 + if [ ! -r $TMPDIR/counts_${K}_${LCOUNT} ];