1 |
commit: 85fe7624385b5d424a1d250106a55c4aad74217f |
2 |
Author: Martin Mokrejš <mmokrejs <AT> fold <DOT> natur <DOT> cuni <DOT> cz> |
3 |
AuthorDate: Tue Apr 26 12:07:50 2016 +0000 |
4 |
Commit: Martin Mokrejs <mmokrejs <AT> fold <DOT> natur <DOT> cuni <DOT> cz> |
5 |
CommitDate: Tue Apr 26 12:07:50 2016 +0000 |
6 |
URL: https://gitweb.gentoo.org/proj/sci.git/commit/?id=85fe7624 |
7 |
|
8 |
sci-biology/full_lengther_next: version bump |
9 |
|
10 |
Package-Manager: portage-2.2.28 |
11 |
|
12 |
.../full_lengther_next/files/download_fln_dbs.rb | 260 --------------------- |
13 |
....5.7.ebuild => full_lengther_next-0.6.0.ebuild} | 7 +- |
14 |
2 files changed, 1 insertion(+), 266 deletions(-) |
15 |
|
16 |
diff --git a/sci-biology/full_lengther_next/files/download_fln_dbs.rb b/sci-biology/full_lengther_next/files/download_fln_dbs.rb |
17 |
deleted file mode 100644 |
18 |
index 90eacbb..0000000 |
19 |
--- a/sci-biology/full_lengther_next/files/download_fln_dbs.rb |
20 |
+++ /dev/null |
21 |
@@ -1,260 +0,0 @@ |
22 |
-#!/usr/bin/env ruby |
23 |
- |
24 |
-# 15-2-2011 Noe Fernandez-Pozo |
25 |
-# Script to download Full-LengtherNext databases. |
26 |
-# Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL. |
27 |
- |
28 |
-require 'net/ftp' |
29 |
-require 'open-uri' |
30 |
- |
31 |
-class FtpClient |
32 |
- |
33 |
-def initialize |
34 |
-end |
35 |
- |
36 |
-def connect(server) |
37 |
- @server=server |
38 |
-end |
39 |
- |
40 |
-def login |
41 |
- |
42 |
-end |
43 |
- |
44 |
-def chdir(dir) |
45 |
- @dir=dir |
46 |
-end |
47 |
- |
48 |
-def getbinaryfile(file,output_file) |
49 |
- if !File.exists?(output_file) && !File.exists?(output_file.gsub('.gz','')) |
50 |
- puts " - Downloading" |
51 |
- cmd="wget #{@server}/#{@dir}/#{file} -O #{output_file}" |
52 |
- system(cmd) |
53 |
- else |
54 |
- puts "File #{output_file}, or #{output_file.gsub('.gz','')} already exists. Skip download" |
55 |
- end |
56 |
- |
57 |
-end |
58 |
- |
59 |
-def close |
60 |
-end |
61 |
- |
62 |
-end |
63 |
-################################################### Functions |
64 |
- |
65 |
-def download_ncrna(formatted_db_path) |
66 |
- |
67 |
- if !File.exists?(File.join(formatted_db_path, "nc_rna_db")) |
68 |
- Dir.mkdir(File.join(formatted_db_path, "nc_rna_db")) |
69 |
- end |
70 |
- |
71 |
- puts "Downloading ncRNA database" |
72 |
- open(File.join(formatted_db_path, "nc_rna_db/ncrna_fln_100.fasta.zip"), "wb") do |my_file| |
73 |
- my_file.print open('http://www.scbi.uma.es/downloads/FLNDB/ncrna_fln_100.fasta.zip').read |
74 |
- end |
75 |
- puts "\nncRNA database downloaded" |
76 |
- |
77 |
- ncrna_zip=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta.zip') |
78 |
- ncrna_out_dir=File.join(formatted_db_path,'nc_rna_db') |
79 |
- system("unzip", ncrna_zip, "-d", ncrna_out_dir) |
80 |
- system("rm", ncrna_zip) |
81 |
- |
82 |
- puts "\nncRNA database decompressed" |
83 |
- |
84 |
- ncrna_fasta=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta') |
85 |
- system("makeblastdb", "-in", ncrna_fasta, "-dbtype", "nucl", "-parse_seqids") |
86 |
- |
87 |
- puts "\nncRNA database completed" |
88 |
-end |
89 |
- |
90 |
-def conecta_uniprot(my_array, formatted_db_path) |
91 |
- |
92 |
- #$ftp = Net::FTP.new() |
93 |
- $ftp = FtpClient.new() |
94 |
- |
95 |
- if !File.exists?(formatted_db_path) |
96 |
- Dir.mkdir(formatted_db_path) |
97 |
- end |
98 |
- |
99 |
- $ftp.connect('ftp://ftp.uniprot.org') |
100 |
- |
101 |
- $ftp.login |
102 |
- |
103 |
- puts "connected to UniProt" |
104 |
- |
105 |
- my_array.each do |db_group| |
106 |
- puts "Downloading #{db_group}" |
107 |
- download_uniprot(db_group, formatted_db_path) |
108 |
- end |
109 |
- |
110 |
- varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz') |
111 |
- $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete") |
112 |
- $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out) |
113 |
- |
114 |
- puts "isoform files downloaded" |
115 |
- |
116 |
- $ftp.close |
117 |
- |
118 |
-end |
119 |
- |
120 |
-def download_uniprot(uniprot_group, formatted_db_path) |
121 |
- |
122 |
- sp_out=File.join(formatted_db_path,"uniprot_sprot_#{uniprot_group}.dat.gz") |
123 |
- tr_out=File.join(formatted_db_path,"uniprot_trembl_#{uniprot_group}.dat.gz") |
124 |
- $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions") |
125 |
- puts " from ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_#{uniprot_group}.dat.gz" |
126 |
- puts " from ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_trembl_#{uniprot_group}.dat.gz" |
127 |
- $ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out) |
128 |
- $ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out) |
129 |
- |
130 |
- puts "#{uniprot_group} files downloaded" |
131 |
- |
132 |
-end |
133 |
- |
134 |
-def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path) |
135 |
- |
136 |
- puts "filtering sequences from #{file_name}" |
137 |
- |
138 |
- # UniProtKB fragments with FT NON_CONS and FT NON_TER features. |
139 |
- # |
140 |
- # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines: |
141 |
- # FT NON_TER 1 1 |
142 |
- # FT NON_TER 29 29 |
143 |
- # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line: |
144 |
- # FT NON_CONS 1683 1684 |
145 |
- # |
146 |
- # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'. |
147 |
- |
148 |
- newseq=false |
149 |
- print_seq=true |
150 |
- id='' |
151 |
- description = '' |
152 |
- organism_name = '' |
153 |
- seq = '' |
154 |
- organelle = '' |
155 |
- |
156 |
- file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/ |
157 |
- db_name = $1 |
158 |
- output_name = $2 |
159 |
- db_name.sub!('sprot','sp') |
160 |
- db_name.sub!('trembl','tr') |
161 |
- |
162 |
- if !File.exists?(File.join(formatted_db_path, "#{db_name}_#{output_name}")) |
163 |
- Dir.mkdir(File.join(formatted_db_path, "#{db_name}_#{output_name}")) |
164 |
- end |
165 |
- |
166 |
- output_file = File.new(File.join(formatted_db_path, "#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta"), "w") |
167 |
- |
168 |
- File.open(file_name).each_line do |line| |
169 |
- if (newseq == false) |
170 |
- if (line =~ /^AC\s+(\w+);/) |
171 |
- id=$1 |
172 |
- newseq = true |
173 |
- description = '' |
174 |
- organism_name = '' |
175 |
- seq = '' |
176 |
- print_seq = true |
177 |
- organelle = '' |
178 |
- end |
179 |
- else |
180 |
- if (line =~ /^DE\s+(.+)\;*/) |
181 |
- if (description == '') |
182 |
- description = $1 |
183 |
- description.sub!(/RecName: Full=/,'sp=') |
184 |
- description.sub!(/SubName: Full=/,'tr=') |
185 |
- end |
186 |
- if (line =~ /Flags: Fragment/) |
187 |
- # puts "#{id} #{line}" |
188 |
- print_seq=false |
189 |
- end |
190 |
- elsif (line =~ /^OS\s+(.+)/) |
191 |
- organism_name = $1 |
192 |
- elsif (line =~ /^OG\s+(.+)/) |
193 |
- organelle = $1 |
194 |
- elsif (line =~ /^FT\s+NON_TER\s+/) |
195 |
- print_seq=false |
196 |
- # puts "#{id} NON_TER" |
197 |
- elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/) |
198 |
- print_seq=false |
199 |
- # puts "#{id} NON_CONS" |
200 |
- elsif (line =~ /^\s+([\w\s]+)/) |
201 |
- seq += $1 |
202 |
- elsif (line =~ /^\/\//) |
203 |
- seq.gsub!(/\s*/,'') |
204 |
- if (seq !~ /^M/i) |
205 |
- print_seq=false |
206 |
- end |
207 |
- newseq = false |
208 |
- |
209 |
- if (print_seq) |
210 |
- output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}" |
211 |
- if (!isoform_hash[id].nil?) |
212 |
- output_file.puts isoform_hash[id] |
213 |
- end |
214 |
- end |
215 |
- end |
216 |
- end |
217 |
- end |
218 |
- output_file.close |
219 |
-end |
220 |
- |
221 |
-def load_isoform_hash(file) |
222 |
- |
223 |
- isoform_hash = {} |
224 |
- my_fasta = '' |
225 |
- acc = '' |
226 |
- File.open(file).each do |line| |
227 |
- line.chomp! |
228 |
- if (line =~ /(^>\w+\|(\w+)\-\d\|.+)/) |
229 |
- if (isoform_hash[acc].nil?) |
230 |
- isoform_hash[acc]= "#{my_fasta}\n" |
231 |
- else |
232 |
- isoform_hash[acc]+= "#{my_fasta}\n" |
233 |
- end |
234 |
- my_fasta = "#{$1}\n" |
235 |
- acc = $2 |
236 |
- else |
237 |
- my_fasta += line |
238 |
- end |
239 |
- end |
240 |
- |
241 |
- return isoform_hash |
242 |
-end |
243 |
- |
244 |
-################################################### MAIN |
245 |
- |
246 |
-ROOT_PATH=File.dirname(__FILE__) |
247 |
- |
248 |
-if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB']) |
249 |
- formatted_db_path = ENV['BLASTDB'] |
250 |
-else # otherwise use ROOTPATH + DB |
251 |
- formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs")) |
252 |
-end |
253 |
- |
254 |
-ENV['BLASTDB']=formatted_db_path |
255 |
-puts "Databases will be downloaded at: #{ENV['BLASTDB']}" |
256 |
-puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n" |
257 |
- |
258 |
-my_array = ["human","fungi","invertebrates","mammals","plants","rodents","vertebrates"] |
259 |
-# my_array = ["plants","human"] # used for a shoter test |
260 |
- |
261 |
-conecta_uniprot(my_array, formatted_db_path) |
262 |
-system('gunzip '+File.join(formatted_db_path,'*.gz')) |
263 |
- |
264 |
-isoform_hash = {} |
265 |
-isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta")) |
266 |
- |
267 |
-download_ncrna(formatted_db_path) |
268 |
- |
269 |
-my_array.each do |db_group| |
270 |
- |
271 |
- filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_sprot_#{db_group}.dat"), isoform_hash, formatted_db_path) |
272 |
- filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_trembl_#{db_group}.dat"), isoform_hash, formatted_db_path) |
273 |
- |
274 |
- sp_fasta=File.join(formatted_db_path,"sp_#{db_group}","sp_#{db_group}.fasta") |
275 |
- tr_fasta=File.join(formatted_db_path,"tr_#{db_group}","tr_#{db_group}.fasta") |
276 |
- system("makeblastdb -in #{sp_fasta} -dbtype 'prot' -parse_seqids") |
277 |
- system("makeblastdb -in #{tr_fasta} -dbtype 'prot' -parse_seqids") |
278 |
- |
279 |
-end |
280 |
- |
281 |
-puts "download_fln_dbs.rb has finished" |
282 |
|
283 |
diff --git a/sci-biology/full_lengther_next/full_lengther_next-0.5.7.ebuild b/sci-biology/full_lengther_next/full_lengther_next-0.6.0.ebuild |
284 |
similarity index 82% |
285 |
rename from sci-biology/full_lengther_next/full_lengther_next-0.5.7.ebuild |
286 |
rename to sci-biology/full_lengther_next/full_lengther_next-0.6.0.ebuild |
287 |
index 1f3476f..64d300a 100644 |
288 |
--- a/sci-biology/full_lengther_next/full_lengther_next-0.5.7.ebuild |
289 |
+++ b/sci-biology/full_lengther_next/full_lengther_next-0.6.0.ebuild |
290 |
@@ -1,4 +1,4 @@ |
291 |
-# Copyright 1999-2015 Gentoo Foundation |
292 |
+# Copyright 1999-2016 Gentoo Foundation |
293 |
# Distributed under the terms of the GNU General Public License v2 |
294 |
# $Id$ |
295 |
|
296 |
@@ -32,8 +32,3 @@ RDEPEND="${DEPEND} |
297 |
>=sci-biology/scbi_plot-0.0.6 |
298 |
>=dev-ruby/xml-simple-1.0.12 |
299 |
>=dev-ruby/gnuplot-2.3.0" |
300 |
- |
301 |
-#src_prepare(){ |
302 |
-# cp "${FILESDIR}"/download_fln_dbs.rb all/full_lengther_next-0.0.8/bin || die |
303 |
-# chmod a+rx all/full_lengther_next-0.0.8/bin/*.rb || die |
304 |
-#} |