Gentoo Archives: gentoo-commits

From: Martin Mokrejs <mmokrejs@×××××××××××××××.cz>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/sci:master commit in: sci-biology/full_lengther_next/files/, sci-biology/full_lengther_next/
Date: Tue, 26 Apr 2016 12:08:14
Message-Id: 1461672470.85fe7624385b5d424a1d250106a55c4aad74217f.mmokrejs@gentoo
1 commit: 85fe7624385b5d424a1d250106a55c4aad74217f
2 Author: Martin Mokrejš <mmokrejs <AT> fold <DOT> natur <DOT> cuni <DOT> cz>
3 AuthorDate: Tue Apr 26 12:07:50 2016 +0000
4 Commit: Martin Mokrejs <mmokrejs <AT> fold <DOT> natur <DOT> cuni <DOT> cz>
5 CommitDate: Tue Apr 26 12:07:50 2016 +0000
6 URL: https://gitweb.gentoo.org/proj/sci.git/commit/?id=85fe7624
7
8 sci-biology/full_lengther_next: version bump
9
10 Package-Manager: portage-2.2.28
11
12 .../full_lengther_next/files/download_fln_dbs.rb | 260 ---------------------
13 ....5.7.ebuild => full_lengther_next-0.6.0.ebuild} | 7 +-
14 2 files changed, 1 insertion(+), 266 deletions(-)
15
16 diff --git a/sci-biology/full_lengther_next/files/download_fln_dbs.rb b/sci-biology/full_lengther_next/files/download_fln_dbs.rb
17 deleted file mode 100644
18 index 90eacbb..0000000
19 --- a/sci-biology/full_lengther_next/files/download_fln_dbs.rb
20 +++ /dev/null
21 @@ -1,260 +0,0 @@
22 -#!/usr/bin/env ruby
23 -
24 -# 15-2-2011 Noe Fernandez-Pozo
25 -# Script to download Full-LengtherNext databases.
26 -# Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL.
27 -
28 -require 'net/ftp'
29 -require 'open-uri'
30 -
31 -class FtpClient
32 -
33 -def initialize
34 -end
35 -
36 -def connect(server)
37 - @server=server
38 -end
39 -
40 -def login
41 -
42 -end
43 -
44 -def chdir(dir)
45 - @dir=dir
46 -end
47 -
48 -def getbinaryfile(file,output_file)
49 - if !File.exists?(output_file) && !File.exists?(output_file.gsub('.gz',''))
50 - puts " - Downloading"
51 - cmd="wget #{@server}/#{@dir}/#{file} -O #{output_file}"
52 - system(cmd)
53 - else
54 - puts "File #{output_file}, or #{output_file.gsub('.gz','')} already exists. Skip download"
55 - end
56 -
57 -end
58 -
59 -def close
60 -end
61 -
62 -end
63 -################################################### Functions
64 -
65 -def download_ncrna(formatted_db_path)
66 -
67 - if !File.exists?(File.join(formatted_db_path, "nc_rna_db"))
68 - Dir.mkdir(File.join(formatted_db_path, "nc_rna_db"))
69 - end
70 -
71 - puts "Downloading ncRNA database"
72 - open(File.join(formatted_db_path, "nc_rna_db/ncrna_fln_100.fasta.zip"), "wb") do |my_file|
73 - my_file.print open('http://www.scbi.uma.es/downloads/FLNDB/ncrna_fln_100.fasta.zip').read
74 - end
75 - puts "\nncRNA database downloaded"
76 -
77 - ncrna_zip=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta.zip')
78 - ncrna_out_dir=File.join(formatted_db_path,'nc_rna_db')
79 - system("unzip", ncrna_zip, "-d", ncrna_out_dir)
80 - system("rm", ncrna_zip)
81 -
82 - puts "\nncRNA database decompressed"
83 -
84 - ncrna_fasta=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta')
85 - system("makeblastdb", "-in", ncrna_fasta, "-dbtype", "nucl", "-parse_seqids")
86 -
87 - puts "\nncRNA database completed"
88 -end
89 -
90 -def conecta_uniprot(my_array, formatted_db_path)
91 -
92 - #$ftp = Net::FTP.new()
93 - $ftp = FtpClient.new()
94 -
95 - if !File.exists?(formatted_db_path)
96 - Dir.mkdir(formatted_db_path)
97 - end
98 -
99 - $ftp.connect('ftp://ftp.uniprot.org')
100 -
101 - $ftp.login
102 -
103 - puts "connected to UniProt"
104 -
105 - my_array.each do |db_group|
106 - puts "Downloading #{db_group}"
107 - download_uniprot(db_group, formatted_db_path)
108 - end
109 -
110 - varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz')
111 - $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete")
112 - $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out)
113 -
114 - puts "isoform files downloaded"
115 -
116 - $ftp.close
117 -
118 -end
119 -
120 -def download_uniprot(uniprot_group, formatted_db_path)
121 -
122 - sp_out=File.join(formatted_db_path,"uniprot_sprot_#{uniprot_group}.dat.gz")
123 - tr_out=File.join(formatted_db_path,"uniprot_trembl_#{uniprot_group}.dat.gz")
124 - $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions")
125 - puts " from ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_#{uniprot_group}.dat.gz"
126 - puts " from ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_trembl_#{uniprot_group}.dat.gz"
127 - $ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out)
128 - $ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out)
129 -
130 - puts "#{uniprot_group} files downloaded"
131 -
132 -end
133 -
134 -def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path)
135 -
136 - puts "filtering sequences from #{file_name}"
137 -
138 - # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
139 - #
140 - # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
141 - # FT NON_TER 1 1
142 - # FT NON_TER 29 29
143 - # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
144 - # FT NON_CONS 1683 1684
145 - #
146 - # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.
147 -
148 - newseq=false
149 - print_seq=true
150 - id=''
151 - description = ''
152 - organism_name = ''
153 - seq = ''
154 - organelle = ''
155 -
156 - file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/
157 - db_name = $1
158 - output_name = $2
159 - db_name.sub!('sprot','sp')
160 - db_name.sub!('trembl','tr')
161 -
162 - if !File.exists?(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
163 - Dir.mkdir(File.join(formatted_db_path, "#{db_name}_#{output_name}"))
164 - end
165 -
166 - output_file = File.new(File.join(formatted_db_path, "#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta"), "w")
167 -
168 - File.open(file_name).each_line do |line|
169 - if (newseq == false)
170 - if (line =~ /^AC\s+(\w+);/)
171 - id=$1
172 - newseq = true
173 - description = ''
174 - organism_name = ''
175 - seq = ''
176 - print_seq = true
177 - organelle = ''
178 - end
179 - else
180 - if (line =~ /^DE\s+(.+)\;*/)
181 - if (description == '')
182 - description = $1
183 - description.sub!(/RecName: Full=/,'sp=')
184 - description.sub!(/SubName: Full=/,'tr=')
185 - end
186 - if (line =~ /Flags: Fragment/)
187 - # puts "#{id} #{line}"
188 - print_seq=false
189 - end
190 - elsif (line =~ /^OS\s+(.+)/)
191 - organism_name = $1
192 - elsif (line =~ /^OG\s+(.+)/)
193 - organelle = $1
194 - elsif (line =~ /^FT\s+NON_TER\s+/)
195 - print_seq=false
196 - # puts "#{id} NON_TER"
197 - elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/)
198 - print_seq=false
199 - # puts "#{id} NON_CONS"
200 - elsif (line =~ /^\s+([\w\s]+)/)
201 - seq += $1
202 - elsif (line =~ /^\/\//)
203 - seq.gsub!(/\s*/,'')
204 - if (seq !~ /^M/i)
205 - print_seq=false
206 - end
207 - newseq = false
208 -
209 - if (print_seq)
210 - output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
211 - if (!isoform_hash[id].nil?)
212 - output_file.puts isoform_hash[id]
213 - end
214 - end
215 - end
216 - end
217 - end
218 - output_file.close
219 -end
220 -
221 -def load_isoform_hash(file)
222 -
223 - isoform_hash = {}
224 - my_fasta = ''
225 - acc = ''
226 - File.open(file).each do |line|
227 - line.chomp!
228 - if (line =~ /(^>\w+\|(\w+)\-\d\|.+)/)
229 - if (isoform_hash[acc].nil?)
230 - isoform_hash[acc]= "#{my_fasta}\n"
231 - else
232 - isoform_hash[acc]+= "#{my_fasta}\n"
233 - end
234 - my_fasta = "#{$1}\n"
235 - acc = $2
236 - else
237 - my_fasta += line
238 - end
239 - end
240 -
241 - return isoform_hash
242 -end
243 -
244 -################################################### MAIN
245 -
246 -ROOT_PATH=File.dirname(__FILE__)
247 -
248 -if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
249 - formatted_db_path = ENV['BLASTDB']
250 -else # otherwise use ROOTPATH + DB
251 - formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
252 -end
253 -
254 -ENV['BLASTDB']=formatted_db_path
255 -puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
256 -puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n"
257 -
258 -my_array = ["human","fungi","invertebrates","mammals","plants","rodents","vertebrates"]
259 -# my_array = ["plants","human"] # used for a shoter test
260 -
261 -conecta_uniprot(my_array, formatted_db_path)
262 -system('gunzip '+File.join(formatted_db_path,'*.gz'))
263 -
264 -isoform_hash = {}
265 -isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta"))
266 -
267 -download_ncrna(formatted_db_path)
268 -
269 -my_array.each do |db_group|
270 -
271 - filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_sprot_#{db_group}.dat"), isoform_hash, formatted_db_path)
272 - filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_trembl_#{db_group}.dat"), isoform_hash, formatted_db_path)
273 -
274 - sp_fasta=File.join(formatted_db_path,"sp_#{db_group}","sp_#{db_group}.fasta")
275 - tr_fasta=File.join(formatted_db_path,"tr_#{db_group}","tr_#{db_group}.fasta")
276 - system("makeblastdb -in #{sp_fasta} -dbtype 'prot' -parse_seqids")
277 - system("makeblastdb -in #{tr_fasta} -dbtype 'prot' -parse_seqids")
278 -
279 -end
280 -
281 -puts "download_fln_dbs.rb has finished"
282
283 diff --git a/sci-biology/full_lengther_next/full_lengther_next-0.5.7.ebuild b/sci-biology/full_lengther_next/full_lengther_next-0.6.0.ebuild
284 similarity index 82%
285 rename from sci-biology/full_lengther_next/full_lengther_next-0.5.7.ebuild
286 rename to sci-biology/full_lengther_next/full_lengther_next-0.6.0.ebuild
287 index 1f3476f..64d300a 100644
288 --- a/sci-biology/full_lengther_next/full_lengther_next-0.5.7.ebuild
289 +++ b/sci-biology/full_lengther_next/full_lengther_next-0.6.0.ebuild
290 @@ -1,4 +1,4 @@
291 -# Copyright 1999-2015 Gentoo Foundation
292 +# Copyright 1999-2016 Gentoo Foundation
293 # Distributed under the terms of the GNU General Public License v2
294 # $Id$
295
296 @@ -32,8 +32,3 @@ RDEPEND="${DEPEND}
297 >=sci-biology/scbi_plot-0.0.6
298 >=dev-ruby/xml-simple-1.0.12
299 >=dev-ruby/gnuplot-2.3.0"
300 -
301 -#src_prepare(){
302 -# cp "${FILESDIR}"/download_fln_dbs.rb all/full_lengther_next-0.0.8/bin || die
303 -# chmod a+rx all/full_lengther_next-0.0.8/bin/*.rb || die
304 -#}