Gentoo Archives: gentoo-commits

From: "Michał Górny" <mgorny@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] repo/gentoo:master commit in: sci-biology/goby-cpp/files/, sci-biology/goby-cpp/
Date: Sun, 10 Dec 2017 00:05:57
Message-Id: 1512864328.9410f41fefdcd2fb758ae0b6921fb66be1d98378.mgorny@gentoo
1 commit: 9410f41fefdcd2fb758ae0b6921fb66be1d98378
2 Author: Michał Górny <mgorny <AT> gentoo <DOT> org>
3 AuthorDate: Sat Dec 9 23:20:58 2017 +0000
4 Commit: Michał Górny <mgorny <AT> gentoo <DOT> org>
5 CommitDate: Sun Dec 10 00:05:28 2017 +0000
6 URL: https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=9410f41f
7
8 sci-biology/goby-cpp: [QA] Move aux files to a distfile
9
10 Closes: https://bugs.gentoo.org/620614
11
12 sci-biology/goby-cpp/Manifest | 7 +-
13 sci-biology/goby-cpp/files/Alignments.proto | 597 ---------------------
14 sci-biology/goby-cpp/files/Reads.proto | 96 ----
15 .../files/goby-cpp-2.0.1-underlinking.patch | 16 -
16 sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild | 9 +-
17 5 files changed, 9 insertions(+), 716 deletions(-)
18
19 diff --git a/sci-biology/goby-cpp/Manifest b/sci-biology/goby-cpp/Manifest
20 index 88622e3c813..2bd240a998a 100644
21 --- a/sci-biology/goby-cpp/Manifest
22 +++ b/sci-biology/goby-cpp/Manifest
23 @@ -1,3 +1,4 @@
24 -DIST goby_1.9.7.3-cpp.zip 127215 SHA256 8493daa7c850732c6c48d4512bd26b7eec411a729b39d9861a4a6aae08faa674 SHA512 56bf190224b6f22e0578cea4cc950e52e746655c75ffc13675276787b4d0ced682f891f6ecf7af3cf124b535ac3afc8711b0ecff44d6fd25fe521de7371c3486 WHIRLPOOL ae7ead1b0364383b46d4ef8b59453146c68384b379c26498fc9b24d014ba096a99723bad42cfeb84d44c20e4fc14882bbad303ab8c981889f90dff88a882c5c0
25 -DIST goby_1.9.8.1-cpp.zip 134904 SHA256 2f1bd87f2870af178f34a8e7c11819aa9e42f35e20f1985d2ceb054f452e2a97 SHA512 d31cd7f0be19074bfe8da74d9f2510f0e0f15fe6c485bbed8520052468d2cd2f1bc5fcad8b0d6a1586f5acde73db326059f45994ecfbb5fb6c09692d8e155190 WHIRLPOOL 6ce51c46f8802d31068f510f6da13b2920086eafdae24506830b42d79e48eb6ed9cac48a96090a81964daebf4a0c8f21c490ca3b0af2f589ac57647bde1be79f
26 -DIST goby_2.0.1-cpp.zip 177718 SHA256 5ec57b833cb1a0f53e975112d1c360b14a9b17cfff3fb0ad77dd70672c1881db SHA512 992bd10d5538dec1478820f26151dd311f4de13e7947b49f0b06d6cbdd4b71deeb3aa8a4c6a598fb92fbcb9cbf4ff97bf81205c9389d4a0da4443317e48aea9f WHIRLPOOL ab94cf674703917b6f0cde812d0fbcd94e18fb6055b30d6a1eefa1e4cb5b76bbe18c67388c66e25e87e522df9a9946b0eae5a164428abe874a382f5bc39a13d0
27 +DIST goby-cpp-2.0.1-files.tar.bz2 8354 BLAKE2B 0169e1bbcdc27f359cde47df708546dd6af0a68334295b247a6aac9122b7e9b1ee590fe0b57052c642b7e25478f5b118c70bec0c4b4af3694ab0f68c1c9ea73a SHA512 6f0cf466688cdbe9fe646cdff78dd0721fd0b0819c354c63e7c39c45895c319754cdadf23aeb9d544b0b2c68f1168583cb541ec160ba7f567fa0218dbad38e1e
28 +DIST goby_1.9.7.3-cpp.zip 127215 BLAKE2B 0673c36b503a6daee5fdaaf96fb415277502c0a49e530eb39983d4718f4a1d8eb9a6ff0a3202413c358600aafc2bf73482be12462f798923c13e19a6bcd590b1 SHA512 56bf190224b6f22e0578cea4cc950e52e746655c75ffc13675276787b4d0ced682f891f6ecf7af3cf124b535ac3afc8711b0ecff44d6fd25fe521de7371c3486
29 +DIST goby_1.9.8.1-cpp.zip 134904 BLAKE2B 800f3bcbe9f721bfb636f514630fb1ceba3a1fe41616f63fc15f9f2a24394ef9be90419ccad0c9bd8b29100eeaea57659ba013042cf4a11b6038fc6dee782619 SHA512 d31cd7f0be19074bfe8da74d9f2510f0e0f15fe6c485bbed8520052468d2cd2f1bc5fcad8b0d6a1586f5acde73db326059f45994ecfbb5fb6c09692d8e155190
30 +DIST goby_2.0.1-cpp.zip 177718 BLAKE2B 666b50fdc199693f8a4f9b6007f6609e91ab6093b643da88e580c9a3438a150cd7be78d2b5dcdd2fe905263d32ebbac1e0e47dbc637fd5d59f877e7cbdaaeeb2 SHA512 992bd10d5538dec1478820f26151dd311f4de13e7947b49f0b06d6cbdd4b71deeb3aa8a4c6a598fb92fbcb9cbf4ff97bf81205c9389d4a0da4443317e48aea9f
31
32 diff --git a/sci-biology/goby-cpp/files/Alignments.proto b/sci-biology/goby-cpp/files/Alignments.proto
33 deleted file mode 100644
34 index fe7f5664764..00000000000
35 --- a/sci-biology/goby-cpp/files/Alignments.proto
36 +++ /dev/null
37 @@ -1,597 +0,0 @@
38 -package goby;
39 -
40 -option java_package = "edu.cornell.med.icb.goby.alignments";
41 -
42 -option optimize_for = SPEED;
43 -
44 -/*
45 - This message is written to 'basename'.entries as a very large chunked collection.
46 -*/
47 -message AlignmentCollection {
48 - repeated AlignmentEntry alignment_entries = 1;
49 -}
50 -
51 -
52 -message AlignmentEntry {
53 - /* Multiplicity of this entry. The number of times this alignment entry would be repeated exactly the same if
54 - query redundancy had not been removed by read factorization.
55 - */
56 - optional uint32 multiplicity = 7;
57 -
58 - /*
59 - Compressed stream of data. Removed since Goby 2.0 supports chunk codecs. Do not reuse field index 23
60 - optional bytes compressed_data = 23;
61 - */
62 -
63 - /* An integer that uniquely identifies the query (a short read) in a set of alignment runs. When several
64 - alignment runs are made with the same set of query sequences, equality of query index means that the query
65 - sequences were the same. (Comparing integers for equality is much faster than comparing strings.)
66 - This field is required (enforced by semantic validation in Goby 2.0+).
67 - */
68 - optional uint32 query_index = 1;
69 - /* An integer that uniquely identifies the target (e.g., a chromosome) in a set of alignment runs. When several
70 - alignment runs are made with the same set of target sequences, equality of target index means that the target
71 - sequence was the same across the runs. (Comparing integers for equality is much faster than comparing strings.)
72 - This field is required (enforced by semantic validation in Goby 2.0+).
73 - */
74 - optional uint32 target_index = 2;
75 - /*
76 - The position on the target of the start of the alignment between the query and the target.
77 - In the following example, position is 3 because the third base of the query 'C' was aligned with
78 - position 3 of the reference (two read bases were soft clipped: "ct"). This example shows that the
79 - alignment can start at a mismatch if it was so constructed by the aligner.
80 -
81 - 0123456789
82 - AAAAGTCAAA target
83 - ctCGTC query
84 - This field is required (enforced by semantic validation in Goby 2.0+).
85 - */
86 - optional uint32 position = 3;
87 -
88 - /*
89 - True when the query matches the target on the reverse strand
90 - */
91 - optional bool matching_reverse_strand = 6;
92 -
93 - /*
94 - The position on the query where the alignment starts. This value is different from zero
95 - when some bases/residues of the query could not be aligned with the target.
96 - TODO: Rename this to left_trim. Add a right_trim property.
97 - */
98 - optional uint32 query_position = 5;
99 -
100 - /*
101 - The score of the alignment, where larger scores indicate better matches between the query and the target.
102 - If an aligner outputs only the number of mismatches between query and target, the score is taken to be
103 - -(#mismatches(query,target)).
104 - */
105 - optional float score = 4;
106 -
107 - /*
108 - Number of bases/residues that differ in the alignment between query and target sequences.
109 - */
110 - optional uint32 number_of_mismatches = 8;
111 -
112 - /*
113 - Cumulative number of insertions and/or deletions present in the alignment.
114 - */
115 - optional uint32 number_of_indels = 9;
116 -
117 - /*
118 - Number of bases that have been aligned for the query. Please note that query_aligned_length must be
119 - less or equal to query_length.
120 - */
121 - optional uint32 query_aligned_length = 11;
122 -
123 - /*
124 - Number of bases that have been aligned for the target.
125 - */
126 - optional uint32 target_aligned_length = 12;
127 -
128 - repeated SequenceVariation sequence_variations = 13;
129 -
130 - /*
131 - Length of the query sequence.
132 - */
133 - optional uint32 query_length = 10;
134 - /*
135 - Mapping Quality (phred-scaled posterior probability that the mapping
136 - position of this read is incorrect). Please note that different aligners
137 - may estimate mapping quality with different approaches, resulting in aligner
138 - specific differences in the distribution of mapping quality. It is recommended
139 - to condition mapping quality on the aligner that produced the specific alignment
140 - being processed. See aligner name and version in the header.
141 - Note that the following description is preliminary. A clear specification is
142 - needed:
143 - The mapping quality should be proportional to the
144 - log of the probability that the given mapping is the "correct" one.
145 - So if there are five equally good mappings of a read to the genome,
146 - the probability of each would be 0.2, and the mapping quality would be
147 - something like -10*log10(1-0.2) = 1. If a mapping is highly likely,
148 - say a 1e-4 of it being wrong, then the mapping quality would be
149 - -10*log10(1e-4) = 40.
150 - */
151 - optional int32 mapping_quality = 14;
152 -
153 - /*
154 - If this read was aligned with a pair, the flags for the pair alignment (based on SAM):
155 - 000000001 paired
156 - 000000010 properly paired
157 - 000000100 read unmapped
158 - 000001000 mate unmapped
159 - 000010000 read reverse strand
160 - 000100000 mate reverse strand
161 - 001000000 first in pair
162 - 010000000 second in pair
163 - 100000000 not primary alignment
164 - */
165 - optional uint32 pair_flags = 15;
166 -
167 - /*
168 - If there is an alignment entry for the paired read (the paired read was mapped), a link to the entry is given.
169 - */
170 - optional RelatedAlignmentEntry pair_alignment_link = 16;
171 -
172 - /* Index of the read fragment from which this alignment was obtained. */
173 - optional uint32 fragment_index = 17;
174 -
175 - /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two or more
176 - alignment entries, one for each matching part of the read, and link these entries with
177 - spliced_alignment_links. The field spliced_forward_alignment_link points to the next
178 - AlignmentEntry in the chain of spliced alignments.
179 - */
180 - optional RelatedAlignmentEntry spliced_forward_alignment_link = 18;
181 -
182 - /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two or more
183 - alignment entries, one for each matching part of the read, and link these entries with
184 - spliced_alignment_links. The field spliced_backward_alignment_link points to the previous
185 - AlignmentEntry in the chain of spliced alignments.
186 - */
187 - optional RelatedAlignmentEntry spliced_backward_alignment_link = 22;
188 -
189 - /*
190 - If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two alignment entries, one for each
191 - matching part of the read, and flag describes the spliced_alignment_link with these
192 - binary flags:
193 - 000000001 normal
194 - 000000010 novel
195 - */
196 - optional uint32 spliced_flags = 19;
197 -
198 - /* The size of the insert used when making the sequence library. This is the total size of the DNA
199 - fragment to sequence, without the adapters. This is not the length of sequence that separates the reads.
200 - See http://seqanswers.com/forums/showthread.php?t=8730 for details. Insert size is inferred for each pair
201 - of reads by the aligner and is recorded here if was estimated (i.e., for paired-end reads).
202 - */
203 - optional sint32 insert_size = 20;
204 -
205 - /*
206 - The sample index. Uniquely identifies the aligned sample this read was read from. Storing the sample index in the
207 - alignment entry makes it possible to concat alignments from different origins and track what sample originally
208 - contained each entry.
209 - */
210 - optional uint32 sample_index = 21;
211 - /*
212 - The total number of times the query index associated with this entry occurs across the entire alignment file.
213 -
214 - This field is used to purge queryIndex->smallIndex associations after all instances of a queryindex have
215 - been seen (see QueryIndexPermutation class). When each entry has a value for this field, the header field
216 - query_index_occurrences is true.
217 - This field is required (enforced by semantic validation in Goby 2.0+).
218 - */
219 - optional uint32 query_index_occurrences = 25;
220 - /*
221 - The total number of times the read matches the reference across the entire alignment file. This differs from
222 - query_index_occurrences because reads that are matching through splice and pair links count as one for ambiguity.
223 - The field can be used to filter by ambiguity-threshold on the fly after an alignment has been done (to restrict
224 - entries to more smaller thresholds). When each entry has a value for this field, the header field
225 - ambiguity_stored_in_entries is true.
226 -
227 - This field is required (enforced by semantic validation in Goby 2.0+).
228 - */
229 - optional uint32 ambiguity = 27;
230 - /*
231 - List of BAM attributes, if the alignment was imported from BAM. The attributes are stored in exactly the format
232 - allowed for BAM. For instance, X0:i:9 X1:i:1 MD:Z:68 RG:Z:SRR084825 will be stored as four strings:
233 - "X0:i:9", "X1:i:1", "MD:Z:68", "RG:Z:SRR084825". Note that sam-to-compact will interpret some BAM attributes
234 - and populate goby native fields. Such tags do not appear in bam_attributes, and are instead re-generated from
235 - the corresponding goby native fields.
236 - Since Goby 2.0.
237 - */
238 - repeated string bam_attributes = 50;
239 - /*
240 - Quality scores for all bases of the read.
241 - Since Goby 2.0.
242 - */
243 - optional bytes read_quality_scores = 55;
244 -
245 - /*
246 - Origin index. An integer that references a ReadOriginInfo message in the alignment header and
247 - makes it possible to track the origin of the read (especially useful after several alignments
248 - have been merged/concatenated).
249 - (Since Goby 2.0).
250 - */
251 - optional uint32 read_origin_index = 26;
252 - /*
253 - Bases that an aligner considered do not belong to the alignment of the read to the reference. Potentially
254 - erroneous bases, or bases that belong to a different part of the reference genome. Left clipped bases are
255 - stored in this field as character bases, or as an equal sign character '=' when the clipped base did match
256 - the reference base. For instance "A=G" for three soft-clipped bases, the middle one matching the genome at
257 - this position. The number of bases in softClippedBasesLeft is exactly equal to queryPosition.
258 - */
259 - optional string softClippedBasesLeft = 30;
260 - /*
261 - Bases that an aligner considered do not belong to the alignment of the read to the reference. Potentially
262 - erroneous bases, or bases that belong to a different part of the reference genome. Right clipped bases are
263 - stored in this field as character bases, or as an equal sign character '=' when the clipped base did match
264 - the reference base. The number of bases in softClippedBasesRight is exactly equal
265 - to queryLength - queryAlignedLength - queryPosition.
266 - */
267 - optional string softClippedBasesRight = 31;
268 -
269 - /*
270 - Quality scores for bases in softClippedBasesLeft. Stored in Phred Units.
271 - */
272 - optional bytes softClippedQualityLeft = 32;
273 - /*
274 - Quality scores for bases in softClippedBasesRight. Stored in Phred Units.
275 - */
276 - optional bytes softClippedQualityRight = 33;
277 - /*
278 - Sequence for a read placed near this entry, but unmapped to the reference sequence. For instance, used to record
279 - the sequence of a mate that did not map to the reference. We know that the mate maps in the proximity of this entry
280 - (it is placed) but are unable to map it to a specific genomic position. The sequence is always given as obtained
281 - from the reads file.
282 - */
283 - optional string placedUnmappedSequence=40;
284 - /*
285 - Quality scores for a read placed near this entry. Phred units.
286 - */
287 - optional bytes placedUnmappedQuality=41;
288 -
289 - /*
290 - Read name. In SAM/BAM this is referred to as QNAME. Paired and segmented reads will have the same Read name.
291 - */
292 - optional string readName=42;
293 -}
294 -
295 -/* A link to another alignment entry. This message type is used to represent relations
296 - between alignments, such as the relation between the two read fragments in a paired-end protocol,
297 - or the relation between parts of reads that align through an exon exon junction and map in
298 - different locations of the genome.
299 - */
300 -message RelatedAlignmentEntry {
301 - /* Target index of the location where the other alignment entry is mapped.
302 - This field is required (enforced by semantic validation in Goby 2.0+).
303 - */
304 - optional uint32 target_index = 1;
305 -
306 - /* Position on the reference where the other alignment entry is mapped. *
307 - This field is required (enforced by semantic validation in Goby 2.0+).
308 - */
309 - optional uint32 position = 2;
310 -
311 - /* Index of the fragment for the related alignment entry. This index
312 - makes it possible to identify which of the read fragments mapped to the given
313 - location is related to the source alignment entry.
314 - This field is required (enforced by semantic validation in Goby 2.0+).
315 - */
316 - optional uint32 fragment_index = 3;
317 -
318 - optional uint32 optimized_index=50;
319 -}
320 -
321 -/*
322 - Represents sequence variations between the query and the reference sequences. Many variations can be represented.
323 - For instance, an insertion at position 5 in the reference would be represented as from="A", to="" position=5.
324 - A mutation T->G at position 6 would be rendered as from="T", to="G" position=6. Padded alignments (see SAM description)
325 - can be described by a combination of pair-wise alignments, where the gap character '-' is used to indicate that no
326 - base exists in the sequence considered for the alignment position, for instance:
327 -
328 - - Padding example:
329 -
330 - 123 (<-positions)
331 -ref A-C
332 - A-T [from="-" to="" position=2] [from="C" to="T" position=3]
333 - ACT [from="" to="C" position=2] [from="C" to="T" position=3]
334 - A-T [from="-" to="" position=2] [from="C" to="T" position=3]
335 -
336 - - Mutation example:
337 - 123 (<-positions)
338 -ref ATT
339 - ACT [from="T" to="C" position=2]
340 -
341 - -- Example of deletion in a read:
342 - 123 (<-positions)
343 -ref ATT
344 - A-T [from="T" to="-" position=2]
345 -
346 - -- Example of insertion of two base pairs in a read:
347 - 12345 (<-positions)
348 -ref A--TT
349 - ACCTT [from="" to="CC" position=2]
350 -
351 - */
352 -message SequenceVariation {
353 - /* The reference bases. Can include one or more gap characters '-', to indicate that the reference sequence has
354 - no base at this alignment position.
355 - This field is required (enforced by semantic validation in Goby 2.0+).
356 - */
357 - optional string from = 2;
358 - /* The read bases that differ from the reference sequence. Can include one or more gap characters '-', to indicate
359 - that the query sequence has no base at this alignment position.
360 - This field is required (enforced by semantic validation in Goby 2.0+).
361 - */
362 - optional string to = 1;
363 - /*
364 - The position of the variation on the read, as if the read always matched on the forward strand.
365 - Adding position to the index where the reference starts aligning the read yields the position of the variation
366 - in reference/target sequence space. Since position starts at one the resulting position will also be one based.
367 - This field is required (enforced by semantic validation in Goby 2.0+).
368 - */
369 - optional uint32 position = 3;
370 - /*
371 - The position of the variation, starting from the beginning of the aligned read (position 1), and up to the length
372 - of the read (inclusive). Use this index if you need to know how far the variation is observed from the beginning
373 - of the sequenced read. When the read has an insertion, this index records the position immediately before the base
374 - where the bases are inserted (these bases are in the to field).
375 - When the read has a deletion, read_index records the position in the read after which the bases that would align
376 - in the reference are missing (these bases are in the from field).
377 - This field is required (enforced by semantic validation in Goby 2.0+).
378 - */
379 - optional uint32 read_index = 5;
380 -
381 - /**
382 - The read base quality scores for those bases that are given in the to field. This field
383 - is populated when the reads used to perform the search include quality scores, and when
384 - the alignment parser can extract the information from the aligner's output.
385 - (this option is currently not implemented in Goby.)
386 - */
387 - optional bytes to_quality = 4;
388 -
389 -}
390 -/*
391 - This message is written to 'basename'.header
392 -*/
393 -
394 -message AlignmentHeader {
395 - /*
396 - The smallest possible query index in this alignment. Data stored as an array where
397 - queryIndex is the array index will be stored with only the elements in the inclusive
398 - range [smallestSplitQueryIndex largestSplitQueryIndex]
399 - Such data structures include queryLength and some arrays in the TooManyHits data
400 - structure.
401 - */
402 - optional uint32 smallest_split_query_index = 9;
403 - /*
404 - The largest possible query index in this alignment. Data stored as an array where
405 - queryIndex is the array index will be stored with only the elements in the inclusive
406 - range [smallestSplitQueryIndex largestSplitQueryIndex]
407 - Such data structures include queryLength and some arrays in the TooManyHits data
408 - structure.
409 - */
410 - optional uint32 largest_split_query_index = 11;
411 -
412 - /* Mapping from query identifier name to query index (as used in alignment entries).
413 - */
414 - optional IdentifierMapping query_name_mapping = 1;
415 -
416 - /* Mapping from target identifier name to target index (as used in alignment entries).
417 - */
418 - optional IdentifierMapping target_name_mapping = 2;
419 -
420 - /*
421 - The number of query sequences
422 - */
423 - optional uint32 number_of_queries = 5;
424 - /*
425 - The number of target sequences
426 - */
427 - optional uint32 number_of_targets = 6;
428 - /*
429 - The number of reads that were aligned to the reference and are represented in this alignment archive.
430 - */
431 - optional uint32 number_of_aligned_reads = 7;
432 -
433 - /*
434 - Length of the query sequences. One number per query, in the order of increasing query index.
435 - This information has been moved to the individual alignment entries.
436 - */
437 - repeated uint32 query_length = 3 [deprecated = true];
438 - /*
439 - If query length is constant across all the queries, this field contains the constant length.
440 - In such cases, query_length will be empty.
441 - */
442 - optional uint32 constant_query_length = 10;
443 -
444 - /*
445 - Length of the target sequences. One number per target, in the order of increasing target index.
446 - The target indexes must be 0..(number of targets - 1).
447 - */
448 - repeated uint32 target_length = 8;
449 - /*
450 - Indicates whether this alignment is sorted by position. True: the alignment entries occur in sorted
451 - order, such that entry a occurs before entry b if a.targetIndex< b.targetIndex or, when entries
452 - have the same target, when a.position < b.position.
453 - */
454 - optional bool sorted = 13;
455 -
456 - /*
457 - Indicates whether this alignment is indexed by position. When this attribute is true, a file called
458 - 'basename'.index exists that contains the AlignmentIndex message (GZip compressed).
459 - */
460 - optional bool indexed = 14;
461 - /*
462 - True when query lengths are stored in alignment entries (Goby 1.7+).
463 - */
464 - optional bool query_lengths_stored_in_entries = 15;
465 - /*
466 - Name of the aligner that produced this alignment.
467 - */
468 - optional string aligner_name = 17;
469 - /*
470 - Version number for the aligner implementation that produced this alignment.
471 - */
472 - optional string aligner_version = 18;
473 - /*
474 - The version of Goby that created this alignment file.
475 - */
476 - optional string version = 25;
477 -
478 - /*
479 - Sample basenames, in the order of increasing sampleIndex, starting with sampleIndex=0.
480 - */
481 -
482 - repeated string sample_basename = 30;
483 -
484 - /*
485 - This field is true when the query indices of alignment entries were permuted to smaller indices. Only sorted
486 - alignments can have query_indices_were_permuted=true. When the field is true, and you need to retrieve the
487 - original query-index of an alignment (because you want to retrieve the specific read(s) from a read file for
488 - instance), you will need the information in the permutation file (extension basename.perm) and transform back
489 - each small index of interest to the original query index.
490 - */
491 - optional bool query_indices_were_permuted = 26;
492 - /*
493 - This field is true when entries in the alignment .entries file all have the query_index_occurrences field populated
494 - (Since Goby 2.0).
495 - */
496 - optional bool query_index_occurrences = 35;
497 -
498 - /*
499 - This field is true when entries in the alignment .entries file all have the ambiguity field populated
500 - (Since Goby 2.0).
501 - */
502 - optional bool ambiguity_stored_in_entries = 36;
503 - /*
504 - This field is true when entries in the alignment .entries file all have the read_quality_score field populated.
505 - (Since Goby 2.0).
506 - */
507 - optional bool all_read_quality_scores = 40;
508 - /*
509 - A description of the origin of sets of reads. Serves a similar function to BAM read groups, but more flexible and
510 - efficient. Instead of storing strings, we use integers in the entries.
511 - Alignemnt entries will link to a specific ReadOriginInfo with the origin_index field.
512 - (Since Goby 2.0).
513 - */
514 - repeated ReadOriginInfo read_origin = 27;
515 -}
516 -
517 -message IdentifierMapping {
518 - repeated IdentifierInfo mappings = 1;
519 -}
520 -
521 -message IdentifierInfo {
522 - required string name = 1;
523 - required uint32 index = 2;
524 -}
525 -
526 -
527 -/*
528 - A description of the origin of sets of reads. Stored in the Goby alignment header and linked
529 - from alignment entries. Goby makes it possible to adapt origin equivalence rules on the fly
530 - efficiently. To do this, it is sufficient to read the header of the alignment, decide which
531 - ReadOriginInfo instances are equivalent (e.g., by looking at sample, platform, library, or
532 - other fields in the message), then construct a function e(a):int. This function takes
533 - one originIndex parameter and returns another integer that maps to an equivalent class. The
534 - equivalence class can be used to estimate error models for entries that belong to each class,
535 - for instance.
536 - (Since Goby 2.0).
537 - */
538 -message ReadOriginInfo {
539 - /*
540 - Origin index. An integer that links alignment entries to their origin information.
541 - */
542 - required uint32 origin_index = 1;
543 - /*
544 - Identifier that describes the origin of the reads. This field is compatible with the ID/platform field of BAM read
545 - groups. Free text.
546 - */
547 - required string origin_id = 2;
548 - /*
549 - The sample from which the reads were sequenced. This field is compatible with the SM/sample field of BAM read
550 - groups. Free text.
551 - */
552 - optional string sample = 4;
553 - /*
554 - The platform on which the reads were sequenced. This field is compatible with the PL/platform field of BAM read
555 - groups. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.
556 - */
557 - optional string platform = 5;
558 - /*
559 - The library from which the reads were sequenced. This field is compatible with the LB/library field of BAM read
560 - groups. Free text.
561 - */
562 - optional string library = 8;
563 - /*
564 - The platform unit on which the reads were sequenced. This field for compatibility with samtools.
565 - */
566 - optional string platform_unit = 12;
567 - /*
568 - The date the reads were sequenced. Useful to identify batch effects, in the format dd:MMM:yyyy.
569 - The month is Jan, Feb, etc. to avoid all confusion with days when day<=12.
570 - */
571 - optional string run_date = 6;
572 -}
573 -
574 -/*
575 - This message is written to 'basename'.tmh
576 -*/
577 -
578 -message AlignmentTooManyHits {
579 - /*
580 - The threshold used by the aligner to determine that a query is ambiguous and should be dropped.
581 - Referred to as parameter k below.
582 - */
583 - required uint32 aligner_threshold = 2;
584 - /*
585 - The hits that are assigned to several (>k) reference location.
586 - */
587 - repeated AmbiguousLocation hits = 1;
588 -
589 -}
590 -
591 -message AmbiguousLocation {
592 - /*
593 - The index of the query that matched too many times.
594 - */
595 - required uint32 query_index = 1;
596 - /*
597 - The number of hits that triggered membership in the too many hits list. The query may hit more
598 - locations than reported here, since some alignment tools will just drop queries that match above
599 - a threshold and stop counting. This number can be >=k.
600 - */
601 - required uint32 at_least_number_of_hits = 2;
602 - /**
603 -The length of the part of the query sequence that could be matched to the target (also called depth).
604 -May be less than the length of the query sequence, in which case the match was not perfect. When merging
605 -alignments produced by searching different reference sequences, consider only at_least_number_of_hits
606 -from alignments that have exactly the longer depth for the query. */
607 - optional uint32 length_of_match = 3;
608 -}
609 -
610 -/*
611 - This message is written to 'basename'.index
612 - */
613 -message AlignmentIndex {
614 - /*
615 - Stores one element by target sequence. Each element is the cumulative target length for the target
616 - stored at index i. Assume there are four target sequences, with lengths {10, 12, 15, 34}. The field
617 - targetPositionOffsets will contain: {0,10,22,37}. Such offsets can be used to calculate the absolute
618 - position of a genomic location. Given targetIndex and positionOnReference, the absolute location
619 - is defined as targetPositionOffsets[targetIndex]+positionOnReference.
620 - */
621 - repeated uint32 target_position_offsets = 1 [packed = true];
622 - /*
623 - The byte offsets into the compressed entries file. Byte offsets are matched with absolute position
624 - by index. There should be as many elements in offsets as there are in absolutePosition
625 - where chunks start which represent entries whose absolute positions are less than
626 - */
627 - repeated uint64 offsets = 2 [packed = true];
628 - /*
629 - The absolute positions of the first entry in the chunk that immediately start at offset. One element
630 - per chunk in the 'basename'.entries file.
631 - */
632 - repeated uint64 absolute_positions = 3 [packed = true];
633 -
634 -}
635
636 diff --git a/sci-biology/goby-cpp/files/Reads.proto b/sci-biology/goby-cpp/files/Reads.proto
637 deleted file mode 100644
638 index 32c1244a3eb..00000000000
639 --- a/sci-biology/goby-cpp/files/Reads.proto
640 +++ /dev/null
641 @@ -1,96 +0,0 @@
642 -package goby;
643 -
644 -option java_package = "edu.cornell.med.icb.goby.reads";
645 -option optimize_for = SPEED;
646 -
647 -message ReadCollection {
648 - repeated ReadEntry reads = 1;
649 -}
650 -
651 -message ReadEntry {
652 - /*
653 - Index of a read.
654 - */
655 - required uint32 read_index = 1;
656 - /*
657 - Index of the barcode, if any.
658 - */
659 - optional uint32 barcode_index = 10;
660 - /*
661 - Read identifier/name may be present.
662 - */
663 - optional string read_identifier = 23;
664 - /*
665 - Additional description about the read (from Fasta/Q format).
666 - */
667 - optional string description = 22;
668 - /*
669 - Length of the sequence.
670 - */
671 - required uint32 read_length = 2;
672 - /*
673 - Sequence, encoded as ascii characters stored in single bytes.
674 - */
675 - optional bytes sequence = 3;
676 - /*
677 - The second sequence in a pair. Stored the same way as the sequence attribute.
678 - */
679 - optional bytes sequence_pair = 5;
680 - /*
681 - Length of the second sequence in a pair.
682 - */
683 - optional uint32 read_length_pair = 6;
684 - /*
685 - Quality scores in Phred units, stored as single bytes (0-255).
686 - */
687 - optional bytes quality_scores = 4;
688 - /*
689 - Quality scores for the second sequence in a pair. Stored as the 'qualityScores' attribute.
690 - */
691 - optional bytes quality_scores_pair = 7;
692 - /*
693 - Compressed stream of data. The first byte indicates the compression/decompression method (codec). The remaining bytes are
694 - content compressed with the codec.
695 - */
696 - optional bytes compressed_data = 8;
697 - /*
698 - Stores meta-data about the reads. Typically meta-data is stored in the very first read of a
699 - read collection, with the understanding that the meta-data applies to all the reads in the
700 - collection. Meta-data can be used to store information about when the sample was sequenced,
701 - or other information of interest. The key-value pair format is sufficiently flexible to
702 - accommodate a variety of needs. The following keys are pre-defined. Please use pre-defined
703 - keys so that automated tools can use metadata in a relatively standard way. Please note that
704 - some keys provide a format for the value. This format should also be followed to guarantee
705 - that meta data can be used computationally in a fully automatic manner.
706 -
707 - key="sequencing-run-start-date" value="MM/DD/YYYY" Used to record when the sequencing run
708 - was initiated on the instrument. Can be used to detect batch effect in a large set of samples.
709 - key="platform" value="<free-text>". Value is free text, but the following terms are pre-defined.
710 - Illumina GaIIx
711 - Illumina HiSeq 1000
712 - Illumina HiSeq 2000
713 - Helicos Heliscope
714 - LifeTech 5500 SOLiD
715 - LifeTech 5500xl SOLiD
716 - Roche 454 GS FLX Ti
717 -
718 - key="organism" value="species name"
719 - Since Goby 1.9.1
720 - */
721 - repeated MetaData meta_data = 25;
722 -
723 -}
724 -/*
725 - A message to store a key/value pair and represent metadata about reads.
726 - Since Goby 1.9.1
727 - */
728 -message MetaData {
729 - /*
730 - Provides the key. See examples in the documentation of meta_data for ReadEntry.
731 - */
732 - required string key=1;
733 - /*
734 - Describes the value associated with the key. See examples in the documentation of meta_data for ReadEntry.
735 - */
736 - required string value=2;
737 -}
738
739 diff --git a/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch b/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch
740 deleted file mode 100644
741 index 415785466af..00000000000
742 --- a/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch
743 +++ /dev/null
744 @@ -1,16 +0,0 @@
745 - src/Makefile.am | 2 +-
746 - 1 file changed, 1 insertion(+), 1 deletion(-)
747 -
748 -diff --git a/src/Makefile.am b/src/Makefile.am
749 -index 1033382..33ca906 100644
750 ---- a/src/Makefile.am
751 -+++ b/src/Makefile.am
752 -@@ -84,7 +84,7 @@ GobyReadsStats_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_D
753 - GobyReadsStats_SOURCES = \
754 - GobyReadsStats.cc
755 -
756 --GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB}
757 -+GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB} -lz
758 - GobyFastaToCompact_SOURCES = \
759 - GobyFastaToCompact.cc
760 -
761
762 diff --git a/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild b/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild
763 index fcf8971fceb..e74dd6ecede 100644
764 --- a/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild
765 +++ b/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild
766 @@ -1,4 +1,4 @@
767 -# Copyright 1999-2015 Gentoo Foundation
768 +# Copyright 1999-2017 Gentoo Foundation
769 # Distributed under the terms of the GNU General Public License v2
770
771 EAPI=5
772 @@ -9,7 +9,8 @@ inherit autotools-utils
773
774 DESCRIPTION="A DNA sequencing data management framework - C/C++ API"
775 HOMEPAGE="http://campagnelab.org/software/goby/"
776 -SRC_URI="http://chagall.med.cornell.edu/goby/releases/archive/release-goby_${PV}/goby_${PV}-cpp.zip"
777 +SRC_URI="http://chagall.med.cornell.edu/goby/releases/archive/release-goby_${PV}/goby_${PV}-cpp.zip
778 + https://dev.gentoo.org/~mgorny/dist/${P}-files.tar.bz2"
779
780 LICENSE="GPL-3"
781 SLOT="0"
782 @@ -24,7 +25,7 @@ RDEPEND="${DEPEND}"
783 S="${WORKDIR}/${PV}/cpp"
784
785 PATCHES=(
786 - "${FILESDIR}"/${P}-underlinking.patch
787 + "${WORKDIR}"/${P}-files/${P}-underlinking.patch
788 )
789
790 src_prepare() {
791 @@ -33,7 +34,7 @@ src_prepare() {
792 -i src/Makefile.am || die
793
794 pushd src/goby > /dev/null || die
795 - cp "${FILESDIR}"/*.proto . || die
796 + cp "${WORKDIR}"/${P}-files/*.proto . || die
797 protoc --cpp_out=. *.proto || die
798 popd > /dev/null || die