[gentoo-commits] repo/gentoo:master commit in: sci-biology/goby-cpp/files/, sci-biology/goby-cpp/ - gentoo-commits

From:	"Michał Górny" <mgorny@g.o>
To:	gentoo-commits@l.g.o
Subject:	[gentoo-commits] repo/gentoo:master commit in: sci-biology/goby-cpp/files/, sci-biology/goby-cpp/
Date:	Sun, 10 Dec 2017 00:05:57
Message-Id:	`1512864328.9410f41fefdcd2fb758ae0b6921fb66be1d98378.mgorny@gentoo`

1

commit:     9410f41fefdcd2fb758ae0b6921fb66be1d98378

2

Author:     Michał Górny <mgorny <AT> gentoo <DOT> org>

3

AuthorDate: Sat Dec  9 23:20:58 2017 +0000

4

Commit:     Michał Górny <mgorny <AT> gentoo <DOT> org>

5

CommitDate: Sun Dec 10 00:05:28 2017 +0000

6

URL:        https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=9410f41f

7

8

sci-biology/goby-cpp: [QA] Move aux files to a distfile

9

10

Closes: https://bugs.gentoo.org/620614

11

12

 sci-biology/goby-cpp/Manifest                      |   7 +-

13

 sci-biology/goby-cpp/files/Alignments.proto        | 597 ---------------------

14

 sci-biology/goby-cpp/files/Reads.proto             |  96 ----

15

 .../files/goby-cpp-2.0.1-underlinking.patch        |  16 -

16

 sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild         |   9 +-

17

 5 files changed, 9 insertions(+), 716 deletions(-)

18

19

diff --git a/sci-biology/goby-cpp/Manifest b/sci-biology/goby-cpp/Manifest

20

index 88622e3c813..2bd240a998a 100644

21

--- a/sci-biology/goby-cpp/Manifest

22

+++ b/sci-biology/goby-cpp/Manifest

23

@@ -1,3 +1,4 @@

24

-DIST goby_1.9.7.3-cpp.zip 127215 SHA256 8493daa7c850732c6c48d4512bd26b7eec411a729b39d9861a4a6aae08faa674 SHA512 56bf190224b6f22e0578cea4cc950e52e746655c75ffc13675276787b4d0ced682f891f6ecf7af3cf124b535ac3afc8711b0ecff44d6fd25fe521de7371c3486 WHIRLPOOL ae7ead1b0364383b46d4ef8b59453146c68384b379c26498fc9b24d014ba096a99723bad42cfeb84d44c20e4fc14882bbad303ab8c981889f90dff88a882c5c0

25

-DIST goby_1.9.8.1-cpp.zip 134904 SHA256 2f1bd87f2870af178f34a8e7c11819aa9e42f35e20f1985d2ceb054f452e2a97 SHA512 d31cd7f0be19074bfe8da74d9f2510f0e0f15fe6c485bbed8520052468d2cd2f1bc5fcad8b0d6a1586f5acde73db326059f45994ecfbb5fb6c09692d8e155190 WHIRLPOOL 6ce51c46f8802d31068f510f6da13b2920086eafdae24506830b42d79e48eb6ed9cac48a96090a81964daebf4a0c8f21c490ca3b0af2f589ac57647bde1be79f

26

-DIST goby_2.0.1-cpp.zip 177718 SHA256 5ec57b833cb1a0f53e975112d1c360b14a9b17cfff3fb0ad77dd70672c1881db SHA512 992bd10d5538dec1478820f26151dd311f4de13e7947b49f0b06d6cbdd4b71deeb3aa8a4c6a598fb92fbcb9cbf4ff97bf81205c9389d4a0da4443317e48aea9f WHIRLPOOL ab94cf674703917b6f0cde812d0fbcd94e18fb6055b30d6a1eefa1e4cb5b76bbe18c67388c66e25e87e522df9a9946b0eae5a164428abe874a382f5bc39a13d0

27

+DIST goby-cpp-2.0.1-files.tar.bz2 8354 BLAKE2B 0169e1bbcdc27f359cde47df708546dd6af0a68334295b247a6aac9122b7e9b1ee590fe0b57052c642b7e25478f5b118c70bec0c4b4af3694ab0f68c1c9ea73a SHA512 6f0cf466688cdbe9fe646cdff78dd0721fd0b0819c354c63e7c39c45895c319754cdadf23aeb9d544b0b2c68f1168583cb541ec160ba7f567fa0218dbad38e1e

28

+DIST goby_1.9.7.3-cpp.zip 127215 BLAKE2B 0673c36b503a6daee5fdaaf96fb415277502c0a49e530eb39983d4718f4a1d8eb9a6ff0a3202413c358600aafc2bf73482be12462f798923c13e19a6bcd590b1 SHA512 56bf190224b6f22e0578cea4cc950e52e746655c75ffc13675276787b4d0ced682f891f6ecf7af3cf124b535ac3afc8711b0ecff44d6fd25fe521de7371c3486

29

+DIST goby_1.9.8.1-cpp.zip 134904 BLAKE2B 800f3bcbe9f721bfb636f514630fb1ceba3a1fe41616f63fc15f9f2a24394ef9be90419ccad0c9bd8b29100eeaea57659ba013042cf4a11b6038fc6dee782619 SHA512 d31cd7f0be19074bfe8da74d9f2510f0e0f15fe6c485bbed8520052468d2cd2f1bc5fcad8b0d6a1586f5acde73db326059f45994ecfbb5fb6c09692d8e155190

30

+DIST goby_2.0.1-cpp.zip 177718 BLAKE2B 666b50fdc199693f8a4f9b6007f6609e91ab6093b643da88e580c9a3438a150cd7be78d2b5dcdd2fe905263d32ebbac1e0e47dbc637fd5d59f877e7cbdaaeeb2 SHA512 992bd10d5538dec1478820f26151dd311f4de13e7947b49f0b06d6cbdd4b71deeb3aa8a4c6a598fb92fbcb9cbf4ff97bf81205c9389d4a0da4443317e48aea9f

31

32

diff --git a/sci-biology/goby-cpp/files/Alignments.proto b/sci-biology/goby-cpp/files/Alignments.proto

33

deleted file mode 100644

34

index fe7f5664764..00000000000

35

--- a/sci-biology/goby-cpp/files/Alignments.proto

36

+++ /dev/null

37

@@ -1,597 +0,0 @@

38

-package goby;

39

-

40

-option java_package = "edu.cornell.med.icb.goby.alignments";

41

-

42

-option optimize_for = SPEED;

43

-

44

-/*

45

-  This message is written to 'basename'.entries as a very large chunked collection.

46

-*/

47

-message AlignmentCollection {

48

-    repeated AlignmentEntry alignment_entries = 1;

49

-}

50

-

51

-

52

-message AlignmentEntry {

53

-    /* Multiplicity of this entry. The number of times this  alignment entry would be repeated exactly the same if

54

-     query redundancy had not been removed by read factorization.

55

-    */

56

-    optional uint32 multiplicity = 7;

57

-

58

-    /*

59

-      Compressed stream of data. Removed since Goby 2.0 supports chunk codecs. Do not reuse field index 23

60

-      optional bytes compressed_data = 23;

61

-    */

62

-

63

-    /* An integer that uniquely identifies the query (a short read) in a set of alignment runs. When several

64

-      alignment runs are made with the same set of query sequences, equality of query index means that the query

65

-      sequences were the same. (Comparing integers for equality is much faster than comparing strings.)

66

-      This field is required (enforced by semantic validation in Goby 2.0+).

67

-    */

68

-    optional uint32 query_index = 1;

69

-    /* An integer that uniquely identifies the target (e.g., a chromosome) in a set of alignment runs. When several

70

-      alignment runs are made with the same set of target sequences, equality of target index means that the target

71

-      sequence was the same across the runs. (Comparing integers for equality is much faster than comparing strings.)

72

-      This field is required (enforced by semantic validation in Goby 2.0+).

73

-    */

74

-    optional uint32 target_index = 2;

75

-    /*

76

-     The position on the target of the start of the alignment between the query and the target.

77

-     In the following example, position is 3 because the third base of the query 'C' was aligned with

78

-     position 3 of the reference (two read bases were soft clipped: "ct"). This example shows that the

79

-     alignment can start at a mismatch if it was so constructed by the aligner.

80

-

81

-     0123456789

82

-     AAAAGTCAAA  target

83

-      ctCGTC     query

84

-    This field is required (enforced by semantic validation in Goby 2.0+).

85

-   */

86

-    optional uint32 position = 3;

87

-

88

-    /*

89

-       True when the query matches the target on the reverse strand

90

-    */

91

-    optional bool matching_reverse_strand = 6;

92

-

93

-    /*

94

-     The position on the query where the alignment starts. This value is different from zero

95

-     when some bases/residues of the query could not be aligned with the target.

96

-     TODO: Rename this to left_trim. Add a right_trim property.

97

-    */

98

-    optional uint32 query_position = 5;

99

-

100

-    /*

101

-     The score of the alignment, where larger scores indicate better matches between the query and the target.

102

-     If an aligner outputs only the number of mismatches between query and target, the score is taken to be

103

-     -(#mismatches(query,target)).

104

-    */

105

-    optional float score = 4;

106

-

107

-    /*

108

-      Number of bases/residues that differ in the alignment between query and target sequences.

109

-    */

110

-    optional uint32 number_of_mismatches = 8;

111

-

112

-    /*

113

-     Cumulative number of insertions and/or deletions present in the alignment.

114

-    */

115

-    optional uint32 number_of_indels = 9;

116

-

117

-    /*

118

-     Number of bases that have been aligned for the query. Please note that query_aligned_length must be

119

-     less than or equal to query_length.

120

-    */

121

-    optional uint32 query_aligned_length = 11;

122

-

123

-    /*

124

-     Number of bases that have been aligned for the target.

125

-    */

126

-    optional uint32 target_aligned_length = 12;

127

-

128

-    repeated SequenceVariation sequence_variations = 13;

129

-

130

-    /*

131

-     Length of the query sequence.

132

-    */

133

-    optional uint32 query_length = 10;

134

-    /*

135

-      Mapping Quality (phred-scaled posterior probability that the mapping

136

-      position of this read is incorrect). Please note that different aligners

137

-      may estimate mapping quality with different approaches, resulting in aligner

138

-      specific differences in the distribution of mapping quality. It is recommended

139

-      to condition mapping quality on the aligner that produced the specific alignment

140

-      being processed. See aligner name and version in the header.

141

-      Note that the following description is preliminary. A clear specification is

142

-      needed:

143

-      The mapping quality should be proportional to the

144

-      log of the probability that the given mapping is the "correct" one.

145

-      So if there are five equally good mappings of a read to the genome,

146

-      the probability of each would be 0.2, and the mapping quality would be

147

-      something like -10*log10(1-0.2) = 1.  If a mapping is highly likely,

148

-      say a 1e-4 of it being wrong, then the mapping quality would be

149

-      -10*log10(1e-4) = 40.

150

-    */

151

-    optional int32 mapping_quality = 14;

152

-

153

-    /*

154

-       If this read was aligned with a pair, the flags for the pair alignment (based on SAM):

155

-          000000001    paired

156

-          000000010    properly paired

157

-          000000100    read unmapped

158

-          000001000    mate unmapped

159

-          000010000    read reverse strand

160

-          000100000    mate reverse strand

161

-          001000000    first in pair

162

-          010000000    second in pair

163

-          100000000    not primary alignment

164

-    */

165

-    optional uint32 pair_flags = 15;

166

-

167

-    /*

168

-     If there is an alignment entry for the paired read (the paired read was mapped), a link to the entry is given.

169

-    */

170

-    optional RelatedAlignmentEntry pair_alignment_link = 16;

171

-

172

-    /* Index of the read fragment from which this alignment was obtained. */

173

-    optional uint32 fragment_index = 17;

174

-

175

-    /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two or more

176

-      alignment entries, one for each matching part of the read, and link these entries with

177

-      spliced_alignment_links. The field spliced_forward_alignment_link points to the next

178

-      AlignmentEntry in the chain of spliced alignments.

179

-    */

180

-    optional RelatedAlignmentEntry spliced_forward_alignment_link = 18;

181

-

182

-    /* If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two or more

183

-      alignment entries, one for each matching part of the read, and link these entries with

184

-      spliced_alignment_links. The field spliced_backward_alignment_link points to the previous

185

-      AlignmentEntry in the chain of spliced alignments.

186

-    */

187

-    optional RelatedAlignmentEntry spliced_backward_alignment_link = 22;

188

-

189

-    /*

190

-      If a read spans exon-exon junctions some aligners (e.g., GSNAP) will output two alignment entries, one for each

191

-      matching part of the read, and flag describes the spliced_alignment_link with these

192

-      binary flags:

193

-        000000001    normal

194

-        000000010    novel

195

-    */

196

-    optional uint32 spliced_flags = 19;

197

-

198

-    /* The size of the insert used when making the sequence library. This is the total size of the DNA

199

-    fragment to sequence, without the adapters. This is not the length of sequence that separates the reads.

200

-    See http://seqanswers.com/forums/showthread.php?t=8730 for details. Insert size is inferred for each pair

201

-    of reads by the aligner and is recorded here if it was estimated (i.e., for paired-end reads).

202

-    */

203

-    optional sint32 insert_size = 20;

204

-

205

-    /*

206

-       The sample index. Uniquely identifies the aligned sample this read was read from. Storing the sample index in the

207

-       alignment entry makes it possible to concat alignments from different origins and track what sample originally

208

-       contained each entry.

209

-    */

210

-    optional uint32 sample_index = 21;

211

-    /*

212

-        The total number of times the query index associated with this entry occurs across the entire alignment file.

213

-

214

-        This field is used to purge queryIndex->smallIndex associations after all instances of a queryindex have

215

-        been seen (see QueryIndexPermutation class). When each entry has a value for this field, the header field

216

-        query_index_occurrences is true.

217

-        This field is required (enforced by semantic validation in Goby 2.0+).

218

-    */

219

-    optional uint32 query_index_occurrences = 25;

220

-    /*

221

-        The total number of times the read matches the reference across the entire alignment file. This differs from

222

-        query_index_occurrences because reads that are matching through splice and pair links count as one for ambiguity.

223

-        The field can be used to filter by ambiguity-threshold on the fly after an alignment has been done (to restrict

224

-        entries to smaller thresholds). When each entry has a value for this field, the header field

225

-        ambiguity_stored_in_entries is true.

226

-

227

-        This field is required (enforced by semantic validation in Goby 2.0+).

228

-    */

229

-    optional uint32 ambiguity = 27;

230

-    /*

231

-        List of BAM attributes, if the alignment was imported from BAM. The attributes are stored in exactly the format

232

-        allowed for BAM. For instance, X0:i:9  X1:i:1  MD:Z:68 RG:Z:SRR084825 will be stored as four strings:

233

-        "X0:i:9", "X1:i:1", "MD:Z:68", "RG:Z:SRR084825". Note that sam-to-compact will interpret some BAM attributes

234

-        and populate goby native fields. Such tags do not appear in bam_attributes, and are instead re-generated from

235

-        the corresponding goby native fields.

236

-        Since Goby 2.0.

237

-    */

238

-    repeated string bam_attributes = 50;

239

-    /*

240

-        Quality scores for all bases of the read.

241

-        Since Goby 2.0.

242

-    */

243

-    optional bytes read_quality_scores = 55;

244

-

245

-    /*

246

-        Origin index. An integer that references a ReadOriginInfo message in the alignment header and

247

-        makes it possible to track the origin of the read (especially useful after several alignments

248

-        have been merged/concatenated).

249

-        (Since Goby 2.0).

250

-    */

251

-    optional uint32 read_origin_index = 26;

252

-    /*

253

-    Bases that an aligner considered do not belong to the alignment of the read to the reference. Potentially

254

-    erroneous bases, or bases that belong to a different part of the reference genome. Left clipped bases are

255

-    stored in this field as character bases, or as an equal sign character '=' when the clipped base did match

256

-    the reference base. For instance "A=G" for three soft-clipped bases, the middle one matching the genome at

257

-    this position. The number of bases in softClippedBasesLeft is exactly equal to queryPosition.

258

-    */

259

-    optional string softClippedBasesLeft = 30;

260

-    /*

261

-    Bases that an aligner considered do not belong to the alignment of the read to the reference. Potentially

262

-    erroneous bases, or bases that belong to a different part of the reference genome. Right clipped bases are

263

-    stored in this field as character bases, or as an equal sign character '=' when the clipped base did match

264

-    the reference base. The number of bases in softClippedBasesRight is exactly equal

265

-    to  queryLength - queryAlignedLength - queryPosition.

266

-    */

267

-    optional string softClippedBasesRight = 31;

268

-

269

-    /*

270

-    Quality scores for bases in softClippedBasesLeft.  Stored in Phred Units.

271

-    */

272

-    optional bytes softClippedQualityLeft = 32;

273

-   /*

274

-    Quality scores for bases in softClippedBasesRight.  Stored in Phred Units.

275

-    */

276

-    optional bytes softClippedQualityRight = 33;

277

-    /*

278

-     Sequence for a read placed near this entry, but unmapped to the reference sequence. For instance, used to record

279

-     the sequence of a mate that did not map to the reference. We know that the mate maps in the proximity of this entry

280

-     (it is placed) but are unable to map it to a specific genomic position. The sequence is always given as obtained

281

-     from the reads file.

282

-    */

283

-    optional string placedUnmappedSequence=40;

284

-    /*

285

-    Quality scores for a read placed near this entry.  Phred units.

286

-    */

287

-    optional bytes placedUnmappedQuality=41;

288

-

289

-    /*

290

-    Read name. In SAM/BAM this is referred to as QNAME. Paired and segmented reads will have the same Read name.

291

-    */

292

-    optional string readName=42;

293

-}

294

-

295

-/* A link to another alignment entry. This message type is used to represent relations

296

-   between alignments, such as the relation between the two read fragments in a paired-end protocol,

297

-   or the relation between parts of reads that align through an exon exon junction and map in

298

-   different locations of the genome.

299

-  */

300

-message RelatedAlignmentEntry {

301

-    /* Target index of the location where the other alignment entry is mapped.

302

-      This field is required (enforced by semantic validation in Goby 2.0+).

303

-    */

304

-    optional uint32 target_index = 1;

305

-

306

-    /* Position on the reference where the other alignment entry is mapped. *

307

-       This field is required (enforced by semantic validation in Goby 2.0+).

308

-    */

309

-    optional uint32 position = 2;

310

-

311

-    /* Index of the fragment for the related alignment entry. This index

312

-       makes it possible to identify which of the read fragments mapped to the given

313

-       location is related to the source alignment entry.

314

-       This field is required (enforced by semantic validation in Goby 2.0+).

315

-    */

316

-    optional uint32 fragment_index = 3;

317

-

318

-    optional uint32 optimized_index=50;

319

-}

320

-

321

-/*

322

-   Represents sequence variations between the query and the reference sequences. Many variations can be represented.

323

-   For instance, an insertion at position 5 in the reference would be represented as from="A", to="" position=5.

324

-   A mutation T->G at position 6 would be rendered as from="T", to="G" position=6. Padded alignments (see SAM description)

325

-   can be described by a combination of pair-wise alignments, where the gap character '-' is used to indicate that no

326

-   base exists in the sequence considered for the alignment position, for instance:

327

-

328

-   - Padding example:

329

-

330

-    123 (<-positions)

331

-ref A-C

332

-    A-T [from="-" to=""  position=2] [from="C" to="T"  position=3]

333

-    ACT [from=""  to="C" position=2] [from="C" to="T"  position=3]

334

-    A-T [from="-" to=""  position=2] [from="C" to="T"  position=3]

335

-

336

-   - Mutation example:

337

-    123 (<-positions)

338

-ref ATT

339

-    ACT [from="T"  to="C" position=2]

340

-

341

-    -- Example of deletion in a read:

342

-    123 (<-positions)

343

-ref ATT

344

-    A-T [from="T"  to="-" position=2]

345

-

346

-    -- Example of insertion of two base pairs in a read:

347

-    12345 (<-positions)

348

-ref A--TT

349

-    ACCTT [from=""  to="CC" position=2]

350

-

351

-  */

352

-message SequenceVariation {

353

-    /* The reference bases. Can include one or more gap characters '-', to indicate that the reference sequence has

354

-     no base at this alignment position.

355

-     This field is required (enforced by semantic validation in Goby 2.0+).

356

-    */

357

-    optional string from = 2;

358

-    /* The read bases that differ from the reference sequence.  Can include one or more gap characters '-', to indicate

359

-     that the query sequence has no base at this alignment position.

360

-     This field is required (enforced by semantic validation in Goby 2.0+).

361

-    */

362

-    optional string to = 1;

363

-    /*

364

-    The position of the variation on the read, as if the read always matched on the forward strand.

365

-    Adding position to the index where the reference starts aligning the read yields the position of the variation

366

-    in reference/target sequence space. Since position starts at one the resulting position will also be one based.

367

-    This field is required (enforced by semantic validation in Goby 2.0+).

368

-    */

369

-    optional uint32 position = 3;

370

-    /*

371

-    The position of the variation, starting from the beginning of the aligned read (position 1), and up to the length

372

-    of the read (inclusive). Use this index if you need to know  how far the variation is observed from the beginning

373

-    of the sequenced read. When the read has an insertion, this index records the position immediately before the base

374

-    where the bases are inserted (these bases are in the to field).

375

-    When the read has a deletion, read_index records the position in the read after which the bases that would align

376

-    in the reference are missing (these bases are in the from field).

377

-    This field is required (enforced by semantic validation in Goby 2.0+).

378

-    */

379

-    optional uint32 read_index = 5;

380

-

381

-    /**

382

-      The read base quality scores for those bases that are given in the to field. This field

383

-      is populated when the reads used to perform the search include quality scores, and when

384

-      the alignment parser can extract the information from the aligner's output.

385

-      (this option is currently not implemented in Goby.)

386

-    */

387

-    optional bytes to_quality = 4;

388

-

389

-}

390

-/*

391

-  This message is written to 'basename'.header

392

-*/

393

-

394

-message AlignmentHeader {

395

-    /*

396

-     The smallest possible query index in this alignment. Data stored as an array where

397

-     queryIndex is the array index will be stored with only the elements in the inclusive

398

-     range [smallestSplitQueryIndex largestSplitQueryIndex]

399

-     Such data structures include queryLength and some arrays in the TooManyHits data

400

-     structure.

401

-    */

402

-    optional uint32 smallest_split_query_index = 9;

403

-    /*

404

-     The largest possible query index in this alignment. Data stored as an array where

405

-     queryIndex is the array index will be stored with only the elements in the inclusive

406

-     range [smallestSplitQueryIndex largestSplitQueryIndex]

407

-     Such data structures include queryLength and some arrays in the TooManyHits data

408

-     structure.

409

-    */

410

-    optional uint32 largest_split_query_index = 11;

411

-

412

-    /* Mapping from query identifier name to query index (as used in alignment entries).

413

-    */

414

-    optional IdentifierMapping query_name_mapping = 1;

415

-

416

-    /* Mapping from target identifier name to target index (as used in alignment entries).

417

-    */

418

-    optional IdentifierMapping target_name_mapping = 2;

419

-

420

-    /*

421

-     The number of query sequences

422

-    */

423

-    optional uint32 number_of_queries = 5;

424

-    /*

425

-      The number of target sequences

426

-    */

427

-    optional uint32 number_of_targets = 6;

428

-    /*

429

-      The number of reads that were aligned to the reference and are represented in this alignment archive.

430

-    */

431

-    optional uint32 number_of_aligned_reads = 7;

432

-

433

-    /*

434

-      Length of the query sequences. One number per query, in the order of increasing query index.

435

-      This information has been moved to the individual alignment entries.

436

-    */

437

-    repeated uint32 query_length = 3 [deprecated = true];

438

-    /*

439

-       If query length is constant across all the queries, this field contains the constant length.

440

-       In such cases, query_length will be empty.

441

-    */

442

-    optional uint32 constant_query_length = 10;

443

-

444

-    /*

445

-      Length of the target sequences. One number per target, in the order of increasing target index.

446

-      The target indexes must be 0..(number of targets - 1).

447

-    */

448

-    repeated uint32 target_length = 8;

449

-    /*

450

-       Indicates whether this alignment is sorted by position. True: the alignment entries occur in sorted

451

-       order, such that entry a occurs before entry b if a.targetIndex< b.targetIndex or, when entries

452

-       have the same target, when a.position < b.position.

453

-    */

454

-    optional bool sorted = 13;

455

-

456

-    /*

457

-       Indicates whether this alignment is indexed by position. When this attribute is true, a file called

458

-      'basename'.index exists that contains the AlignmentIndex message (GZip compressed).

459

-    */

460

-    optional bool indexed = 14;

461

-    /*

462

-      True when query lengths are stored in alignment entries (Goby 1.7+).

463

-    */

464

-    optional bool query_lengths_stored_in_entries = 15;

465

-    /*

466

-      Name of the aligner that produced this alignment.

467

-    */

468

-    optional string aligner_name = 17;

469

-    /*

470

-      Version number for the aligner implementation that produced this alignment.

471

-    */

472

-    optional string aligner_version = 18;

473

-    /*

474

-       The version of Goby that created this alignment file.

475

-    */

476

-    optional string version = 25;

477

-

478

-    /*

479

-      Sample basenames, in the order of increasing sampleIndex, starting with sampleIndex=0.

480

-    */

481

-

482

-    repeated string sample_basename = 30;

483

-

484

-    /*

485

-       This field is true when the query indices of alignment entries were permuted to smaller indices. Only sorted

486

-       alignments can have query_indices_were_permuted=true. When the field is true, and you need to retrieve the

487

-       original query-index of an alignment (because you want to retrieve the specific read(s) from a read file for

488

-       instance), you will need the information in the permutation file (extension basename.perm) and transform back

489

-       each small index of interest to the original query index.

490

-    */

491

-    optional bool query_indices_were_permuted = 26;

492

-    /*

493

-       This field is true when entries in the alignment .entries file all have the query_index_occurrences field populated

494

-       (Since Goby 2.0).

495

-    */

496

-    optional bool query_index_occurrences = 35;

497

-

498

-    /*

499

-       This field is true when entries in the alignment .entries file all have the ambiguity field populated

500

-       (Since Goby 2.0).

501

-    */

502

-    optional bool ambiguity_stored_in_entries = 36;

503

-    /*

504

-       This field is true when entries in the alignment .entries file all have the read_quality_scores field populated.

505

-       (Since Goby 2.0).

506

-    */

507

-    optional bool all_read_quality_scores = 40;

508

-    /*

509

-      A description of the origin of sets of reads. Serves a similar function to BAM read groups, but more flexible and

510

-      efficient. Instead of storing strings, we use integers in the entries.

511

-      Alignment entries will link to a specific ReadOriginInfo with the origin_index field.

512

-      (Since Goby 2.0).

513

-    */

514

-    repeated ReadOriginInfo read_origin = 27;

515

-}

516

-

517

-message IdentifierMapping {

518

-    repeated IdentifierInfo mappings = 1;

519

-}

520

-

521

-message IdentifierInfo {

522

-    required string name = 1;

523

-    required uint32 index = 2;

524

-}

525

-

526

-

527

-/*

528

-     A description of the origin of sets of reads. Stored in the Goby alignment header and linked

529

-     from alignment entries. Goby makes it possible to adapt origin equivalence rules on the fly

530

-     efficiently. To do this, it is sufficient to read the header of the alignment, decide which

531

-     ReadOriginInfo instances are equivalent (e.g., by looking at sample, platform, library, or

532

-     other fields in the message), then construct a function e(a):int. This function takes

533

-     one originIndex parameter and returns another integer that maps to an equivalence class. The

534

-     equivalence class can be used to estimate error models for entries that belong to each class,

535

-     for instance.

536

-     (Since Goby 2.0).

537

- */

538

-message ReadOriginInfo {

539

-    /*

540

-       Origin index. An integer that links alignment entries to their origin information.

541

-    */

542

-    required uint32 origin_index = 1;

543

-    /*

544

-       Identifier that describes the origin of the reads. This field is compatible with the ID/platform field of BAM read

545

-       groups. Free text.

546

-    */

547

-    required string origin_id = 2;

548

-    /*

549

-       The sample from which the reads were sequenced. This field is compatible with the SM/sample field of BAM read

550

-       groups. Free text.

551

-    */

552

-    optional string sample = 4;

553

-    /*

554

-       The platform on which the reads were sequenced. This field is compatible with the PL/platform field of BAM read

555

-       groups. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.

556

-    */

557

-    optional string platform = 5;

558

-    /*

559

-       The library from which the reads were sequenced. This field is compatible with the LB/library field of BAM read

560

-       groups. Free text.

561

-    */

562

-    optional string library = 8;

563

-    /*

564

-       The platform unit on which the reads were sequenced. This field is for compatibility with samtools.

565

-    */

566

-    optional string platform_unit = 12;

567

-    /*

568

-       The date the reads were sequenced. Useful to identify batch effects, in the format dd:MMM:yyyy.

569

-       The month is Jan, Feb, etc. to avoid all confusion with days when day<=12.

570

-    */

571

-    optional string run_date = 6;

572

-}

573

-

574

-/*

575

-  This message is written to 'basename'.tmh

576

-*/

577

-

578

-message AlignmentTooManyHits {

579

-    /*

580

-    The threshold used by the aligner to determine that a query is ambiguous and should be dropped.

581

-    Referred to as parameter k below.

582

-    */

583

-    required uint32 aligner_threshold = 2;

584

-    /*

585

-     The hits that are assigned to several (>k) reference locations.

586

-    */

587

-    repeated AmbiguousLocation hits = 1;

588

-

589

-}

590

-

591

-message AmbiguousLocation {

592

-    /*

593

-     The index of the query that matched too many times.

594

-    */

595

-    required uint32 query_index = 1;

596

-    /*

597

-     The number of hits that triggered membership in the too many hits list. The query may hit more

598

-     locations than reported here, since some alignment tools will just drop queries that match above

599

-     a threshold and stop counting. This number can be >=k.

600

-    */

601

-    required uint32 at_least_number_of_hits = 2;

602

-    /**

603

-The length of the part of the query sequence that could be matched to the target (also called depth).

604

-May be less than the length of the query sequence, in which case the match was not perfect. When merging

605

-alignments produced by searching different reference sequences, consider only at_least_number_of_hits

606

-from alignments that have exactly the longer depth for the query. */

607

-    optional uint32 length_of_match = 3;

608

-}

609

-

610

-/*

611

-      This message is written to 'basename'.index

612

-  */

613

-message AlignmentIndex {

614

-    /*

615

-      Stores one element by target sequence. Each element is the cumulative target length for the target

616

-      stored at index i. Assume there are four target sequences, with lengths {10, 12, 15, 34}. The field

617

-      targetPositionOffsets will contain: {0,10,22,37}. Such offsets can be used to calculate the absolute

618

-      position of a genomic location. Given targetIndex and positionOnReference, the absolute location

619

-      is defined as  targetPositionOffsets[targetIndex]+positionOnReference.

620

-    */

621

-    repeated uint32 target_position_offsets = 1 [packed = true];

622

-    /*

623

-     The byte offsets into the compressed entries file. Byte offsets are matched with absolute position

624

-     by index. There should be as many elements in offsets as there are in absolutePosition

625

-     where chunks start which represent entries whose absolute positions are less than

626

-    */

627

-    repeated uint64 offsets = 2 [packed = true];

628

-    /*

629

-      The absolute positions of the first entry in the chunk that immediately start at offset. One element

630

-      per chunk in the 'basename'.entries file.

631

-    */

632

-    repeated uint64 absolute_positions = 3 [packed = true];

633

-

634

-}

635

636

diff --git a/sci-biology/goby-cpp/files/Reads.proto b/sci-biology/goby-cpp/files/Reads.proto

637

deleted file mode 100644

638

index 32c1244a3eb..00000000000

639

--- a/sci-biology/goby-cpp/files/Reads.proto

640

+++ /dev/null

641

@@ -1,96 +0,0 @@

642

-package goby;

643

-

644

-option java_package = "edu.cornell.med.icb.goby.reads";

645

-option optimize_for = SPEED;

646

-

647

-message ReadCollection {

648

-     repeated ReadEntry reads = 1;

649

-}

650

-

651

-message ReadEntry {

652

-  /*

653

-    Index of a read.

654

-  */

655

-  required uint32 read_index = 1;

656

-   /*

657

-    Index of the barcode, if any.

658

-  */

659

-  optional uint32 barcode_index = 10;

660

-  /*

661

-     Read identifier/name may be present.

662

-  */

663

-  optional string read_identifier = 23;

664

-  /*

665

-     Additional description about the read (from Fasta/Q format).

666

-   */

667

-  optional string description = 22;

668

-  /*

669

-    Length of the sequence.

670

-   */

671

-  required uint32 read_length = 2;

672

-  /*

673

-    Sequence, encoded as ascii characters stored in single bytes.

674

-   */

675

-  optional bytes sequence = 3;

676

-  /*

677

-   The second sequence in a pair. Stored the same way as the sequence attribute.

678

-  */

679

-  optional bytes sequence_pair = 5;

680

-  /*

681

-    Length of the second sequence in a pair.

682

-  */

683

-  optional uint32 read_length_pair = 6;

684

-  /*

685

-    Quality scores in Phred units, stored as single bytes (0-255).

686

-  */

687

-  optional bytes quality_scores = 4;

688

-  /*

689

-    Quality scores for the second sequence in a pair. Stored as the 'qualityScores' attribute.

690

-   */

691

-  optional bytes quality_scores_pair = 7;

692

-  /*

693

-    Compressed stream of data. The first byte indicates the compression/decompression method (codec). The remaining bytes are

694

-    content compressed with the codec.

695

-  */

696

-  optional bytes compressed_data = 8;

697

-  /*

698

-     Stores meta-data about the reads. Typically meta-data is stored in the very first read of a

699

-     read collection, with the understanding that the meta-data applies to all the reads in the

700

-     collection. Meta-data can be used to store information about when the sample was sequenced,

701

-     or other information of interest. The key-value pair format is sufficiently flexible to

702

-     accommodate a variety of needs. The following keys are pre-defined. Please use pre-defined

703

-     keys so that automated tools can use metadata in relatively standard way. Please note that

704

-     some keys provide a format for the value. This format should also be followed to guarantee

705

-     that meta data can be used computationally in fully automatic manner.

706

-

707

-     key="sequencing-run-start-date" value="MM/DD/YYYY" Used to record when the sequencing run

708

-     was initiated on the instrument. Can be used to detect batch effect in a large set of samples.

709

-     key="platform" value="<free-text>". Value is free text, but the following terms are pre-defined.

710

-      Illumina GaIIx

711

-      Illumina HiSeq 1000

712

-      Illumina HiSeq 2000

713

-      Helicos Heliscope

714

-      LifeTech 5500 SOLiD

715

-      LifeTech 5500xl SOLiD

716

-      Roche 454 GS FLX Ti

717

-

718

-      key="organism" value="species name"

719

-      Since Goby 1.9.1

720

-  */

721

-  repeated MetaData meta_data = 25;

722

-

723

-}

724

-/*

725

- A message to store a key/value pair and represent metadata about reads.

726

- Since Goby 1.9.1

727

- */

728

-message MetaData {

729

- /*

730

-   Provides the key. See examples in the documentation of meta_data for ReadEntry.

731

- */

732

- required string key=1;

733

- /*

734

-   Describes the value associated with the key. See examples in the documentation of meta_data for ReadEntry.

735

- */

736

- required string value=2;

737

-}

738

739

diff --git a/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch b/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch

740

deleted file mode 100644

741

index 415785466af..00000000000

742

--- a/sci-biology/goby-cpp/files/goby-cpp-2.0.1-underlinking.patch

743

+++ /dev/null

744

@@ -1,16 +0,0 @@

745

- src/Makefile.am | 2 +-

746

- 1 file changed, 1 insertion(+), 1 deletion(-)

747

-

748

-diff --git a/src/Makefile.am b/src/Makefile.am

749

-index 1033382..33ca906 100644

750

---- a/src/Makefile.am

751

-+++ b/src/Makefile.am

752

-@@ -84,7 +84,7 @@ GobyReadsStats_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_D

753

- GobyReadsStats_SOURCES = \

754

- 	GobyReadsStats.cc

755

-

756

--GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB}

757

-+GobyFastaToCompact_LDADD = libgoby.la ${BOOST_LDFLAGS} ${BOOST_SYSTEM_LIB} ${BOOST_DATE_TIME_LIB} ${BOOST_FILESYSTEM_LIB} ${BOOST_PROGRAM_OPTIONS_LIB} -lz

758

- GobyFastaToCompact_SOURCES = \

759

- 	GobyFastaToCompact.cc

760

-

761

762

diff --git a/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild b/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild

763

index fcf8971fceb..e74dd6ecede 100644

764

--- a/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild

765

+++ b/sci-biology/goby-cpp/goby-cpp-2.0.1.ebuild

766

@@ -1,4 +1,4 @@

767

-# Copyright 1999-2015 Gentoo Foundation

768

+# Copyright 1999-2017 Gentoo Foundation

769

 # Distributed under the terms of the GNU General Public License v2

770

771

 EAPI=5

772

@@ -9,7 +9,8 @@ inherit autotools-utils

773

774

 DESCRIPTION="A DNA sequencing data management framework - C/C++ API"

775

 HOMEPAGE="http://campagnelab.org/software/goby/"

776

-SRC_URI="http://chagall.med.cornell.edu/goby/releases/archive/release-goby_${PV}/goby_${PV}-cpp.zip"

777

+SRC_URI="http://chagall.med.cornell.edu/goby/releases/archive/release-goby_${PV}/goby_${PV}-cpp.zip

778

+	https://dev.gentoo.org/~mgorny/dist/${P}-files.tar.bz2"

779

780

 LICENSE="GPL-3"

781

 SLOT="0"

782

@@ -24,7 +25,7 @@ RDEPEND="${DEPEND}"

783

 S="${WORKDIR}/${PV}/cpp"

784

785

 PATCHES=(

786

-	"${FILESDIR}"/${P}-underlinking.patch

787

+	"${WORKDIR}"/${P}-files/${P}-underlinking.patch

788

 )

789

790

 src_prepare() {

791

@@ -33,7 +34,7 @@ src_prepare() {

792

 		-i src/Makefile.am || die

793

794

 	pushd src/goby > /dev/null || die

795

-	cp "${FILESDIR}"/*.proto . || die

796

+	cp "${WORKDIR}"/${P}-files/*.proto . || die

797

 	protoc --cpp_out=. *.proto || die

798

 	popd > /dev/null || die

Gentoo Archives: gentoo-commits