1 |
commit: b2800087a6719b8b9df1732d7ecdac3f5fab8b06 |
2 |
Author: Kent Fredric <kentfredric <AT> gmail <DOT> com> |
3 |
AuthorDate: Wed Apr 18 03:25:51 2012 +0000 |
4 |
Commit: Kent Fredric <kentfredric <AT> gmail <DOT> com> |
5 |
CommitDate: Wed Apr 18 03:25:51 2012 +0000 |
6 |
URL: http://git.overlays.gentoo.org/gitweb/?p=proj/perl-overlay.git;a=commit;h=b2800087 |
7 |
|
8 |
[scripts/package_map_all.pl] hacks to get around the abysmal speed I experienced today with the API, request batching and ssl stuff |
9 |
|
10 |
--- |
11 |
scripts/package_map_all.pl | 120 +++++++++++++++++++++++++------------------ |
12 |
1 files changed, 70 insertions(+), 50 deletions(-) |
13 |
|
14 |
diff --git a/scripts/package_map_all.pl b/scripts/package_map_all.pl |
15 |
index 351cd63..8bb260e 100755 |
16 |
--- a/scripts/package_map_all.pl |
17 |
+++ b/scripts/package_map_all.pl |
18 |
@@ -28,7 +28,8 @@ if ( $optparse->has_long_opt('root') ) { |
19 |
$root = Path::Class::Dir->new( $optparse->long_opt('root') ); |
20 |
} |
21 |
|
22 |
-my $size = 1000; |
23 |
+my $size = 500; |
24 |
+my $scroll_time = '20m'; |
25 |
|
26 |
my $metadata = $root->subdir( 'metadata', 'perl' ); |
27 |
my $distmap = $metadata->subdir('distmap'); |
28 |
@@ -52,7 +53,7 @@ my %g_repos; |
29 |
for ( keys %{$nodes} ) { |
30 |
my $records = $nodes->{$_}; |
31 |
$lookup{$_}++; |
32 |
- for my $rec ( @{ $records }) { |
33 |
+ for my $rec ( @{$records} ) { |
34 |
my $repo = $rec->{repository}; |
35 |
$repos{$repo}++; |
36 |
} |
37 |
@@ -72,58 +73,17 @@ my %g_repos; |
38 |
|
39 |
my @dists = keys %lookup; |
40 |
|
41 |
-my $search = {}; |
42 |
-$search->{query} = { constant_score => { filter => { terms => { distribution => [@dists] } } } }; |
43 |
-$search->{sort} = [ { 'date' => 'desc', }, ]; |
44 |
-$search->{size} = $size; |
45 |
-$search->{fields} = [ |
46 |
- qw( |
47 |
- abstract |
48 |
- archive |
49 |
- author |
50 |
- authorized |
51 |
- date |
52 |
- distribution |
53 |
- download_url |
54 |
- license |
55 |
- maturity |
56 |
- name |
57 |
- status |
58 |
- version |
59 |
- ) |
60 |
-]; |
61 |
- |
62 |
-$ENV{WWW_MECH_NOCACHE} = 1; |
63 |
- |
64 |
-my $results_string = mcpan->ua->request( |
65 |
- 'POST', |
66 |
- mcpan->base_url . 'release/_search?search_type=scan&scroll=30s&size=' . $size, |
67 |
- { |
68 |
- headers => { 'Accept-Encoding' => 'gzip', }, |
69 |
- content => $encoder->encode($search), |
70 |
- } |
71 |
-); |
72 |
- |
73 |
-say $results_string->{content}; |
74 |
- |
75 |
-my $results = $decoder->decode( $results_string->{content} ); |
76 |
-my $scroll_id = $results->{_scroll_id}; |
77 |
+my $dtree; |
78 |
|
79 |
-my $total_results = $results->{hits}->{total}; |
80 |
+my $seen = 0; |
81 |
|
82 |
-say "Found: $total_results releases"; |
83 |
+use List::MoreUtils qw( natatime ); |
84 |
|
85 |
-my $dtree; |
86 |
-my $seen = 0; |
87 |
+my $it = natatime 500, @dists; |
88 |
|
89 |
-while (1) { |
90 |
- my ( $result, $scroll ) = scroll($scroll_id); |
91 |
- last unless scalar @{ $result->{hits}->{hits} }; |
92 |
- collate_resultset($result); |
93 |
- $scroll_id = $scroll; |
94 |
- say "Seen $seen of $total_results"; |
95 |
+while ( my @dists_batch = $it->() ) { |
96 |
+ get_data_for(@dists_batch); |
97 |
} |
98 |
- |
99 |
for my $package ( sort keys %{$dtree} ) { |
100 |
say "Sorting $package"; |
101 |
$dtree->{$package} = [ sort { $b->{date} cmp $a->{date} } @{ $dtree->{$package} } ]; |
102 |
@@ -134,14 +94,74 @@ $fh->print( $encoder->encode($dtree) ); |
103 |
|
104 |
exit 0; |
105 |
|
106 |
+sub get_data_for { |
107 |
+ my (@items) = @_; |
108 |
+ my $search = {}; |
109 |
+ $search->{query} = { constant_score => { filter => { terms => { distribution => [@items] } } } }; |
110 |
+ $search->{sort} = [ { 'date' => 'desc', }, ]; |
111 |
+ $search->{size} = $size; |
112 |
+ $search->{fields} = [ |
113 |
+ qw( |
114 |
+ abstract |
115 |
+ archive |
116 |
+ author |
117 |
+ authorized |
118 |
+ date |
119 |
+ distribution |
120 |
+ download_url |
121 |
+ license |
122 |
+ maturity |
123 |
+ name |
124 |
+ status |
125 |
+ version |
126 |
+ ) |
127 |
+ ]; |
128 |
+ |
129 |
+ $ENV{WWW_MECH_NOCACHE} = 1; |
130 |
+ |
131 |
+ my $results_string = mcpan->ua->request( |
132 |
+ 'POST', |
133 |
+ 'https://api.metacpan.org/release/_search?search_type=scan&scroll=' . $scroll_time . '&size=' . $size, |
134 |
+ { |
135 |
+ headers => { 'Accept-Encoding' => 'gzip', }, |
136 |
+ content => $encoder->encode($search), |
137 |
+ } |
138 |
+ ); |
139 |
+ |
140 |
+ my $results = $decoder->decode( $results_string->{content} ); |
141 |
+ my $scroll_id = $results->{_scroll_id}; |
142 |
+ |
143 |
+ my $total_results = $results->{hits}->{total}; |
144 |
+ |
145 |
+ say "Found: $total_results releases"; |
146 |
+ $seen = 0; |
147 |
+ while (1) { |
148 |
+ my ( $result, $scroll ) = scroll($scroll_id); |
149 |
+ last unless scalar @{ $result->{hits}->{hits} }; |
150 |
+ collate_resultset($result); |
151 |
+ $scroll_id = $scroll; |
152 |
+ say "Seen $seen of $total_results"; |
153 |
+ } |
154 |
+ |
155 |
+} |
156 |
+ |
157 |
sub scroll { |
158 |
my ($id) = @_; |
159 |
my $result = mcpan->ua->request( |
160 |
'GET', |
161 |
- 'http://api.metacpan.org/_search/scroll/?scroll=30s&size=' . $size . '&scroll_id=' . $id, |
162 |
+ 'https://api.metacpan.org/_search/scroll/?scroll=' . $scroll_time . '&size=' . $size . '&scroll_id=' . $id, |
163 |
{ headers => { 'Accept-Encoding' => 'gzip', } } |
164 |
); |
165 |
|
166 |
+ if ( $result->{content} =~ /Server Error/ ) { |
167 |
+ require Data::Dump; |
168 |
+ Data::Dump::pp( { result => $result, size => $size, scroll_id => $id } ); |
169 |
+ die; |
170 |
+ } |
171 |
+ else { |
172 |
+ #require Data::Dump; |
173 |
+ #Data::Dump::pp( { result => { %{$result}, content => '...' }, size => $size, scroll_id => $id } ); |
174 |
+ } |
175 |
my $data = $decoder->decode( $result->{content} ); |
176 |
return $data, $data->{_scroll_id}; |
177 |
} |