Gentoo Archives: gentoo-commits

From: Alexis Ballier <aballier@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] repo/gentoo:master commit in: dev-ml/markup/files/, dev-ml/markup/
Date: Mon, 28 Nov 2016 20:27:53
Message-Id: 1480364853.56c0c3a567b08228699194eb2820a6f59595ab6a.aballier@gentoo
1 commit: 56c0c3a567b08228699194eb2820a6f59595ab6a
2 Author: Alexis Ballier <aballier <AT> gentoo <DOT> org>
3 AuthorDate: Mon Nov 28 20:26:55 2016 +0000
4 Commit: Alexis Ballier <aballier <AT> gentoo <DOT> org>
5 CommitDate: Mon Nov 28 20:27:33 2016 +0000
6 URL: https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=56c0c3a5
7
8 dev-ml/markup: fix build with uutf 1.0
9
10 Package-Manager: portage-2.3.2
11
12 dev-ml/markup/files/test.patch | 273 +++++
13 dev-ml/markup/files/uutf.patch | 1085 ++++++++++++++++++++
14 ...{markup-0.7.2.ebuild => markup-0.7.2-r1.ebuild} | 11 +-
15 3 files changed, 1366 insertions(+), 3 deletions(-)
16
17 diff --git a/dev-ml/markup/files/test.patch b/dev-ml/markup/files/test.patch
18 new file mode 100644
19 index 00000000..f2a5257
20 --- /dev/null
21 +++ b/dev-ml/markup/files/test.patch
22 @@ -0,0 +1,273 @@
23 +Index: markup.ml-0.7.2/test/test_encoding.ml
24 +===================================================================
25 +--- markup.ml-0.7.2.orig/test/test_encoding.ml
26 ++++ markup.ml-0.7.2/test/test_encoding.ml
27 +@@ -15,9 +15,9 @@ let test_ucs_4 (f : Encoding.t) name s1
28 + expect_error (1, 2) (`Decoding_error (bad_bytes, name))
29 + begin fun report ->
30 + let chars = s1 |> string |> f ~report in
31 +- next_option chars ok (assert_equal (Some (Char.code 'f')));
32 ++ next_option chars ok (assert_equal (Some ((Uchar.of_int (Char.code 'f')))));
33 + next_option chars ok (assert_equal (Some Uutf.u_rep));
34 +- next_option chars ok (assert_equal (Some (Char.code 'o')));
35 ++ next_option chars ok (assert_equal (Some ((Uchar.of_int (Char.code 'o')))));
36 + next_option chars ok (assert_equal None);
37 + next_option chars ok (assert_equal None)
38 + end;
39 +@@ -25,9 +25,9 @@ let test_ucs_4 (f : Encoding.t) name s1
40 + expect_error (2, 2) (`Decoding_error ("\x00\x00\x00", name))
41 + begin fun report ->
42 + let chars = s2 |> string |> f ~report in
43 +- next_option chars ok (assert_equal (Some (Char.code 'f')));
44 +- next_option chars ok (assert_equal (Some 0x000A));
45 +- next_option chars ok (assert_equal (Some (Char.code 'o')));
46 ++ next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'f'))));
47 ++ next_option chars ok (assert_equal (Some (Uchar.of_int 0x000A)));
48 ++ next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'o'))));
49 + next_option chars ok (assert_equal (Some Uutf.u_rep));
50 + next_option chars ok (assert_equal None);
51 + next_option chars ok (assert_equal None)
52 +@@ -38,12 +38,12 @@ let tests = [
53 + let s = "\xef\xbb\xbffoo\xf0\x9f\x90\x99bar\xa0more" in
54 + expect_error (1, 8) (`Decoding_error ("\xa0", "utf-8")) begin fun report ->
55 + let chars = s |> string |> utf_8 ~report in
56 +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));
57 +- next_option chars ok (assert_equal (Some 0x1F419));
58 +- next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r']));
59 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));
60 ++ next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419)));
61 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r']));
62 + next_option chars ok (assert_equal (Some Uutf.u_rep));
63 + next_n 4 chars ok
64 +- (assert_equal (List.map Char.code ['m'; 'o'; 'r'; 'e']));
65 ++ (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['m'; 'o'; 'r'; 'e']));
66 + next_option chars ok (assert_equal None);
67 + next_option chars ok (assert_equal None)
68 + end);
69 +@@ -53,11 +53,11 @@ let tests = [
70 + expect_error (1, 6) (`Decoding_error ("\xdc\x19", "utf-16be"))
71 + begin fun report ->
72 + let chars = s |> string |> utf_16be ~report in
73 +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));
74 +- next_option chars ok (assert_equal (Some 0x1F419));
75 +- next_option chars ok (assert_equal (Some (Char.code 'b')));
76 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));
77 ++ next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419)));
78 ++ next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'b'))));
79 + next_option chars ok (assert_equal (Some Uutf.u_rep));
80 +- next_n 16 chars ok (assert_equal (List.map Char.code ['a'; 'r']));
81 ++ next_n 16 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['a'; 'r']));
82 + next_option chars ok (assert_equal None);
83 + next_option chars ok (assert_equal None)
84 + end);
85 +@@ -67,11 +67,11 @@ let tests = [
86 + expect_error (1, 6) (`Decoding_error ("\x19\xdc", "utf-16le"))
87 + begin fun report ->
88 + let chars = s |> string |> utf_16le ~report in
89 +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));
90 +- next_option chars ok (assert_equal (Some 0x1F419));
91 +- next_option chars ok (assert_equal (Some (Char.code 'b')));
92 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));
93 ++ next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419)));
94 ++ next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'b'))));
95 + next_option chars ok (assert_equal (Some Uutf.u_rep));
96 +- next_n 16 chars ok (assert_equal (List.map Char.code ['a'; 'r']));
97 ++ next_n 16 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['a'; 'r']));
98 + next_option chars ok (assert_equal None);
99 + next_option chars ok (assert_equal None)
100 + end);
101 +@@ -79,7 +79,7 @@ let tests = [
102 + ("encoding.iso_8859_1" >:: fun _ ->
103 + let chars = string "foo\xa0" |> iso_8859_1 in
104 + next_n 4 chars
105 +- ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'; '\xa0']));
106 ++ ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'; '\xa0']));
107 + next_option chars ok (assert_equal None);
108 + next_option chars ok (assert_equal None));
109 +
110 +@@ -88,26 +88,26 @@ let tests = [
111 + expect_error (1, 4) (`Decoding_error ("\xa0", "us-ascii"))
112 + begin fun report ->
113 + let chars = s |> string |> us_ascii ~report in
114 +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));
115 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));
116 + next_option chars ok (assert_equal (Some Uutf.u_rep));
117 +- next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r']));
118 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r']));
119 + next_option chars ok (assert_equal None);
120 + next_option chars ok (assert_equal None)
121 + end);
122 +
123 + ("encoding.windows_1251" >:: fun _ ->
124 + let chars = string "foo\xe0\xe1\xe2bar" |> windows_1251 in
125 +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));
126 +- next_n 3 chars ok (assert_equal [0x0430; 0x0431; 0x0432]);
127 +- next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r']));
128 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));
129 ++ next_n 3 chars ok (assert_equal [Uchar.of_int 0x0430; Uchar.of_int 0x0431; Uchar.of_int 0x0432]);
130 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r']));
131 + next_option chars ok (assert_equal None);
132 + next_option chars ok (assert_equal None));
133 +
134 + ("encoding.windows_1252" >:: fun _ ->
135 + let chars = string "foo\x80\x83bar" |> windows_1252 in
136 +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));
137 +- next_n 2 chars ok (assert_equal [0x20AC; 0x0192]);
138 +- next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r']));
139 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));
140 ++ next_n 2 chars ok (assert_equal [Uchar.of_int 0x20AC; Uchar.of_int 0x0192]);
141 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r']));
142 + next_option chars ok (assert_equal None);
143 + next_option chars ok (assert_equal None));
144 +
145 +@@ -137,7 +137,7 @@ let tests = [
146 +
147 + ("encoding.ebcdic" >:: fun _ ->
148 + let chars = string "\x86\x96\x96" |> ebcdic in
149 +- next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));
150 ++ next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));
151 + next_option chars ok (assert_equal None);
152 + next_option chars ok (assert_equal None));
153 + ]
154 +Index: markup.ml-0.7.2/test/test_html_tokenizer.ml
155 +===================================================================
156 +--- markup.ml-0.7.2.orig/test/test_html_tokenizer.ml
157 ++++ markup.ml-0.7.2/test/test_html_tokenizer.ml
158 +@@ -134,7 +134,7 @@ let tests = [
159 + expect "&#1000000000000000000000000000000;"
160 + [ 1, 1, E (`Bad_token ("&#1000000000000000000000000000000;",
161 + reference, "out of range"));
162 +- 1, 1, S (`Char Uutf.u_rep);
163 ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep));
164 + 1, 35, S `EOF];
165 +
166 + expect "&#1000000000000000000000000000000"
167 +@@ -142,22 +142,22 @@ let tests = [
168 + reference, "missing ';' at end"));
169 + 1, 1, E (`Bad_token ("&#1000000000000000000000000000000",
170 + reference, "out of range"));
171 +- 1, 1, S (`Char Uutf.u_rep);
172 ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep));
173 + 1, 34, S `EOF];
174 +
175 + expect "&#xD800;"
176 + [ 1, 1, E (`Bad_token ("&#xD800;", reference, "out of range"));
177 +- 1, 1, S (`Char Uutf.u_rep);
178 ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep));
179 + 1, 9, S `EOF];
180 +
181 + expect "&#x110000;"
182 + [ 1, 1, E (`Bad_token ("&#x110000;", reference, "out of range"));
183 +- 1, 1, S (`Char Uutf.u_rep);
184 ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep));
185 + 1, 11, S `EOF];
186 +
187 + expect "&#0;"
188 + [ 1, 1, E (`Bad_token ("&#0;", reference, "out of range"));
189 +- 1, 1, S (`Char Uutf.u_rep);
190 ++ 1, 1, S (`Char (Uchar.to_int Uutf.u_rep));
191 + 1, 5, S `EOF];
192 +
193 + expect "&#x01;"
194 +@@ -264,7 +264,7 @@ let tests = [
195 + expect ~state:`RCDATA "f\x00</foo>"
196 + ([ 1, 1, S (`Char 0x66);
197 + 1, 2, E (`Bad_token ("U+0000", "content", "null"));
198 +- 1, 2, S (`Char Uutf.u_rep)] @
199 ++ 1, 2, S (`Char (Uchar.to_int Uutf.u_rep))] @
200 + (char_sequence ~start:3 "</foo>"));
201 +
202 + expect ~state:`RCDATA "<title>f</title >"
203 +@@ -302,7 +302,7 @@ let tests = [
204 + expect ~state:`RAWTEXT "f\x00</foo>"
205 + ([ 1, 1, S (`Char 0x66);
206 + 1, 2, E (`Bad_token ("U+0000", "content", "null"));
207 +- 1, 2, S (`Char Uutf.u_rep)] @
208 ++ 1, 2, S (`Char (Uchar.to_int Uutf.u_rep))] @
209 + (char_sequence ~start:3 "</foo>")));
210 +
211 + ("html.tokenizer.script-data" >:: fun _ ->
212 +@@ -330,7 +330,7 @@ let tests = [
213 + expect ~state:`Script_data "f<!--o\x00o"
214 + ((char_sequence ~no_eof:true "f<!--o") @
215 + [1, 7, E (`Bad_token ("U+0000", "script", "null"));
216 +- 1, 7, S (`Char Uutf.u_rep);
217 ++ 1, 7, S (`Char (Uchar.to_int Uutf.u_rep));
218 + 1, 8, S (`Char 0x6F);
219 + 1, 9, E (`Unexpected_eoi "script");
220 + 1, 9, S `EOF]);
221 +@@ -363,7 +363,7 @@ let tests = [
222 + expect ~state:`Script_data "f<!--a-\x00-"
223 + ((char_sequence ~no_eof:true "f<!--a-") @
224 + [ 1, 8, E (`Bad_token ("U+0000", "script", "null"));
225 +- 1, 8, S (`Char Uutf.u_rep);
226 ++ 1, 8, S (`Char (Uchar.to_int Uutf.u_rep));
227 + 1, 9, S (`Char 0x02D);
228 + 1, 10, E (`Unexpected_eoi "script");
229 + 1, 10, S `EOF]);
230 +@@ -371,7 +371,7 @@ let tests = [
231 + expect ~state:`Script_data "f<!--a--\x00--"
232 + ((char_sequence ~no_eof:true "f<!--a--") @
233 + [ 1, 9, E (`Bad_token ("U+0000", "script", "null"));
234 +- 1, 9, S (`Char Uutf.u_rep);
235 ++ 1, 9, S (`Char (Uchar.to_int Uutf.u_rep));
236 + 1, 10, S (`Char 0x02D);
237 + 1, 11, S (`Char 0x02D);
238 + 1, 12, E (`Unexpected_eoi "script");
239 +@@ -380,14 +380,14 @@ let tests = [
240 + expect ~state:`Script_data "f<!--<script>\x00"
241 + ((char_sequence ~no_eof:true "f<!--<script>") @
242 + [ 1, 14, E (`Bad_token ("U+0000", "script", "null"));
243 +- 1, 14, S (`Char Uutf.u_rep);
244 ++ 1, 14, S (`Char (Uchar.to_int Uutf.u_rep));
245 + 1, 15, E (`Unexpected_eoi "script");
246 + 1, 15, S `EOF]);
247 +
248 + expect ~state:`Script_data "f<!--<script>-\x00-"
249 + ((char_sequence ~no_eof:true "f<!--<script>-") @
250 + [ 1, 15, E (`Bad_token ("U+0000", "script", "null"));
251 +- 1, 15, S (`Char Uutf.u_rep);
252 ++ 1, 15, S (`Char (Uchar.to_int Uutf.u_rep));
253 + 1, 16, S (`Char 0x2D);
254 + 1, 17, E (`Unexpected_eoi "script");
255 + 1, 17, S `EOF]);
256 +@@ -395,7 +395,7 @@ let tests = [
257 + expect ~state:`Script_data "f<!--<script>--\x00--"
258 + ((char_sequence ~no_eof:true "f<!--<script>--") @
259 + [ 1, 16, E (`Bad_token ("U+0000", "script", "null"));
260 +- 1, 16, S (`Char Uutf.u_rep);
261 ++ 1, 16, S (`Char (Uchar.to_int Uutf.u_rep));
262 + 1, 17, S (`Char 0x2D);
263 + 1, 18, S (`Char 0x2D);
264 + 1, 19, E (`Unexpected_eoi "script");
265 +@@ -413,7 +413,7 @@ let tests = [
266 + expect ~state:`Script_data "f\x00</foo>"
267 + ([ 1, 1, S (`Char 0x66);
268 + 1, 2, E (`Bad_token ("U+0000", "content", "null"));
269 +- 1, 2, S (`Char Uutf.u_rep)] @
270 ++ 1, 2, S (`Char (Uchar.to_int Uutf.u_rep))] @
271 + (char_sequence ~start:3 "</foo>")));
272 +
273 + ("html.tokenizer.plaintext" >:: fun _ ->
274 +@@ -424,7 +424,7 @@ let tests = [
275 + expect ~state:`PLAINTEXT "f\x00</foo>"
276 + ([ 1, 1, S (`Char 0x66);
277 + 1, 2, E (`Bad_token ("U+0000", "content", "null"));
278 +- 1, 2, S (`Char Uutf.u_rep)] @
279 ++ 1, 2, S (`Char (Uchar.to_int Uutf.u_rep))] @
280 + (char_sequence ~start:3 "</foo>")));
281 +
282 + ("html.tokenizer.comment" >:: fun _ ->
283 +Index: markup.ml-0.7.2/test/test_input.ml
284 +===================================================================
285 +--- markup.ml-0.7.2.orig/test/test_input.ml
286 ++++ markup.ml-0.7.2/test/test_input.ml
287 +@@ -71,7 +71,7 @@ let tests = [
288 + end);
289 +
290 + ("input.bom" >:: fun _ ->
291 +- [0xFEFF; 0x66]
292 ++ [Uchar.of_int 0xFEFF; Uchar.of_int 0x66]
293 + |> of_list
294 + |> preprocess is_valid_xml_char Error.ignore_errors
295 + |> fst
296
297 diff --git a/dev-ml/markup/files/uutf.patch b/dev-ml/markup/files/uutf.patch
298 new file mode 100644
299 index 00000000..f561084
300 --- /dev/null
301 +++ b/dev-ml/markup/files/uutf.patch
302 @@ -0,0 +1,1085 @@
303 +Index: markup.ml-0.7.2/src/common.ml
304 +===================================================================
305 +--- markup.ml-0.7.2.orig/src/common.ml
306 ++++ markup.ml-0.7.2/src/common.ml
307 +@@ -134,7 +134,7 @@ let is_printable = is_in_range 0x0020 0x
308 + let char c =
309 + if is_printable c then begin
310 + let buffer = Buffer.create 4 in
311 +- add_utf_8 buffer c;
312 ++ add_utf_8 buffer (Uchar.of_int c);
313 + Buffer.contents buffer
314 + end
315 + else
316 +Index: markup.ml-0.7.2/src/detect.ml
317 +===================================================================
318 +--- markup.ml-0.7.2.orig/src/detect.ml
319 ++++ markup.ml-0.7.2/src/detect.ml
320 +@@ -222,7 +222,7 @@ let meta_tag_prescan =
321 + let rec iterate () =
322 + next source throw (fun () -> k "") (function
323 + | c when c = quote -> k (Buffer.contents buffer)
324 +- | c -> add_utf_8 buffer (Char.code (Char.lowercase c)); iterate ())
325 ++ | c -> add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c))); iterate ())
326 + in
327 + iterate ()
328 + in
329 +@@ -236,7 +236,7 @@ let meta_tag_prescan =
330 + push source c;
331 + k (Buffer.contents buffer)
332 + | c ->
333 +- add_utf_8 buffer (Char.code (Char.lowercase c));
334 ++ add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c)));
335 + iterate ())
336 + in
337 + iterate ()
338 +@@ -315,7 +315,7 @@ let meta_tag_prescan =
339 + k (Buffer.contents buffer)
340 +
341 + | Some c ->
342 +- add_utf_8 buffer (Char.code (Char.lowercase c));
343 ++ add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c)));
344 + iterate ()
345 + end
346 + in
347 +Index: markup.ml-0.7.2/src/encoding.ml
348 +===================================================================
349 +--- markup.ml-0.7.2.orig/src/encoding.ml
350 ++++ markup.ml-0.7.2/src/encoding.ml
351 +@@ -4,7 +4,7 @@
352 + open Common
353 + open Kstream
354 +
355 +-type t = ?report:Error.parse_handler -> char Kstream.t -> int Kstream.t
356 ++type t = ?report:Error.parse_handler -> char Kstream.t -> Uchar.t Kstream.t
357 +
358 + let wrap f = fun ?(report = Error.ignore_errors) s -> f report s
359 +
360 +@@ -24,8 +24,8 @@ let _uutf_decoder encoding name =
361 + k Uutf.u_rep)
362 + | `Await ->
363 + next bytes throw
364 +- (fun () -> Uutf.Manual.src decoder "" 0 0; run ())
365 +- (fun c -> Uutf.Manual.src decoder (String.make 1 c) 0 1; run ())
366 ++ (fun () -> Uutf.Manual.src decoder Bytes.empty 0 0; run ())
367 ++ (fun c -> Uutf.Manual.src decoder (Bytes.make 1 c) 0 1; run ())
368 + in
369 + run ())
370 + |> make)
371 +@@ -87,7 +87,7 @@ let _ucs_4_decoder arrange name =
372 + let skip =
373 + if !first then begin
374 + first := false;
375 +- scalar = Uutf.u_bom
376 ++ scalar = Uchar.to_int Uutf.u_bom
377 + end
378 + else
379 + false
380 +@@ -96,9 +96,9 @@ let _ucs_4_decoder arrange name =
381 + if skip then run ()
382 + else
383 + if scalar = 0x000A then
384 +- newline k scalar
385 ++ newline k (Uchar.of_int scalar)
386 + else
387 +- char k scalar
388 ++ char k (Uchar.of_int scalar)
389 +
390 + | [] -> empty ()
391 +
392 +@@ -130,7 +130,7 @@ let code_page table =
393 +
394 + (fun _ bytes ->
395 + (fun throw empty k ->
396 +- next bytes throw empty (fun c -> k table.(Char.code c)))
397 ++ next bytes throw empty (fun c -> k (Uchar.of_int table.(Char.code c))))
398 + |> make)
399 + |> wrap
400 +
401 +Index: markup.ml-0.7.2/src/html_parser.ml
402 +===================================================================
403 +--- markup.ml-0.7.2.orig/src/html_parser.ml
404 ++++ markup.ml-0.7.2/src/html_parser.ml
405 +@@ -1022,7 +1022,7 @@ let parse requested_context report (toke
406 + let frameset_ok = ref true in
407 + let head_seen = ref false in
408 +
409 +- let add_character = Text.add text in
410 ++ let add_character = (fun x y -> Text.add text x (Uchar.of_int y)) in
411 +
412 + set_foreign (fun () ->
413 + Stack.current_element_is_foreign context open_elements);
414 +@@ -2717,7 +2717,7 @@ let parse requested_context report (toke
415 + | l, `Char 0 ->
416 + report l (`Bad_token ("U+0000", "foreign content", "null")) !throw
417 + (fun () ->
418 +- add_character l Uutf.u_rep;
419 ++ add_character l (Uchar.to_int Uutf.u_rep);
420 + mode ())
421 +
422 + | l, `Char (0x0009 | 0x000A | 0x000C | 0x000D | 0x0020 as c) ->
423 +Index: markup.ml-0.7.2/src/html_tokenizer.ml
424 +===================================================================
425 +--- markup.ml-0.7.2.orig/src/html_tokenizer.ml
426 ++++ markup.ml-0.7.2/src/html_tokenizer.ml
427 +@@ -252,7 +252,7 @@ let tokenize report (input, get_location
428 + report location
429 + (`Bad_token (prefix ^ text ^ semicolon, "character reference",
430 + "Windows-1252 character")) !throw (fun () ->
431 +- k (Some (`One n)))
432 ++ k (Some (`One (Uchar.of_int n))))
433 +
434 + else
435 + match n with
436 +@@ -268,9 +268,9 @@ let tokenize report (input, get_location
437 + (`Bad_token (prefix ^ text ^ semicolon,
438 + "character reference",
439 + "invalid HTML character")) !throw (fun () ->
440 +- k (Some (`One n)))
441 ++ k (Some (`One (Uchar.of_int n))))
442 +
443 +- | n -> k (Some (`One n))
444 ++ | n -> k (Some (`One (Uchar.of_int n)))
445 + end
446 + end
447 + in
448 +@@ -366,6 +366,10 @@ let tokenize report (input, get_location
449 + | _ -> unterminated ())
450 + in
451 +
452 ++ let ma = function
453 ++ a, `One x -> (a, `One (Uchar.of_int x))
454 ++ | a, `Two (x,y) -> (a, `Two (Uchar.of_int x, Uchar.of_int y)) in
455 ++
456 + let rec match_named best matched replace candidate =
457 + next_option input !throw (function
458 + | None -> finish best matched replace
459 +@@ -377,8 +381,8 @@ let tokenize report (input, get_location
460 + | `None -> finish best matched (v::replace)
461 + | `Continue -> match_named best matched (v::replace) candidate
462 + | `Match_and_continue m ->
463 +- match_named (Some m) (v::(replace @ matched)) [] candidate
464 +- | `Match m -> finish (Some m) (v::matched) [])
465 ++ match_named (Some (ma m)) (v::(replace @ matched)) [] candidate
466 ++ | `Match m -> finish (Some (ma m)) (v::matched) [])
467 + in
468 + match_named None [] [] "")
469 +
470 +@@ -409,11 +413,11 @@ let tokenize report (input, get_location
471 + emit (l, `Char 0x0026) state
472 +
473 + | Some (`One c) ->
474 +- emit (l, `Char c) state
475 ++ emit (l, `Char (Uchar.to_int c)) state
476 +
477 + | Some (`Two (c, c')) ->
478 +- emit (l, `Char c) (fun () ->
479 +- emit (l, `Char c') state)
480 ++ emit (l, `Char (Uchar.to_int c)) (fun () ->
481 ++ emit (l, `Char (Uchar.to_int c')) state)
482 + end
483 +
484 + (* 8.2.4.3. *)
485 +@@ -427,7 +431,7 @@ let tokenize report (input, get_location
486 +
487 + | Some (l, 0) ->
488 + report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->
489 +- emit (l, `Char Uutf.u_rep) rcdata_state)
490 ++ emit (l, `Char (Uchar.to_int Uutf.u_rep)) rcdata_state)
491 +
492 + | None ->
493 + emit_eof ()
494 +@@ -444,7 +448,7 @@ let tokenize report (input, get_location
495 +
496 + | Some (l, 0) ->
497 + report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->
498 +- emit (l, `Char Uutf.u_rep) rawtext_state)
499 ++ emit (l, `Char (Uchar.to_int Uutf.u_rep)) rawtext_state)
500 +
501 + | None ->
502 + emit_eof ()
503 +@@ -461,7 +465,7 @@ let tokenize report (input, get_location
504 +
505 + | Some (l, 0) ->
506 + report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->
507 +- emit_character l Uutf.u_rep script_data_state)
508 ++ emit_character l (Uchar.to_int Uutf.u_rep) script_data_state)
509 +
510 + | None ->
511 + emit_eof ()
512 +@@ -475,7 +479,7 @@ let tokenize report (input, get_location
513 + next_option input !throw begin function
514 + | Some (l, 0) ->
515 + report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->
516 +- emit (l, `Char Uutf.u_rep) plaintext_state)
517 ++ emit (l, `Char (Uchar.to_int Uutf.u_rep)) plaintext_state)
518 +
519 + | None ->
520 + emit_eof ()
521 +@@ -501,7 +505,7 @@ let tokenize report (input, get_location
522 + end_tag_open_state l' tag
523 +
524 + | Some (_, c) when is_alphabetic c ->
525 +- add_utf_8 tag._tag_name (to_lowercase c);
526 ++ add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));
527 + tag_name_state l' tag
528 +
529 + | Some (_, 0x003F) ->
530 +@@ -529,7 +533,7 @@ let tokenize report (input, get_location
531 +
532 + next_option input !throw begin function
533 + | Some (_, c) when is_alphabetic c ->
534 +- add_utf_8 tag._tag_name (to_lowercase c);
535 ++ add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));
536 + tag_name_state l' tag
537 +
538 + | Some (_, 0x003E) ->
539 +@@ -569,7 +573,7 @@ let tokenize report (input, get_location
540 + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
541 +
542 + | Some (_, c) ->
543 +- add_utf_8 tag._tag_name (to_lowercase c);
544 ++ add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));
545 + tag_name_state l' tag
546 + end
547 +
548 +@@ -589,7 +593,7 @@ let tokenize report (input, get_location
549 + next_option input !throw begin function
550 + | Some (_, c as v) when is_alphabetic c ->
551 + let name_buffer = Buffer.create 32 in
552 +- add_utf_8 name_buffer (to_lowercase c);
553 ++ add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));
554 + text_end_tag_name_state state l' (v::cs) name_buffer
555 +
556 + | maybe_v ->
557 +@@ -618,7 +622,7 @@ let tokenize report (input, get_location
558 + emit_tag l' (create_tag ())
559 +
560 + | Some ((_, c) as v) when is_alphabetic c ->
561 +- add_utf_8 name_buffer (to_lowercase c);
562 ++ add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));
563 + text_end_tag_name_state state l' (v::cs) name_buffer
564 +
565 + | maybe_v ->
566 +@@ -676,7 +680,7 @@ let tokenize report (input, get_location
567 +
568 + | Some (l, 0) ->
569 + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
570 +- emit_character l Uutf.u_rep (fun () ->
571 ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
572 + script_data_escaped_state l'))
573 +
574 + | None ->
575 +@@ -699,7 +703,7 @@ let tokenize report (input, get_location
576 +
577 + | Some (l, 0) ->
578 + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
579 +- emit_character l Uutf.u_rep (fun () ->
580 ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
581 + script_data_escaped_state l'))
582 +
583 + | None ->
584 +@@ -725,7 +729,7 @@ let tokenize report (input, get_location
585 +
586 + | Some (l, 0) ->
587 + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
588 +- emit_character l Uutf.u_rep (fun () ->
589 ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
590 + script_data_escaped_state l'))
591 +
592 + | None ->
593 +@@ -745,7 +749,7 @@ let tokenize report (input, get_location
594 +
595 + | Some (_, c as v) when is_alphabetic c ->
596 + let tag_buffer = Buffer.create 32 in
597 +- add_utf_8 tag_buffer (to_lowercase c);
598 ++ add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));
599 + emit_characters (List.rev (v::cs)) (fun () ->
600 + script_data_double_escape_start_state l' tag_buffer)
601 +
602 +@@ -765,7 +769,7 @@ let tokenize report (input, get_location
603 + else script_data_escaped_state l')
604 +
605 + | Some (l, c) when is_alphabetic c ->
606 +- add_utf_8 tag_buffer (to_lowercase c);
607 ++ add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));
608 + emit_character l c (fun () ->
609 + script_data_double_escape_start_state l' tag_buffer)
610 +
611 +@@ -787,7 +791,7 @@ let tokenize report (input, get_location
612 +
613 + | Some (l, 0) ->
614 + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
615 +- emit_character l Uutf.u_rep (fun () ->
616 ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
617 + script_data_double_escaped_state l'))
618 +
619 + | None ->
620 +@@ -811,7 +815,7 @@ let tokenize report (input, get_location
621 +
622 + | Some (l, 0) ->
623 + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
624 +- emit_character l Uutf.u_rep (fun () ->
625 ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
626 + script_data_double_escaped_state l'))
627 +
628 + | None ->
629 +@@ -838,7 +842,7 @@ let tokenize report (input, get_location
630 +
631 + | Some (l, 0) ->
632 + report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->
633 +- emit_character l Uutf.u_rep (fun () ->
634 ++ emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->
635 + script_data_double_escaped_state l'))
636 +
637 + | None ->
638 +@@ -872,7 +876,7 @@ let tokenize report (input, get_location
639 + else script_data_double_escaped_state l')
640 +
641 + | Some (l, c) when is_alphabetic c ->
642 +- add_utf_8 tag_buffer (to_lowercase c);
643 ++ add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));
644 + emit_character l c (fun () ->
645 + script_data_double_escape_end_state l' tag_buffer)
646 +
647 +@@ -910,10 +914,10 @@ let tokenize report (input, get_location
648 + | Some (l, (0x0022 | 0x0027 | 0x003C | 0x003D as c)) ->
649 + report l (`Bad_token (char c, "attribute name",
650 + "invalid start character")) !throw (fun () ->
651 +- start_attribute c)
652 ++ start_attribute (Uchar.of_int c))
653 +
654 + | Some (_, c) ->
655 +- start_attribute (to_lowercase c)
656 ++ start_attribute (Uchar.of_int (to_lowercase c))
657 + end
658 +
659 + (* 8.2.4.35. *)
660 +@@ -942,14 +946,14 @@ let tokenize report (input, get_location
661 + | Some (l, (0x0022 | 0x0027 | 0x003C as c)) ->
662 + report l (`Bad_token (char c, "attribute name",
663 + "invalid name character")) !throw (fun () ->
664 +- add_utf_8 name_buffer c;
665 ++ add_utf_8 name_buffer (Uchar.of_int c);
666 + attribute_name_state l' tag name_buffer)
667 +
668 + | None ->
669 + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
670 +
671 + | Some (_, c) ->
672 +- add_utf_8 name_buffer (to_lowercase c);
673 ++ add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));
674 + attribute_name_state l' tag name_buffer
675 + end
676 +
677 +@@ -985,13 +989,13 @@ let tokenize report (input, get_location
678 + | Some (l, (0x0022 | 0x0027 | 0x003C as c)) ->
679 + report l (`Bad_token (char c, "attribute name",
680 + "invalid start character")) !throw (fun () ->
681 +- start_next_attribute c)
682 ++ start_next_attribute (Uchar.of_int c))
683 +
684 + | None ->
685 + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
686 +
687 + | Some (_, c) ->
688 +- start_next_attribute (to_lowercase c)
689 ++ start_next_attribute (Uchar.of_int (to_lowercase c))
690 + end
691 +
692 + (* 8.2.4.37. *)
693 +@@ -1030,13 +1034,13 @@ let tokenize report (input, get_location
694 + | Some (l, (0x003C | 0x003D | 0x0060 as c)) ->
695 + report l (`Bad_token (char c, "attribute value",
696 + "invalid start character")) !throw (fun () ->
697 +- start_value attribute_value_unquoted_state (Some c))
698 ++ start_value attribute_value_unquoted_state (Some (Uchar.of_int c)))
699 +
700 + | None ->
701 + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
702 +
703 + | Some (_, c) ->
704 +- start_value attribute_value_unquoted_state (Some c)
705 ++ start_value attribute_value_unquoted_state (Some (Uchar.of_int c))
706 + end
707 +
708 + (* 8.2.4.38 and 8.2.4.39. *)
709 +@@ -1062,7 +1066,7 @@ let tokenize report (input, get_location
710 + data_state
711 +
712 + | Some (_, c) ->
713 +- add_utf_8 value_buffer c;
714 ++ add_utf_8 value_buffer (Uchar.of_int c);
715 + attribute_value_quoted_state quote l' tag name value_buffer
716 + end
717 +
718 +@@ -1092,14 +1096,14 @@ let tokenize report (input, get_location
719 + | Some (l, (0x0022 | 0x0027 | 0x003C | 0x003D | 0x0060 as c)) ->
720 + report l (`Bad_token (char c, "attribute value",
721 + "invalid character")) !throw (fun () ->
722 +- add_utf_8 value_buffer c;
723 ++ add_utf_8 value_buffer (Uchar.of_int c);
724 + attribute_value_unquoted_state l' tag name value_buffer)
725 +
726 + | None ->
727 + report (get_location ()) (`Unexpected_eoi "tag") !throw data_state
728 +
729 + | Some (_, c) ->
730 +- add_utf_8 value_buffer c;
731 ++ add_utf_8 value_buffer (Uchar.of_int c);
732 + attribute_value_unquoted_state l' tag name value_buffer
733 + end
734 +
735 +@@ -1107,7 +1111,7 @@ let tokenize report (input, get_location
736 + and character_reference_in_attribute allowed l value_buffer k =
737 + consume_character_reference true (Some allowed) l begin function
738 + | None ->
739 +- add_utf_8 value_buffer 0x0026;
740 ++ add_utf_8 value_buffer (Uchar.of_int 0x0026);
741 + k ()
742 +
743 + | Some (`One c) ->
744 +@@ -1176,7 +1180,7 @@ let tokenize report (input, get_location
745 + emit_comment l' buffer
746 +
747 + | Some (_, c) ->
748 +- add_utf_8 buffer c;
749 ++ add_utf_8 buffer (Uchar.of_int c);
750 + consume ()
751 + end
752 + in
753 +@@ -1239,7 +1243,7 @@ let tokenize report (input, get_location
754 + emit_comment l' buffer)
755 +
756 + | Some (_, c) ->
757 +- add_utf_8 buffer c;
758 ++ add_utf_8 buffer (Uchar.of_int c);
759 + comment_state l' buffer
760 + end
761 +
762 +@@ -1266,7 +1270,7 @@ let tokenize report (input, get_location
763 +
764 + | Some (_, c) ->
765 + Buffer.add_char buffer '-';
766 +- add_utf_8 buffer c;
767 ++ add_utf_8 buffer (Uchar.of_int c);
768 + comment_state l' buffer
769 + end
770 +
771 +@@ -1286,7 +1290,7 @@ let tokenize report (input, get_location
772 + emit_comment l' buffer)
773 +
774 + | Some (_, c) ->
775 +- add_utf_8 buffer c;
776 ++ add_utf_8 buffer (Uchar.of_int c);
777 + comment_state l' buffer
778 + end
779 +
780 +@@ -1308,7 +1312,7 @@ let tokenize report (input, get_location
781 +
782 + | Some (_, c) ->
783 + Buffer.add_char buffer '-';
784 +- add_utf_8 buffer c;
785 ++ add_utf_8 buffer (Uchar.of_int c);
786 + comment_state l' buffer
787 + end
788 +
789 +@@ -1343,7 +1347,7 @@ let tokenize report (input, get_location
790 + report l (`Bad_token ("--" ^ (char c), "comment",
791 + "'--' should be in '-->'")) !throw (fun () ->
792 + Buffer.add_string buffer "--";
793 +- add_utf_8 buffer c;
794 ++ add_utf_8 buffer (Uchar.of_int c);
795 + comment_state l' buffer)
796 + end
797 +
798 +@@ -1369,7 +1373,7 @@ let tokenize report (input, get_location
799 +
800 + | Some (_, c) ->
801 + Buffer.add_string buffer "--!";
802 +- add_utf_8 buffer c;
803 ++ add_utf_8 buffer (Uchar.of_int c);
804 + comment_state l' buffer
805 + end
806 +
807 +@@ -1420,7 +1424,7 @@ let tokenize report (input, get_location
808 +
809 + | Some (_, c) ->
810 + doctype._doctype_name <-
811 +- add_doctype_char doctype._doctype_name (to_lowercase c);
812 ++ add_doctype_char doctype._doctype_name (Uchar.of_int (to_lowercase c));
813 + doctype_name_state l' doctype
814 + end
815 +
816 +@@ -1445,7 +1449,7 @@ let tokenize report (input, get_location
817 +
818 + | Some (_, c) ->
819 + doctype._doctype_name <-
820 +- add_doctype_char doctype._doctype_name (to_lowercase c);
821 ++ add_doctype_char doctype._doctype_name (Uchar.of_int (to_lowercase c));
822 + doctype_name_state l' doctype
823 + end
824 +
825 +@@ -1574,7 +1578,7 @@ let tokenize report (input, get_location
826 + emit_doctype ~quirks:true l' doctype)
827 +
828 + | Some (_, c) ->
829 +- add doctype c;
830 ++ add doctype (Uchar.of_int c);
831 + doctype_identifier_quoted_state add quote next_state l' doctype
832 + end
833 +
834 +Index: markup.ml-0.7.2/src/html_writer.ml
835 +===================================================================
836 +--- markup.ml-0.7.2.orig/src/html_writer.ml
837 ++++ markup.ml-0.7.2/src/html_writer.ml
838 +@@ -8,7 +8,7 @@ let _escape_attribute s =
839 + Uutf.String.fold_utf_8 (fun () _ -> function
840 + | `Malformed _ -> ()
841 + | `Uchar c ->
842 +- match c with
843 ++ match (Uchar.to_int c) with
844 + | 0x0026 -> Buffer.add_string buffer "&amp;"
845 + | 0x00A0 -> Buffer.add_string buffer "&nbsp;"
846 + | 0x0022 -> Buffer.add_string buffer "&quot;"
847 +@@ -21,7 +21,7 @@ let _escape_text s =
848 + Uutf.String.fold_utf_8 (fun () _ -> function
849 + | `Malformed _ -> ()
850 + | `Uchar c ->
851 +- match c with
852 ++ match (Uchar.to_int c) with
853 + | 0x0026 -> Buffer.add_string buffer "&amp;"
854 + | 0x00A0 -> Buffer.add_string buffer "&nbsp;"
855 + | 0x003C -> Buffer.add_string buffer "&lt;"
856 +Index: markup.ml-0.7.2/src/input.ml
857 +===================================================================
858 +--- markup.ml-0.7.2.orig/src/input.ml
859 ++++ markup.ml-0.7.2/src/input.ml
860 +@@ -27,13 +27,13 @@ let preprocess is_valid_char report sour
861 + in
862 +
863 + let rec iterate () =
864 +- next source throw empty (function
865 ++ next source throw empty (fun x -> match Uchar.to_int x with
866 + | 0xFEFF when !first_char -> first_char := false; iterate ()
867 +
868 + | 0x0D ->
869 +- next source throw newline (function
870 ++ next source throw newline (fun y -> match Uchar.to_int y with
871 + | 0x0A -> newline ()
872 +- | c -> push source c; newline ())
873 ++ | c -> push source (Uchar.of_int c); newline ())
874 +
875 + | 0x0A -> newline ()
876 +
877 +Index: markup.ml-0.7.2/src/input.mli
878 +===================================================================
879 +--- markup.ml-0.7.2.orig/src/input.mli
880 ++++ markup.ml-0.7.2/src/input.mli
881 +@@ -4,5 +4,5 @@
882 + open Common
883 +
884 + val preprocess :
885 +- (int -> bool) -> Error.parse_handler -> int Kstream.t ->
886 ++ (int -> bool) -> Error.parse_handler -> Uchar.t Kstream.t ->
887 + (location * int) Kstream.t * (unit -> location)
888 +Index: markup.ml-0.7.2/src/markup.ml
889 +===================================================================
890 +--- markup.ml-0.7.2.orig/src/markup.ml
891 ++++ markup.ml-0.7.2/src/markup.ml
892 +@@ -187,7 +187,7 @@ sig
893 +
894 + val decode :
895 + ?report:(location -> Error.t -> unit io) -> t ->
896 +- (char, _) stream -> (int, async) stream
897 ++ (char, _) stream -> (Uchar.t, async) stream
898 + end
899 +
900 + val parse_xml :
901 +Index: markup.ml-0.7.2/src/markup.mli
902 +===================================================================
903 +--- markup.ml-0.7.2.orig/src/markup.mli
904 ++++ markup.ml-0.7.2/src/markup.mli
905 +@@ -194,7 +194,7 @@ sig
906 +
907 + val decode :
908 + ?report:(location -> Error.t -> unit) -> t ->
909 +- (char, 's) stream -> (int, 's) stream
910 ++ (char, 's) stream -> (Uchar.t, 's) stream
911 + (** Applies a decoder to a byte stream. Illegal input byte sequences result in
912 + calls to the error handler [~report] with error kind [`Decoding_error].
913 + The illegal bytes are then skipped, and zero or more U+FFFD replacement
914 +@@ -764,7 +764,7 @@ sig
915 +
916 + val decode :
917 + ?report:(location -> Error.t -> unit io) -> Encoding.t ->
918 +- (char, _) stream -> (int, async) stream
919 ++ (char, _) stream -> (Uchar.t, async) stream
920 + end
921 +
922 + (** {2 XML} *)
923 +@@ -838,7 +838,7 @@ val kstream : ('a, _) stream -> 'a Kstre
924 + val of_kstream : 'a Kstream.t -> ('a, _) stream
925 +
926 + val preprocess_input_stream :
927 +- (int, 's) stream -> (location * int, 's) stream * (unit -> location)
928 ++ (Uchar.t, 's) stream -> (location * int, 's) stream * (unit -> location)
929 +
930 + (**/**)
931 +
932 +Index: markup.ml-0.7.2/src/utility.ml
933 +===================================================================
934 +--- markup.ml-0.7.2.orig/src/utility.ml
935 ++++ markup.ml-0.7.2/src/utility.ml
936 +@@ -346,11 +346,11 @@ let xhtml_entity name =
937 +
938 + match lookup 0 with
939 + | `One c ->
940 +- add_utf_8 buffer c;
941 ++ add_utf_8 buffer (Uchar.of_int c);
942 + Some (Buffer.contents buffer)
943 + | `Two (c, c') ->
944 +- add_utf_8 buffer c;
945 +- add_utf_8 buffer c';
946 ++ add_utf_8 buffer (Uchar.of_int c);
947 ++ add_utf_8 buffer (Uchar.of_int c');
948 + Some (Buffer.contents buffer)
949 +
950 + with Exit -> None
951 +Index: markup.ml-0.7.2/src/xml_tokenizer.ml
952 +===================================================================
953 +--- markup.ml-0.7.2.orig/src/xml_tokenizer.ml
954 ++++ markup.ml-0.7.2/src/xml_tokenizer.ml
955 +@@ -101,7 +101,7 @@ let tokenize report resolve_reference (i
956 + end
957 +
958 + | _, c when filter c ->
959 +- add_utf_8 buffer c;
960 ++ add_utf_8 buffer (Uchar.of_int c);
961 + read ()
962 +
963 + | l, c ->
964 +@@ -133,7 +133,7 @@ let tokenize report resolve_reference (i
965 +
966 + | _, c when is_name_start_char c ->
967 + let buffer = Buffer.create 32 in
968 +- add_utf_8 buffer c;
969 ++ add_utf_8 buffer (Uchar.of_int c);
970 + let rec read () =
971 + next input !throw unexpected_eoi begin function
972 + | _, 0x003B ->
973 +@@ -146,7 +146,7 @@ let tokenize report resolve_reference (i
974 + end
975 +
976 + | _, c when is_name_char c ->
977 +- add_utf_8 buffer c;
978 ++ add_utf_8 buffer (Uchar.of_int c);
979 + read ()
980 +
981 + | l, c ->
982 +@@ -218,7 +218,7 @@ let tokenize report resolve_reference (i
983 + report_if (not @@ is_name_start_char c) l (fun () ->
984 + `Bad_token (char c, "attribute", "invalid start character"))
985 + !throw (fun () ->
986 +- add_utf_8 name_buffer c;
987 ++ add_utf_8 name_buffer (Uchar.of_int c);
988 + name_state ())
989 + end
990 +
991 +@@ -235,7 +235,7 @@ let tokenize report resolve_reference (i
992 + report_if (not @@ is_name_start_char c) l (fun () ->
993 + `Bad_token (char c, "attribute", "invalid name character"))
994 + !throw (fun () ->
995 +- add_utf_8 name_buffer c;
996 ++ add_utf_8 name_buffer (Uchar.of_int c);
997 + name_state ())
998 + end
999 +
1000 +@@ -275,14 +275,14 @@ let tokenize report resolve_reference (i
1001 + report l
1002 + (`Bad_token ("&", "attribute", "replace with '&amp;'"))
1003 + !throw (fun () ->
1004 +- add_utf_8 value_buffer 0x0026;
1005 ++ add_utf_8 value_buffer (Uchar.of_int 0x0026);
1006 + state ())
1007 + end
1008 +
1009 + and handle_lt l state =
1010 + report l (`Bad_token ("<", "attribute", "replace with '&lt;'")) !throw
1011 + (fun () ->
1012 +- add_utf_8 value_buffer 0x003C;
1013 ++ add_utf_8 value_buffer (Uchar.of_int 0x003C);
1014 + state ())
1015 +
1016 + and quoted_value_state quote =
1017 +@@ -300,7 +300,7 @@ let tokenize report resolve_reference (i
1018 + quoted_value_state quote)
1019 +
1020 + | _, c ->
1021 +- add_utf_8 value_buffer c;
1022 ++ add_utf_8 value_buffer (Uchar.of_int c);
1023 + quoted_value_state quote
1024 + end
1025 +
1026 +@@ -317,7 +317,7 @@ let tokenize report resolve_reference (i
1027 + handle_lt l unquoted_value_state
1028 +
1029 + | _, c ->
1030 +- add_utf_8 value_buffer c;
1031 ++ add_utf_8 value_buffer (Uchar.of_int c);
1032 + unquoted_value_state ()
1033 + end
1034 +
1035 +@@ -372,7 +372,7 @@ let tokenize report resolve_reference (i
1036 + report_if (not @@ is_name_start_char c) l (fun () ->
1037 + `Bad_token (char c, pi, "invalid start character")) !throw
1038 + (fun () ->
1039 +- add_utf_8 target_buffer c;
1040 ++ add_utf_8 target_buffer (Uchar.of_int c);
1041 + target_state ())
1042 + end
1043 +
1044 +@@ -388,13 +388,13 @@ let tokenize report resolve_reference (i
1045 + report_if (not @@ is_name_char c) l (fun () ->
1046 + `Bad_token (char c, pi, "invalid name character")) !throw
1047 + (fun () ->
1048 +- add_utf_8 target_buffer c;
1049 ++ add_utf_8 target_buffer (Uchar.of_int c);
1050 + target_state ())
1051 + end
1052 +
1053 + and text_state () =
1054 + next' pi finish_pi (fun (_, c) ->
1055 +- add_utf_8 text_buffer c;
1056 ++ add_utf_8 text_buffer (Uchar.of_int c);
1057 + text_state ())
1058 +
1059 + and xml_declaration_state () =
1060 +@@ -572,7 +572,7 @@ let tokenize report resolve_reference (i
1061 + and initial_state () =
1062 + next input !throw (fun () -> emit_eoi ()) begin function
1063 + | l, (0x005D as c) ->
1064 +- add_character l c;
1065 ++ add_character l (Uchar.of_int c);
1066 + one_bracket_state l
1067 +
1068 + | l, 0x003C ->
1069 +@@ -583,7 +583,7 @@ let tokenize report resolve_reference (i
1070 + | None ->
1071 + report l (`Bad_token (char c, "text", "replace with '&amp;'"))
1072 + !throw (fun () ->
1073 +- add_character l c;
1074 ++ add_character l (Uchar.of_int c);
1075 + initial_state ())
1076 +
1077 + | Some s ->
1078 +@@ -591,14 +591,14 @@ let tokenize report resolve_reference (i
1079 + initial_state ())
1080 +
1081 + | l, c ->
1082 +- add_character l c;
1083 ++ add_character l (Uchar.of_int c);
1084 + initial_state ()
1085 + end
1086 +
1087 + and one_bracket_state l' =
1088 + next_option input !throw begin function
1089 + | Some (l, (0x005D as c)) ->
1090 +- add_character l c;
1091 ++ add_character l (Uchar.of_int c);
1092 + two_brackets_state l' l
1093 +
1094 + | v ->
1095 +@@ -611,11 +611,11 @@ let tokenize report resolve_reference (i
1096 + | Some (l, (0x003E as c)) ->
1097 + report l' (`Bad_token ("]]>", "text", "must end a CDATA section"))
1098 + !throw (fun () ->
1099 +- add_character l c;
1100 ++ add_character l (Uchar.of_int c);
1101 + initial_state ())
1102 +
1103 + | Some (l, (0x005D as c)) ->
1104 +- add_character l c;
1105 ++ add_character l (Uchar.of_int c);
1106 + two_brackets_state l'' l
1107 +
1108 + | v ->
1109 +@@ -626,7 +626,7 @@ let tokenize report resolve_reference (i
1110 + and begin_markup_state l' =
1111 + let recover v =
1112 + lt_in_text l' (fun () ->
1113 +- add_character l' 0x003C;
1114 ++ add_character l' (Uchar.of_int 0x003C);
1115 + push_option input v;
1116 + initial_state ())
1117 + in
1118 +@@ -648,7 +648,7 @@ let tokenize report resolve_reference (i
1119 +
1120 + | _, c when is_name_start_char c ->
1121 + let tag_name_buffer = Buffer.create 32 in
1122 +- add_utf_8 tag_name_buffer c;
1123 ++ add_utf_8 tag_name_buffer (Uchar.of_int c);
1124 + start_tag_state l' tag_name_buffer
1125 +
1126 + | l, c as v ->
1127 +@@ -660,7 +660,7 @@ let tokenize report resolve_reference (i
1128 + and start_tag_state l' buffer =
1129 + let recover v =
1130 + lt_in_text l' (fun () ->
1131 +- add_character l' 0x003C;
1132 ++ add_character l' (Uchar.of_int 0x003C);
1133 + add_string l' (Buffer.contents buffer);
1134 + push_option input v;
1135 + initial_state ())
1136 +@@ -680,7 +680,7 @@ let tokenize report resolve_reference (i
1137 + attributes_state l' (Buffer.contents buffer) []
1138 +
1139 + | _, c when is_name_char c ->
1140 +- add_utf_8 buffer c;
1141 ++ add_utf_8 buffer (Uchar.of_int c);
1142 + start_tag_state l' buffer
1143 +
1144 + | l, c as v ->
1145 +@@ -731,8 +731,8 @@ let tokenize report resolve_reference (i
1146 + and end_tag_state l' =
1147 + let recover v =
1148 + lt_in_text l' (fun () ->
1149 +- add_character l' 0x003C;
1150 +- add_character l' 0x002F;
1151 ++ add_character l' (Uchar.of_int 0x003C);
1152 ++ add_character l' (Uchar.of_int 0x002F);
1153 + push_option input v;
1154 + initial_state ())
1155 + in
1156 +@@ -743,7 +743,7 @@ let tokenize report resolve_reference (i
1157 + begin function
1158 + | _, c when is_name_start_char c ->
1159 + let name_buffer = Buffer.create 32 in
1160 +- add_utf_8 name_buffer c;
1161 ++ add_utf_8 name_buffer (Uchar.of_int c);
1162 + end_tag_name_state l' name_buffer
1163 +
1164 + | l, c as v ->
1165 +@@ -755,8 +755,8 @@ let tokenize report resolve_reference (i
1166 + and end_tag_name_state l' buffer =
1167 + let recover v =
1168 + lt_in_text l' (fun () ->
1169 +- add_character l' 0x003C;
1170 +- add_character l' 0x002F;
1171 ++ add_character l' (Uchar.of_int 0x003C);
1172 ++ add_character l' (Uchar.of_int 0x002F);
1173 + add_string l' (Buffer.contents buffer);
1174 + push_option input v;
1175 + initial_state ())
1176 +@@ -773,7 +773,7 @@ let tokenize report resolve_reference (i
1177 + end_tag_whitespace_state false l' (Buffer.contents buffer)
1178 +
1179 + | _, c when is_name_char c ->
1180 +- add_utf_8 buffer c;
1181 ++ add_utf_8 buffer (Uchar.of_int c);
1182 + end_tag_name_state l' buffer
1183 +
1184 + | l, c as v ->
1185 +@@ -821,8 +821,8 @@ let tokenize report resolve_reference (i
1186 +
1187 + | v ->
1188 + bad_comment_start "<!" l' (fun () ->
1189 +- add_character l' 0x003C;
1190 +- add_character l' 0x0021;
1191 ++ add_character l' (Uchar.of_int 0x003C);
1192 ++ add_character l' (Uchar.of_int 0x0021);
1193 + push_option input v;
1194 + initial_state ())
1195 + end
1196 +@@ -834,9 +834,9 @@ let tokenize report resolve_reference (i
1197 +
1198 + | v ->
1199 + bad_comment_start "<!-" l' (fun () ->
1200 +- add_character l' 0x003C;
1201 +- add_character l' 0x0021;
1202 +- add_character l' 0x002D;
1203 ++ add_character l' (Uchar.of_int 0x003C);
1204 ++ add_character l' (Uchar.of_int 0x0021);
1205 ++ add_character l' (Uchar.of_int 0x002D);
1206 + push_option input v;
1207 + initial_state ())
1208 + end
1209 +@@ -852,7 +852,7 @@ let tokenize report resolve_reference (i
1210 + comment_one_dash_state l' l buffer
1211 +
1212 + | _, c ->
1213 +- add_utf_8 buffer c;
1214 ++ add_utf_8 buffer (Uchar.of_int c);
1215 + comment_state l' buffer
1216 + end
1217 +
1218 +@@ -863,8 +863,8 @@ let tokenize report resolve_reference (i
1219 + comment_two_dashes_state false l' l'' buffer
1220 +
1221 + | _, c ->
1222 +- add_utf_8 buffer 0x002D;
1223 +- add_utf_8 buffer c;
1224 ++ add_utf_8 buffer (Uchar.of_int 0x002D);
1225 ++ add_utf_8 buffer (Uchar.of_int c);
1226 + comment_state l' buffer
1227 + end
1228 +
1229 +@@ -883,14 +883,14 @@ let tokenize report resolve_reference (i
1230 +
1231 + | _, 0x002D ->
1232 + recover (fun () ->
1233 +- add_utf_8 buffer 0x002D;
1234 ++ add_utf_8 buffer (Uchar.of_int 0x002D);
1235 + comment_two_dashes_state true l' l'' buffer)
1236 +
1237 + | _, c ->
1238 + recover (fun () ->
1239 +- add_utf_8 buffer 0x002D;
1240 +- add_utf_8 buffer 0x002D;
1241 +- add_utf_8 buffer c;
1242 ++ add_utf_8 buffer (Uchar.of_int 0x002D);
1243 ++ add_utf_8 buffer (Uchar.of_int 0x002D);
1244 ++ add_utf_8 buffer (Uchar.of_int c);
1245 + comment_state l' buffer)
1246 + end
1247 +
1248 +@@ -905,9 +905,9 @@ let tokenize report resolve_reference (i
1249 + !throw (fun () ->
1250 + lt_in_text l' (fun () ->
1251 + push_list input cs;
1252 +- add_character l' 0x003C;
1253 +- add_character l' 0x0021;
1254 +- add_character l' 0x005B;
1255 ++ add_character l' (Uchar.of_int 0x003C);
1256 ++ add_character l' (Uchar.of_int 0x0021);
1257 ++ add_character l' (Uchar.of_int 0x005B);
1258 + initial_state ()))
1259 + end
1260 +
1261 +@@ -918,7 +918,7 @@ let tokenize report resolve_reference (i
1262 + cdata_one_bracket_state l' l
1263 +
1264 + | l, c ->
1265 +- add_character l c;
1266 ++ add_character l (Uchar.of_int c);
1267 + cdata_state l'
1268 + end
1269 +
1270 +@@ -929,8 +929,8 @@ let tokenize report resolve_reference (i
1271 + cdata_two_brackets_state l' l'' l
1272 +
1273 + | l, c ->
1274 +- add_character l'' 0x005D;
1275 +- add_character l c;
1276 ++ add_character l'' (Uchar.of_int 0x005D);
1277 ++ add_character l (Uchar.of_int c);
1278 + cdata_state l'
1279 + end
1280 +
1281 +@@ -941,13 +941,13 @@ let tokenize report resolve_reference (i
1282 + initial_state ()
1283 +
1284 + | l, 0x005D ->
1285 +- add_character l'' 0x005D;
1286 ++ add_character l'' (Uchar.of_int 0x005D);
1287 + cdata_two_brackets_state l' l''' l
1288 +
1289 + | l, c ->
1290 +- add_character l'' 0x005D;
1291 +- add_character l''' 0x005D;
1292 +- add_character l c;
1293 ++ add_character l'' (Uchar.of_int 0x005D);
1294 ++ add_character l''' (Uchar.of_int 0x005D);
1295 ++ add_character l (Uchar.of_int c);
1296 + cdata_state l'
1297 + end
1298 +
1299 +@@ -963,9 +963,9 @@ let tokenize report resolve_reference (i
1300 + !throw (fun () ->
1301 + lt_in_text l' (fun () ->
1302 + push_list input cs;
1303 +- add_character l' 0x003C;
1304 +- add_character l' 0x0021;
1305 +- add_character l' 0x0044;
1306 ++ add_character l' (Uchar.of_int 0x003C);
1307 ++ add_character l' (Uchar.of_int 0x0021);
1308 ++ add_character l' (Uchar.of_int 0x0044);
1309 + initial_state ()))
1310 + end
1311 +
1312 +@@ -980,15 +980,15 @@ let tokenize report resolve_reference (i
1313 + emit_doctype l' buffer initial_state
1314 +
1315 + | _, (0x0022 | 0x0027 as c) ->
1316 +- add_utf_8 buffer c;
1317 ++ add_utf_8 buffer (Uchar.of_int c);
1318 + doctype_quoted_state (fun () -> doctype_state l' buffer) c l' buffer
1319 +
1320 + | _, (0x003C as c) ->
1321 +- add_utf_8 buffer c;
1322 ++ add_utf_8 buffer (Uchar.of_int c);
1323 + doctype_item_state (fun () -> doctype_state l' buffer) l' buffer
1324 +
1325 + | _, c ->
1326 +- add_utf_8 buffer c;
1327 ++ add_utf_8 buffer (Uchar.of_int c);
1328 + doctype_state l' buffer
1329 + end
1330 +
1331 +@@ -996,11 +996,11 @@ let tokenize report resolve_reference (i
1332 + next input !throw (fun () -> unterminated_doctype l' buffer)
1333 + begin function
1334 + | _, c when c = quote ->
1335 +- add_utf_8 buffer c;
1336 ++ add_utf_8 buffer (Uchar.of_int c);
1337 + state ()
1338 +
1339 + | _, c ->
1340 +- add_utf_8 buffer c;
1341 ++ add_utf_8 buffer (Uchar.of_int c);
1342 + doctype_quoted_state state quote l' buffer
1343 + end
1344 +
1345 +@@ -1008,18 +1008,18 @@ let tokenize report resolve_reference (i
1346 + next input !throw (fun () -> unterminated_doctype l' buffer)
1347 + begin function
1348 + | _, (0x0021 as c) ->
1349 +- add_utf_8 buffer c;
1350 ++ add_utf_8 buffer (Uchar.of_int c);
1351 + doctype_declaration_state state l' buffer
1352 +
1353 + | l, (0x003F as c) ->
1354 +- add_utf_8 buffer c;
1355 +- let undo = tap (fun (_, c) -> add_utf_8 buffer c) input in
1356 ++ add_utf_8 buffer (Uchar.of_int c);
1357 ++ let undo = tap (fun (_, c) -> add_utf_8 buffer (Uchar.of_int c)) input in
1358 + parse_declaration_or_processing_instruction l (fun _ ->
1359 + undo ();
1360 + state ())
1361 +
1362 + | _, c ->
1363 +- add_utf_8 buffer c;
1364 ++ add_utf_8 buffer (Uchar.of_int c);
1365 + state ()
1366 + end
1367 +
1368 +@@ -1027,16 +1027,16 @@ let tokenize report resolve_reference (i
1369 + next input !throw (fun () -> unterminated_doctype l' buffer)
1370 + begin function
1371 + | _, (0x003E as c) ->
1372 +- add_utf_8 buffer c;
1373 ++ add_utf_8 buffer (Uchar.of_int c);
1374 + state ()
1375 +
1376 + | _, (0x0022 | 0x0027 as c) ->
1377 +- add_utf_8 buffer c;
1378 ++ add_utf_8 buffer (Uchar.of_int c);
1379 + doctype_quoted_state
1380 + (fun () -> doctype_declaration_state state l' buffer) c l' buffer
1381 +
1382 + | _, c ->
1383 +- add_utf_8 buffer c;
1384 ++ add_utf_8 buffer (Uchar.of_int c);
1385 + doctype_declaration_state state l' buffer
1386 + end
1387 +
1388
1389 diff --git a/dev-ml/markup/markup-0.7.2.ebuild b/dev-ml/markup/markup-0.7.2-r1.ebuild
1390 similarity index 82%
1391 rename from dev-ml/markup/markup-0.7.2.ebuild
1392 rename to dev-ml/markup/markup-0.7.2-r1.ebuild
1393 index 235c575..f70ac55 100644
1394 --- a/dev-ml/markup/markup-0.7.2.ebuild
1395 +++ b/dev-ml/markup/markup-0.7.2-r1.ebuild
1396 @@ -4,21 +4,21 @@
1397
1398 EAPI=5
1399
1400 -inherit findlib
1401 +inherit findlib eutils
1402
1403 DESCRIPTION="Error-recovering streaming HTML5 and XML parsers"
1404 HOMEPAGE="https://github.com/aantron/markup.ml"
1405 SRC_URI="https://github.com/aantron/markup.ml/archive/${PV}.tar.gz -> ${P}.tar.gz"
1406
1407 LICENSE="BSD"
1408 -SLOT="0/${PV}"
1409 +SLOT="0/${PV}p1"
1410 KEYWORDS="~amd64"
1411 IUSE="doc test"
1412
1413 DEPEND="
1414 dev-lang/ocaml:=[ocamlopt]
1415 dev-ml/lwt:=[ocamlopt]
1416 - dev-ml/uutf:=[ocamlopt]
1417 + >=dev-ml/uutf-1.0:=[ocamlopt]
1418 "
1419 RDEPEND="${DEPEND}"
1420 DEPEND="${DEPEND}
1421 @@ -26,6 +26,11 @@ DEPEND="${DEPEND}
1422 dev-ml/ocamlbuild"
1423 S="${WORKDIR}/${PN}.ml-${PV}"
1424
1425 +src_prepare() {
1426 + epatch "${FILESDIR}/uutf.patch" \
1427 + "${FILESDIR}/test.patch"
1428 +}
1429 +
1430 src_compile() {
1431 emake
1432 use doc && emake docs