[gentoo-commits] repo/gentoo:master commit in: dev-ml/markup/files/, dev-ml/markup/ - gentoo-commits

From:	Alexis Ballier <aballier@g.o>
To:	gentoo-commits@l.g.o
Subject:	[gentoo-commits] repo/gentoo:master commit in: dev-ml/markup/files/, dev-ml/markup/
Date:	Mon, 28 Nov 2016 20:27:53
Message-Id:	`1480364853.56c0c3a567b08228699194eb2820a6f59595ab6a.aballier@gentoo`

1

commit:     56c0c3a567b08228699194eb2820a6f59595ab6a

2

Author:     Alexis Ballier <aballier <AT> gentoo <DOT> org>

3

AuthorDate: Mon Nov 28 20:26:55 2016 +0000

4

Commit:     Alexis Ballier <aballier <AT> gentoo <DOT> org>

5

CommitDate: Mon Nov 28 20:27:33 2016 +0000

6

URL:        https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=56c0c3a5

7

8

dev-ml/markup: fix build with uutf 1.0

9

10

Package-Manager: portage-2.3.2

11

12

 dev-ml/markup/files/test.patch                     |  273 +++++

13

 dev-ml/markup/files/uutf.patch                     | 1085 ++++++++++++++++++++

14

 ...{markup-0.7.2.ebuild => markup-0.7.2-r1.ebuild} |   11 +-

15

 3 files changed, 1366 insertions(+), 3 deletions(-)

16

17

diff --git a/dev-ml/markup/files/test.patch b/dev-ml/markup/files/test.patch

18

new file mode 100644

19

index 00000000..f2a5257

20

--- /dev/null

21

+++ b/dev-ml/markup/files/test.patch

22

@@ -0,0 +1,273 @@

23

+Index: markup.ml-0.7.2/test/test_encoding.ml

24

+===================================================================

25

+--- markup.ml-0.7.2.orig/test/test_encoding.ml

26

++++ markup.ml-0.7.2/test/test_encoding.ml

27

+@@ -15,9 +15,9 @@ let test_ucs_4 (f : Encoding.t) name s1

28

+   expect_error (1, 2) (`Decoding_error (bad_bytes, name))

29

+   begin fun report ->

30

+     let chars = s1 |> string |> f ~report in

31

+-    next_option chars ok (assert_equal (Some (Char.code 'f')));

32

++    next_option chars ok (assert_equal (Some ((Uchar.of_int (Char.code 'f')))));

33

+     next_option chars ok (assert_equal (Some Uutf.u_rep));

34

+-    next_option chars ok (assert_equal (Some (Char.code 'o')));

35

++    next_option chars ok (assert_equal (Some ((Uchar.of_int (Char.code 'o')))));

36

+     next_option chars ok (assert_equal None);

37

+     next_option chars ok (assert_equal None)

38

+   end;

39

+@@ -25,9 +25,9 @@ let test_ucs_4 (f : Encoding.t) name s1

40

+   expect_error (2, 2) (`Decoding_error ("\x00\x00\x00", name))

41

+   begin fun report ->

42

+     let chars = s2 |> string |> f ~report in

43

+-    next_option chars ok (assert_equal (Some (Char.code 'f')));

44

+-    next_option chars ok (assert_equal (Some 0x000A));

45

+-    next_option chars ok (assert_equal (Some (Char.code 'o')));

46

++    next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'f'))));

47

++    next_option chars ok (assert_equal (Some (Uchar.of_int 0x000A)));

48

++    next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'o'))));

49

+     next_option chars ok (assert_equal (Some Uutf.u_rep));

50

+     next_option chars ok (assert_equal None);

51

+     next_option chars ok (assert_equal None)

52

+@@ -38,12 +38,12 @@ let tests = [

53

+     let s = "\xef\xbb\xbffoo\xf0\x9f\x90\x99bar\xa0more" in

54

+     expect_error (1, 8) (`Decoding_error ("\xa0", "utf-8")) begin fun report ->

55

+       let chars = s |> string |> utf_8 ~report in

56

+-      next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));

57

+-      next_option chars ok (assert_equal (Some 0x1F419));

58

+-      next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r']));

59

++      next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));

60

++      next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419)));

61

++      next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r']));

62

+       next_option chars ok (assert_equal (Some Uutf.u_rep));

63

+       next_n 4 chars ok

64

+-        (assert_equal (List.map Char.code ['m'; 'o'; 'r'; 'e']));

65

++        (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['m'; 'o'; 'r'; 'e']));

66

+       next_option chars ok (assert_equal None);

67

+       next_option chars ok (assert_equal None)

68

+     end);

69

+@@ -53,11 +53,11 @@ let tests = [

70

+     expect_error (1, 6) (`Decoding_error ("\xdc\x19", "utf-16be"))

71

+     begin fun report ->

72

+       let chars = s |> string |> utf_16be ~report in

73

+-      next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));

74

+-      next_option chars ok (assert_equal (Some 0x1F419));

75

+-      next_option chars ok (assert_equal (Some (Char.code 'b')));

76

++      next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));

77

++      next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419)));

78

++      next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'b'))));

79

+       next_option chars ok (assert_equal (Some Uutf.u_rep));

80

+-      next_n 16 chars ok (assert_equal (List.map Char.code ['a'; 'r']));

81

++      next_n 16 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['a'; 'r']));

82

+       next_option chars ok (assert_equal None);

83

+       next_option chars ok (assert_equal None)

84

+     end);

85

+@@ -67,11 +67,11 @@ let tests = [

86

+     expect_error (1, 6) (`Decoding_error ("\x19\xdc", "utf-16le"))

87

+     begin fun report ->

88

+       let chars = s |> string |> utf_16le ~report in

89

+-      next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));

90

+-      next_option chars ok (assert_equal (Some 0x1F419));

91

+-      next_option chars ok (assert_equal (Some (Char.code 'b')));

92

++      next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));

93

++      next_option chars ok (assert_equal (Some (Uchar.of_int 0x1F419)));

94

++      next_option chars ok (assert_equal (Some (Uchar.of_int (Char.code 'b'))));

95

+       next_option chars ok (assert_equal (Some Uutf.u_rep));

96

+-      next_n 16 chars ok (assert_equal (List.map Char.code ['a'; 'r']));

97

++      next_n 16 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['a'; 'r']));

98

+       next_option chars ok (assert_equal None);

99

+       next_option chars ok (assert_equal None)

100

+     end);

101

+@@ -79,7 +79,7 @@ let tests = [

102

+   ("encoding.iso_8859_1" >:: fun _ ->

103

+     let chars = string "foo\xa0" |> iso_8859_1 in

104

+     next_n 4 chars

105

+-    ok (assert_equal (List.map Char.code ['f'; 'o'; 'o'; '\xa0']));

106

++    ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o'; '\xa0']));

107

+     next_option chars ok (assert_equal None);

108

+     next_option chars ok (assert_equal None));

109

+

110

+@@ -88,26 +88,26 @@ let tests = [

111

+     expect_error (1, 4) (`Decoding_error ("\xa0", "us-ascii"))

112

+     begin fun report ->

113

+       let chars = s |> string |> us_ascii ~report in

114

+-      next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));

115

++      next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));

116

+       next_option chars ok (assert_equal (Some Uutf.u_rep));

117

+-      next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r']));

118

++      next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r']));

119

+       next_option chars ok (assert_equal None);

120

+       next_option chars ok (assert_equal None)

121

+     end);

122

+

123

+   ("encoding.windows_1251" >:: fun _ ->

124

+     let chars = string "foo\xe0\xe1\xe2bar" |> windows_1251 in

125

+-    next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));

126

+-    next_n 3 chars ok (assert_equal [0x0430; 0x0431; 0x0432]);

127

+-    next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r']));

128

++    next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));

129

++    next_n 3 chars ok (assert_equal [Uchar.of_int 0x0430; Uchar.of_int 0x0431; Uchar.of_int 0x0432]);

130

++    next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r']));

131

+     next_option chars ok (assert_equal None);

132

+     next_option chars ok (assert_equal None));

133

+

134

+   ("encoding.windows_1252" >:: fun _ ->

135

+     let chars = string "foo\x80\x83bar" |> windows_1252 in

136

+-    next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));

137

+-    next_n 2 chars ok (assert_equal [0x20AC; 0x0192]);

138

+-    next_n 3 chars ok (assert_equal (List.map Char.code ['b'; 'a'; 'r']));

139

++    next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));

140

++    next_n 2 chars ok (assert_equal [Uchar.of_int 0x20AC; Uchar.of_int 0x0192]);

141

++    next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['b'; 'a'; 'r']));

142

+     next_option chars ok (assert_equal None);

143

+     next_option chars ok (assert_equal None));

144

+

145

+@@ -137,7 +137,7 @@ let tests = [

146

+

147

+   ("encoding.ebcdic" >:: fun _ ->

148

+     let chars = string "\x86\x96\x96" |> ebcdic in

149

+-    next_n 3 chars ok (assert_equal (List.map Char.code ['f'; 'o'; 'o']));

150

++    next_n 3 chars ok (assert_equal (List.map (fun x -> Uchar.of_int (Char.code x)) ['f'; 'o'; 'o']));

151

+     next_option chars ok (assert_equal None);

152

+     next_option chars ok (assert_equal None));

153

+ ]

154

+Index: markup.ml-0.7.2/test/test_html_tokenizer.ml

155

+===================================================================

156

+--- markup.ml-0.7.2.orig/test/test_html_tokenizer.ml

157

++++ markup.ml-0.7.2/test/test_html_tokenizer.ml

158

+@@ -134,7 +134,7 @@ let tests = [

159

+     expect "&#1000000000000000000000000000000;"

160

+       [ 1,  1, E (`Bad_token ("&#1000000000000000000000000000000;",

161

+                               reference, "out of range"));

162

+-        1,  1, S (`Char Uutf.u_rep);

163

++        1,  1, S (`Char (Uchar.to_int Uutf.u_rep));

164

+         1, 35, S  `EOF];

165

+

166

+     expect "&#1000000000000000000000000000000"

167

+@@ -142,22 +142,22 @@ let tests = [

168

+                               reference, "missing ';' at end"));

169

+         1,  1, E (`Bad_token ("&#1000000000000000000000000000000",

170

+                               reference, "out of range"));

171

+-        1,  1, S (`Char Uutf.u_rep);

172

++        1,  1, S (`Char (Uchar.to_int Uutf.u_rep));

173

+         1, 34, S  `EOF];

174

+

175

+     expect "&#xD800;"

176

+       [ 1,  1, E (`Bad_token ("&#xD800;", reference, "out of range"));

177

+-        1,  1, S (`Char Uutf.u_rep);

178

++        1,  1, S (`Char (Uchar.to_int Uutf.u_rep));

179

+         1,  9, S  `EOF];

180

+

181

+     expect "&#x110000;"

182

+       [ 1,  1, E (`Bad_token ("&#x110000;", reference, "out of range"));

183

+-        1,  1, S (`Char Uutf.u_rep);

184

++        1,  1, S (`Char (Uchar.to_int Uutf.u_rep));

185

+         1, 11, S  `EOF];

186

+

187

+     expect "&#0;"

188

+       [ 1,  1, E (`Bad_token ("&#0;", reference, "out of range"));

189

+-        1,  1, S (`Char Uutf.u_rep);

190

++        1,  1, S (`Char (Uchar.to_int Uutf.u_rep));

191

+         1,  5, S  `EOF];

192

+

193

+     expect "&#x01;"

194

+@@ -264,7 +264,7 @@ let tests = [

195

+     expect ~state:`RCDATA "f\x00</foo>"

196

+       ([ 1,  1, S (`Char 0x66);

197

+          1,  2, E (`Bad_token ("U+0000", "content", "null"));

198

+-         1,  2, S (`Char Uutf.u_rep)] @

199

++         1,  2, S (`Char (Uchar.to_int Uutf.u_rep))] @

200

+        (char_sequence ~start:3 "</foo>"));

201

+

202

+     expect ~state:`RCDATA "<title>f</title >"

203

+@@ -302,7 +302,7 @@ let tests = [

204

+     expect ~state:`RAWTEXT "f\x00</foo>"

205

+       ([ 1,  1, S (`Char 0x66);

206

+          1,  2, E (`Bad_token ("U+0000", "content", "null"));

207

+-         1,  2, S (`Char Uutf.u_rep)] @

208

++         1,  2, S (`Char (Uchar.to_int Uutf.u_rep))] @

209

+        (char_sequence ~start:3 "</foo>")));

210

+

211

+   ("html.tokenizer.script-data" >:: fun _ ->

212

+@@ -330,7 +330,7 @@ let tests = [

213

+     expect ~state:`Script_data "f<!--o\x00o"

214

+       ((char_sequence ~no_eof:true "f<!--o") @

215

+        [1,  7, E (`Bad_token ("U+0000", "script", "null"));

216

+-        1,  7, S (`Char Uutf.u_rep);

217

++        1,  7, S (`Char (Uchar.to_int Uutf.u_rep));

218

+         1,  8, S (`Char 0x6F);

219

+         1,  9, E (`Unexpected_eoi "script");

220

+         1,  9, S  `EOF]);

221

+@@ -363,7 +363,7 @@ let tests = [

222

+     expect ~state:`Script_data "f<!--a-\x00-"

223

+       ((char_sequence ~no_eof:true "f<!--a-") @

224

+        [ 1,  8, E (`Bad_token ("U+0000", "script", "null"));

225

+-         1,  8, S (`Char Uutf.u_rep);

226

++         1,  8, S (`Char (Uchar.to_int Uutf.u_rep));

227

+          1,  9, S (`Char 0x02D);

228

+          1, 10, E (`Unexpected_eoi "script");

229

+          1, 10, S  `EOF]);

230

+@@ -371,7 +371,7 @@ let tests = [

231

+     expect ~state:`Script_data "f<!--a--\x00--"

232

+       ((char_sequence ~no_eof:true "f<!--a--") @

233

+        [ 1,  9, E (`Bad_token ("U+0000", "script", "null"));

234

+-         1,  9, S (`Char Uutf.u_rep);

235

++         1,  9, S (`Char (Uchar.to_int Uutf.u_rep));

236

+          1, 10, S (`Char 0x02D);

237

+          1, 11, S (`Char 0x02D);

238

+          1, 12, E (`Unexpected_eoi "script");

239

+@@ -380,14 +380,14 @@ let tests = [

240

+     expect ~state:`Script_data "f<!--<script>\x00"

241

+       ((char_sequence ~no_eof:true "f<!--<script>") @

242

+        [ 1, 14, E (`Bad_token ("U+0000", "script", "null"));

243

+-         1, 14, S (`Char Uutf.u_rep);

244

++         1, 14, S (`Char (Uchar.to_int Uutf.u_rep));

245

+          1, 15, E (`Unexpected_eoi "script");

246

+          1, 15, S  `EOF]);

247

+

248

+     expect ~state:`Script_data "f<!--<script>-\x00-"

249

+       ((char_sequence ~no_eof:true "f<!--<script>-") @

250

+        [ 1, 15, E (`Bad_token ("U+0000", "script", "null"));

251

+-         1, 15, S (`Char Uutf.u_rep);

252

++         1, 15, S (`Char (Uchar.to_int Uutf.u_rep));

253

+          1, 16, S (`Char 0x2D);

254

+          1, 17, E (`Unexpected_eoi "script");

255

+          1, 17, S  `EOF]);

256

+@@ -395,7 +395,7 @@ let tests = [

257

+     expect ~state:`Script_data "f<!--<script>--\x00--"

258

+       ((char_sequence ~no_eof:true "f<!--<script>--") @

259

+        [ 1, 16, E (`Bad_token ("U+0000", "script", "null"));

260

+-         1, 16, S (`Char Uutf.u_rep);

261

++         1, 16, S (`Char (Uchar.to_int Uutf.u_rep));

262

+          1, 17, S (`Char 0x2D);

263

+          1, 18, S (`Char 0x2D);

264

+          1, 19, E (`Unexpected_eoi "script");

265

+@@ -413,7 +413,7 @@ let tests = [

266

+     expect ~state:`Script_data "f\x00</foo>"

267

+       ([ 1,  1, S (`Char 0x66);

268

+          1,  2, E (`Bad_token ("U+0000", "content", "null"));

269

+-         1,  2, S (`Char Uutf.u_rep)] @

270

++         1,  2, S (`Char (Uchar.to_int Uutf.u_rep))] @

271

+        (char_sequence ~start:3 "</foo>")));

272

+

273

+   ("html.tokenizer.plaintext" >:: fun _ ->

274

+@@ -424,7 +424,7 @@ let tests = [

275

+     expect ~state:`PLAINTEXT "f\x00</foo>"

276

+       ([ 1,  1, S (`Char 0x66);

277

+          1,  2, E (`Bad_token ("U+0000", "content", "null"));

278

+-         1,  2, S (`Char Uutf.u_rep)] @

279

++         1,  2, S (`Char (Uchar.to_int Uutf.u_rep))] @

280

+        (char_sequence ~start:3 "</foo>")));

281

+

282

+   ("html.tokenizer.comment" >:: fun _ ->

283

+Index: markup.ml-0.7.2/test/test_input.ml

284

+===================================================================

285

+--- markup.ml-0.7.2.orig/test/test_input.ml

286

++++ markup.ml-0.7.2/test/test_input.ml

287

+@@ -71,7 +71,7 @@ let tests = [

288

+     end);

289

+

290

+   ("input.bom" >:: fun _ ->

291

+-    [0xFEFF; 0x66]

292

++    [Uchar.of_int 0xFEFF; Uchar.of_int 0x66]

293

+     |> of_list

294

+     |> preprocess is_valid_xml_char Error.ignore_errors

295

+     |> fst

296

297

diff --git a/dev-ml/markup/files/uutf.patch b/dev-ml/markup/files/uutf.patch

298

new file mode 100644

299

index 00000000..f561084

300

--- /dev/null

301

+++ b/dev-ml/markup/files/uutf.patch

302

@@ -0,0 +1,1085 @@

303

+Index: markup.ml-0.7.2/src/common.ml

304

+===================================================================

305

+--- markup.ml-0.7.2.orig/src/common.ml

306

++++ markup.ml-0.7.2/src/common.ml

307

+@@ -134,7 +134,7 @@ let is_printable = is_in_range 0x0020 0x

308

+ let char c =

309

+   if is_printable c then begin

310

+     let buffer = Buffer.create 4 in

311

+-    add_utf_8 buffer c;

312

++    add_utf_8 buffer (Uchar.of_int c);

313

+     Buffer.contents buffer

314

+   end

315

+   else

316

+Index: markup.ml-0.7.2/src/detect.ml

317

+===================================================================

318

+--- markup.ml-0.7.2.orig/src/detect.ml

319

++++ markup.ml-0.7.2/src/detect.ml

320

+@@ -222,7 +222,7 @@ let meta_tag_prescan =

321

+     let rec iterate () =

322

+       next source throw (fun () -> k "") (function

323

+         | c when c = quote -> k (Buffer.contents buffer)

324

+-        | c -> add_utf_8 buffer (Char.code (Char.lowercase c)); iterate ())

325

++        | c -> add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c))); iterate ())

326

+     in

327

+     iterate ()

328

+   in

329

+@@ -236,7 +236,7 @@ let meta_tag_prescan =

330

+           push source c;

331

+           k (Buffer.contents buffer)

332

+         | c ->

333

+-          add_utf_8 buffer (Char.code (Char.lowercase c));

334

++          add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c)));

335

+           iterate ())

336

+     in

337

+     iterate ()

338

+@@ -315,7 +315,7 @@ let meta_tag_prescan =

339

+               k (Buffer.contents buffer)

340

+

341

+             | Some c ->

342

+-              add_utf_8 buffer (Char.code (Char.lowercase c));

343

++              add_utf_8 buffer (Uchar.of_int (Char.code (Char.lowercase c)));

344

+               iterate ()

345

+           end

346

+         in

347

+Index: markup.ml-0.7.2/src/encoding.ml

348

+===================================================================

349

+--- markup.ml-0.7.2.orig/src/encoding.ml

350

++++ markup.ml-0.7.2/src/encoding.ml

351

+@@ -4,7 +4,7 @@

352

+ open Common

353

+ open Kstream

354

+

355

+-type t = ?report:Error.parse_handler -> char Kstream.t -> int Kstream.t

356

++type t = ?report:Error.parse_handler -> char Kstream.t -> Uchar.t Kstream.t

357

+

358

+ let wrap f = fun ?(report = Error.ignore_errors) s -> f report s

359

+

360

+@@ -24,8 +24,8 @@ let _uutf_decoder encoding name =

361

+           k Uutf.u_rep)

362

+         | `Await ->

363

+           next bytes throw

364

+-            (fun () -> Uutf.Manual.src decoder "" 0 0; run ())

365

+-            (fun c -> Uutf.Manual.src decoder (String.make 1 c) 0 1; run ())

366

++            (fun () -> Uutf.Manual.src decoder Bytes.empty 0 0; run ())

367

++            (fun c -> Uutf.Manual.src decoder (Bytes.make 1 c) 0 1; run ())

368

+       in

369

+       run ())

370

+     |> make)

371

+@@ -87,7 +87,7 @@ let _ucs_4_decoder arrange name =

372

+               let skip =

373

+                 if !first then begin

374

+                   first := false;

375

+-                  scalar = Uutf.u_bom

376

++                  scalar = Uchar.to_int Uutf.u_bom

377

+                 end

378

+                 else

379

+                   false

380

+@@ -96,9 +96,9 @@ let _ucs_4_decoder arrange name =

381

+               if skip then run ()

382

+               else

383

+                 if scalar = 0x000A then

384

+-                  newline k scalar

385

++                  newline k (Uchar.of_int scalar)

386

+                 else

387

+-                  char k scalar

388

++                  char k (Uchar.of_int scalar)

389

+

390

+           | [] -> empty ()

391

+

392

+@@ -130,7 +130,7 @@ let code_page table =

393

+

394

+   (fun _ bytes ->

395

+     (fun throw empty k ->

396

+-      next bytes throw empty (fun c -> k table.(Char.code c)))

397

++      next bytes throw empty (fun c -> k (Uchar.of_int table.(Char.code c))))

398

+     |> make)

399

+   |> wrap

400

+

401

+Index: markup.ml-0.7.2/src/html_parser.ml

402

+===================================================================

403

+--- markup.ml-0.7.2.orig/src/html_parser.ml

404

++++ markup.ml-0.7.2/src/html_parser.ml

405

+@@ -1022,7 +1022,7 @@ let parse requested_context report (toke

406

+   let frameset_ok = ref true in

407

+   let head_seen = ref false in

408

+

409

+-  let add_character = Text.add text in

410

++  let add_character = (fun x y -> Text.add text x (Uchar.of_int y)) in

411

+

412

+   set_foreign (fun () ->

413

+     Stack.current_element_is_foreign context open_elements);

414

+@@ -2717,7 +2717,7 @@ let parse requested_context report (toke

415

+     | l, `Char 0 ->

416

+       report l (`Bad_token ("U+0000", "foreign content", "null")) !throw

417

+         (fun () ->

418

+-      add_character l Uutf.u_rep;

419

++      add_character l (Uchar.to_int Uutf.u_rep);

420

+       mode ())

421

+

422

+     | l, `Char (0x0009 | 0x000A | 0x000C | 0x000D | 0x0020 as c) ->

423

+Index: markup.ml-0.7.2/src/html_tokenizer.ml

424

+===================================================================

425

+--- markup.ml-0.7.2.orig/src/html_tokenizer.ml

426

++++ markup.ml-0.7.2/src/html_tokenizer.ml

427

+@@ -252,7 +252,7 @@ let tokenize report (input, get_location

428

+                 report location

429

+                   (`Bad_token (prefix ^ text ^ semicolon, "character reference",

430

+                                "Windows-1252 character")) !throw (fun () ->

431

+-                k (Some (`One n)))

432

++                k (Some (`One (Uchar.of_int n))))

433

+

434

+               else

435

+                 match n with

436

+@@ -268,9 +268,9 @@ let tokenize report (input, get_location

437

+                     (`Bad_token (prefix ^ text ^ semicolon,

438

+                                  "character reference",

439

+                                  "invalid HTML character")) !throw (fun () ->

440

+-                  k (Some (`One n)))

441

++                  k (Some (`One (Uchar.of_int n))))

442

+

443

+-                | n -> k (Some (`One n))

444

++                | n -> k (Some (`One (Uchar.of_int n)))

445

+               end

446

+             end

447

+         in

448

+@@ -366,6 +366,10 @@ let tokenize report (input, get_location

449

+                   | _ -> unterminated ())

450

+         in

451

+

452

++	let ma = function

453

++	a, `One x -> (a, `One (Uchar.of_int x))

454

++	| a, `Two (x,y) -> (a, `Two (Uchar.of_int x, Uchar.of_int y)) in

455

++

456

+         let rec match_named best matched replace candidate =

457

+           next_option input !throw (function

458

+             | None -> finish best matched replace

459

+@@ -377,8 +381,8 @@ let tokenize report (input, get_location

460

+               | `None -> finish best matched (v::replace)

461

+               | `Continue -> match_named best matched (v::replace) candidate

462

+               | `Match_and_continue m ->

463

+-                match_named (Some m) (v::(replace @ matched)) [] candidate

464

+-              | `Match m -> finish (Some m) (v::matched) [])

465

++                match_named (Some (ma m)) (v::(replace @ matched)) [] candidate

466

++              | `Match m -> finish (Some (ma m)) (v::matched) [])

467

+         in

468

+         match_named None [] [] "")

469

+

470

+@@ -409,11 +413,11 @@ let tokenize report (input, get_location

471

+         emit (l, `Char 0x0026) state

472

+

473

+       | Some (`One c) ->

474

+-        emit (l, `Char c) state

475

++        emit (l, `Char (Uchar.to_int c)) state

476

+

477

+       | Some (`Two (c, c')) ->

478

+-        emit (l, `Char c) (fun () ->

479

+-        emit (l, `Char c') state)

480

++        emit (l, `Char (Uchar.to_int c)) (fun () ->

481

++        emit (l, `Char (Uchar.to_int c')) state)

482

+     end

483

+

484

+   (* 8.2.4.3. *)

485

+@@ -427,7 +431,7 @@ let tokenize report (input, get_location

486

+

487

+       | Some (l, 0) ->

488

+         report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->

489

+-        emit (l, `Char Uutf.u_rep) rcdata_state)

490

++        emit (l, `Char (Uchar.to_int Uutf.u_rep)) rcdata_state)

491

+

492

+       | None ->

493

+         emit_eof ()

494

+@@ -444,7 +448,7 @@ let tokenize report (input, get_location

495

+

496

+       | Some (l, 0) ->

497

+         report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->

498

+-        emit (l, `Char Uutf.u_rep) rawtext_state)

499

++        emit (l, `Char (Uchar.to_int Uutf.u_rep)) rawtext_state)

500

+

501

+       | None ->

502

+         emit_eof ()

503

+@@ -461,7 +465,7 @@ let tokenize report (input, get_location

504

+

505

+       | Some (l, 0) ->

506

+         report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->

507

+-        emit_character l Uutf.u_rep script_data_state)

508

++        emit_character l (Uchar.to_int Uutf.u_rep) script_data_state)

509

+

510

+       | None ->

511

+         emit_eof ()

512

+@@ -475,7 +479,7 @@ let tokenize report (input, get_location

513

+     next_option input !throw begin function

514

+       | Some (l, 0) ->

515

+         report l (`Bad_token ("U+0000", "content", "null")) !throw (fun () ->

516

+-        emit (l, `Char Uutf.u_rep) plaintext_state)

517

++        emit (l, `Char (Uchar.to_int Uutf.u_rep)) plaintext_state)

518

+

519

+       | None ->

520

+         emit_eof ()

521

+@@ -501,7 +505,7 @@ let tokenize report (input, get_location

522

+         end_tag_open_state l' tag

523

+

524

+       | Some (_, c) when is_alphabetic c ->

525

+-        add_utf_8 tag._tag_name (to_lowercase c);

526

++        add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));

527

+         tag_name_state l' tag

528

+

529

+       | Some (_, 0x003F) ->

530

+@@ -529,7 +533,7 @@ let tokenize report (input, get_location

531

+

532

+     next_option input !throw begin function

533

+       | Some (_, c) when is_alphabetic c ->

534

+-        add_utf_8 tag._tag_name (to_lowercase c);

535

++        add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));

536

+         tag_name_state l' tag

537

+

538

+       | Some (_, 0x003E) ->

539

+@@ -569,7 +573,7 @@ let tokenize report (input, get_location

540

+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state

541

+

542

+       | Some (_, c) ->

543

+-        add_utf_8 tag._tag_name (to_lowercase c);

544

++        add_utf_8 tag._tag_name (Uchar.of_int (to_lowercase c));

545

+         tag_name_state l' tag

546

+     end

547

+

548

+@@ -589,7 +593,7 @@ let tokenize report (input, get_location

549

+     next_option input !throw begin function

550

+       | Some (_, c as v) when is_alphabetic c ->

551

+         let name_buffer = Buffer.create 32 in

552

+-        add_utf_8 name_buffer (to_lowercase c);

553

++        add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));

554

+         text_end_tag_name_state state l' (v::cs) name_buffer

555

+

556

+       | maybe_v ->

557

+@@ -618,7 +622,7 @@ let tokenize report (input, get_location

558

+         emit_tag l' (create_tag ())

559

+

560

+       | Some ((_, c) as v) when is_alphabetic c ->

561

+-        add_utf_8 name_buffer (to_lowercase c);

562

++        add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));

563

+         text_end_tag_name_state state l' (v::cs) name_buffer

564

+

565

+       | maybe_v ->

566

+@@ -676,7 +680,7 @@ let tokenize report (input, get_location

567

+

568

+       | Some (l, 0) ->

569

+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->

570

+-        emit_character l Uutf.u_rep (fun () ->

571

++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->

572

+         script_data_escaped_state l'))

573

+

574

+       | None ->

575

+@@ -699,7 +703,7 @@ let tokenize report (input, get_location

576

+

577

+       | Some (l, 0) ->

578

+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->

579

+-        emit_character l Uutf.u_rep (fun () ->

580

++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->

581

+         script_data_escaped_state l'))

582

+

583

+       | None ->

584

+@@ -725,7 +729,7 @@ let tokenize report (input, get_location

585

+

586

+       | Some (l, 0) ->

587

+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->

588

+-        emit_character l Uutf.u_rep (fun () ->

589

++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->

590

+         script_data_escaped_state l'))

591

+

592

+       | None ->

593

+@@ -745,7 +749,7 @@ let tokenize report (input, get_location

594

+

595

+       | Some (_, c as v) when is_alphabetic c ->

596

+         let tag_buffer = Buffer.create 32 in

597

+-        add_utf_8 tag_buffer (to_lowercase c);

598

++        add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));

599

+         emit_characters (List.rev (v::cs)) (fun () ->

600

+         script_data_double_escape_start_state l' tag_buffer)

601

+

602

+@@ -765,7 +769,7 @@ let tokenize report (input, get_location

603

+         else script_data_escaped_state l')

604

+

605

+       | Some (l, c) when is_alphabetic c ->

606

+-        add_utf_8 tag_buffer (to_lowercase c);

607

++        add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));

608

+         emit_character l c (fun () ->

609

+         script_data_double_escape_start_state l' tag_buffer)

610

+

611

+@@ -787,7 +791,7 @@ let tokenize report (input, get_location

612

+

613

+       | Some (l, 0) ->

614

+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->

615

+-        emit_character l Uutf.u_rep (fun () ->

616

++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->

617

+         script_data_double_escaped_state l'))

618

+

619

+       | None ->

620

+@@ -811,7 +815,7 @@ let tokenize report (input, get_location

621

+

622

+       | Some (l, 0) ->

623

+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->

624

+-        emit_character l Uutf.u_rep (fun () ->

625

++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->

626

+         script_data_double_escaped_state l'))

627

+

628

+       | None ->

629

+@@ -838,7 +842,7 @@ let tokenize report (input, get_location

630

+

631

+       | Some (l, 0) ->

632

+         report l (`Bad_token ("U+0000", "script", "null")) !throw (fun () ->

633

+-        emit_character l Uutf.u_rep (fun () ->

634

++        emit_character l (Uchar.to_int Uutf.u_rep) (fun () ->

635

+         script_data_double_escaped_state l'))

636

+

637

+       | None ->

638

+@@ -872,7 +876,7 @@ let tokenize report (input, get_location

639

+         else script_data_double_escaped_state l')

640

+

641

+       | Some (l, c) when is_alphabetic c ->

642

+-        add_utf_8 tag_buffer (to_lowercase c);

643

++        add_utf_8 tag_buffer (Uchar.of_int (to_lowercase c));

644

+         emit_character l c (fun () ->

645

+         script_data_double_escape_end_state l' tag_buffer)

646

+

647

+@@ -910,10 +914,10 @@ let tokenize report (input, get_location

648

+       | Some (l, (0x0022 | 0x0027 | 0x003C | 0x003D as c)) ->

649

+         report l (`Bad_token (char c, "attribute name",

650

+                               "invalid start character")) !throw (fun () ->

651

+-        start_attribute c)

652

++        start_attribute (Uchar.of_int c))

653

+

654

+       | Some (_, c) ->

655

+-        start_attribute (to_lowercase c)

656

++        start_attribute (Uchar.of_int (to_lowercase c))

657

+     end

658

+

659

+   (* 8.2.4.35. *)

660

+@@ -942,14 +946,14 @@ let tokenize report (input, get_location

661

+       | Some (l, (0x0022 | 0x0027 | 0x003C as c)) ->

662

+         report l (`Bad_token (char c, "attribute name",

663

+                               "invalid name character")) !throw (fun () ->

664

+-        add_utf_8 name_buffer c;

665

++        add_utf_8 name_buffer (Uchar.of_int c);

666

+         attribute_name_state l' tag name_buffer)

667

+

668

+       | None ->

669

+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state

670

+

671

+       | Some (_, c) ->

672

+-        add_utf_8 name_buffer (to_lowercase c);

673

++        add_utf_8 name_buffer (Uchar.of_int (to_lowercase c));

674

+         attribute_name_state l' tag name_buffer

675

+     end

676

+

677

+@@ -985,13 +989,13 @@ let tokenize report (input, get_location

678

+       | Some (l, (0x0022 | 0x0027 | 0x003C as c)) ->

679

+         report l (`Bad_token (char c, "attribute name",

680

+                               "invalid start character")) !throw (fun () ->

681

+-        start_next_attribute c)

682

++        start_next_attribute (Uchar.of_int c))

683

+

684

+       | None ->

685

+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state

686

+

687

+       | Some (_, c) ->

688

+-        start_next_attribute (to_lowercase c)

689

++        start_next_attribute (Uchar.of_int (to_lowercase c))

690

+     end

691

+

692

+   (* 8.2.4.37. *)

693

+@@ -1030,13 +1034,13 @@ let tokenize report (input, get_location

694

+       | Some (l, (0x003C | 0x003D | 0x0060 as c)) ->

695

+         report l (`Bad_token (char c, "attribute value",

696

+                               "invalid start character")) !throw (fun () ->

697

+-        start_value attribute_value_unquoted_state (Some c))

698

++        start_value attribute_value_unquoted_state (Some (Uchar.of_int c)))

699

+

700

+       | None ->

701

+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state

702

+

703

+       | Some (_, c) ->

704

+-        start_value attribute_value_unquoted_state (Some c)

705

++        start_value attribute_value_unquoted_state (Some (Uchar.of_int c))

706

+     end

707

+

708

+   (* 8.2.4.38 and 8.2.4.39. *)

709

+@@ -1062,7 +1066,7 @@ let tokenize report (input, get_location

710

+           data_state

711

+

712

+       | Some (_, c) ->

713

+-        add_utf_8 value_buffer c;

714

++        add_utf_8 value_buffer (Uchar.of_int c);

715

+         attribute_value_quoted_state quote l' tag name value_buffer

716

+     end

717

+

718

+@@ -1092,14 +1096,14 @@ let tokenize report (input, get_location

719

+       | Some (l, (0x0022 | 0x0027 | 0x003C | 0x003D | 0x0060 as c)) ->

720

+         report l (`Bad_token (char c, "attribute value",

721

+                               "invalid character")) !throw (fun () ->

722

+-        add_utf_8 value_buffer c;

723

++        add_utf_8 value_buffer (Uchar.of_int c);

724

+         attribute_value_unquoted_state l' tag name value_buffer)

725

+

726

+       | None ->

727

+         report (get_location ()) (`Unexpected_eoi "tag") !throw data_state

728

+

729

+       | Some (_, c) ->

730

+-        add_utf_8 value_buffer c;

731

++        add_utf_8 value_buffer (Uchar.of_int c);

732

+         attribute_value_unquoted_state l' tag name value_buffer

733

+     end

734

+

735

+@@ -1107,7 +1111,7 @@ let tokenize report (input, get_location

736

+   and character_reference_in_attribute allowed l value_buffer k =

737

+     consume_character_reference true (Some allowed) l begin function

738

+       | None ->

739

+-        add_utf_8 value_buffer 0x0026;

740

++        add_utf_8 value_buffer (Uchar.of_int 0x0026);

741

+         k ()

742

+

743

+       | Some (`One c) ->

744

+@@ -1176,7 +1180,7 @@ let tokenize report (input, get_location

745

+           emit_comment l' buffer

746

+

747

+         | Some (_, c) ->

748

+-          add_utf_8 buffer c;

749

++          add_utf_8 buffer (Uchar.of_int c);

750

+           consume ()

751

+       end

752

+     in

753

+@@ -1239,7 +1243,7 @@ let tokenize report (input, get_location

754

+         emit_comment l' buffer)

755

+

756

+       | Some (_, c) ->

757

+-        add_utf_8 buffer c;

758

++        add_utf_8 buffer (Uchar.of_int c);

759

+         comment_state l' buffer

760

+     end

761

+

762

+@@ -1266,7 +1270,7 @@ let tokenize report (input, get_location

763

+

764

+       | Some (_, c) ->

765

+         Buffer.add_char buffer '-';

766

+-        add_utf_8 buffer c;

767

++        add_utf_8 buffer (Uchar.of_int c);

768

+         comment_state l' buffer

769

+     end

770

+

771

+@@ -1286,7 +1290,7 @@ let tokenize report (input, get_location

772

+         emit_comment l' buffer)

773

+

774

+       | Some (_, c) ->

775

+-        add_utf_8 buffer c;

776

++        add_utf_8 buffer (Uchar.of_int c);

777

+         comment_state l' buffer

778

+     end

779

+

780

+@@ -1308,7 +1312,7 @@ let tokenize report (input, get_location

781

+

782

+       | Some (_, c) ->

783

+         Buffer.add_char buffer '-';

784

+-        add_utf_8 buffer c;

785

++        add_utf_8 buffer (Uchar.of_int c);

786

+         comment_state l' buffer

787

+     end

788

+

789

+@@ -1343,7 +1347,7 @@ let tokenize report (input, get_location

790

+         report l (`Bad_token ("--" ^ (char c), "comment",

791

+                               "'--' should be in '-->'")) !throw (fun () ->

792

+         Buffer.add_string buffer "--";

793

+-        add_utf_8 buffer c;

794

++        add_utf_8 buffer (Uchar.of_int c);

795

+         comment_state l' buffer)

796

+     end

797

+

798

+@@ -1369,7 +1373,7 @@ let tokenize report (input, get_location

799

+

800

+       | Some (_, c) ->

801

+         Buffer.add_string buffer "--!";

802

+-        add_utf_8 buffer c;

803

++        add_utf_8 buffer (Uchar.of_int c);

804

+         comment_state l' buffer

805

+     end

806

+

807

+@@ -1420,7 +1424,7 @@ let tokenize report (input, get_location

808

+

809

+       | Some (_, c) ->

810

+         doctype._doctype_name <-

811

+-          add_doctype_char doctype._doctype_name (to_lowercase c);

812

++          add_doctype_char doctype._doctype_name (Uchar.of_int (to_lowercase c));

813

+         doctype_name_state l' doctype

814

+     end

815

+

816

+@@ -1445,7 +1449,7 @@ let tokenize report (input, get_location

817

+

818

+       | Some (_, c) ->

819

+         doctype._doctype_name <-

820

+-          add_doctype_char doctype._doctype_name (to_lowercase c);

821

++          add_doctype_char doctype._doctype_name (Uchar.of_int (to_lowercase c));

822

+         doctype_name_state l' doctype

823

+     end

824

+

825

+@@ -1574,7 +1578,7 @@ let tokenize report (input, get_location

826

+         emit_doctype ~quirks:true l' doctype)

827

+

828

+       | Some (_, c) ->

829

+-        add doctype c;

830

++        add doctype (Uchar.of_int c);

831

+         doctype_identifier_quoted_state add quote next_state l' doctype

832

+     end

833

+

834

+Index: markup.ml-0.7.2/src/html_writer.ml

835

+===================================================================

836

+--- markup.ml-0.7.2.orig/src/html_writer.ml

837

++++ markup.ml-0.7.2/src/html_writer.ml

838

+@@ -8,7 +8,7 @@ let _escape_attribute s =

839

+   Uutf.String.fold_utf_8 (fun () _ -> function

840

+     | `Malformed _ -> ()

841

+     | `Uchar c ->

842

+-      match c with

843

++      match (Uchar.to_int c) with

844

+       | 0x0026 -> Buffer.add_string buffer "&amp;"

845

+       | 0x00A0 -> Buffer.add_string buffer "&nbsp;"

846

+       | 0x0022 -> Buffer.add_string buffer "&quot;"

847

+@@ -21,7 +21,7 @@ let _escape_text s =

848

+   Uutf.String.fold_utf_8 (fun () _ -> function

849

+     | `Malformed _ -> ()

850

+     | `Uchar c ->

851

+-      match c with

852

++      match (Uchar.to_int c) with

853

+       | 0x0026 -> Buffer.add_string buffer "&amp;"

854

+       | 0x00A0 -> Buffer.add_string buffer "&nbsp;"

855

+       | 0x003C -> Buffer.add_string buffer "&lt;"

856

+Index: markup.ml-0.7.2/src/input.ml

857

+===================================================================

858

+--- markup.ml-0.7.2.orig/src/input.ml

859

++++ markup.ml-0.7.2/src/input.ml

860

+@@ -27,13 +27,13 @@ let preprocess is_valid_char report sour

861

+       in

862

+

863

+       let rec iterate () =

864

+-        next source throw empty (function

865

++        next source throw empty (fun x -> match Uchar.to_int x with

866

+           | 0xFEFF when !first_char -> first_char := false; iterate ()

867

+

868

+           | 0x0D ->

869

+-            next source throw newline (function

870

++            next source throw newline (fun y -> match Uchar.to_int y with

871

+               | 0x0A -> newline ()

872

+-              | c -> push source c; newline ())

873

++              | c -> push source (Uchar.of_int c); newline ())

874

+

875

+           | 0x0A -> newline ()

876

+

877

+Index: markup.ml-0.7.2/src/input.mli

878

+===================================================================

879

+--- markup.ml-0.7.2.orig/src/input.mli

880

++++ markup.ml-0.7.2/src/input.mli

881

+@@ -4,5 +4,5 @@

882

+ open Common

883

+

884

+ val preprocess :

885

+-  (int -> bool) -> Error.parse_handler -> int Kstream.t ->

886

++  (int -> bool) -> Error.parse_handler -> Uchar.t Kstream.t ->

887

+     (location * int) Kstream.t * (unit -> location)

888

+Index: markup.ml-0.7.2/src/markup.ml

889

+===================================================================

890

+--- markup.ml-0.7.2.orig/src/markup.ml

891

++++ markup.ml-0.7.2/src/markup.ml

892

+@@ -187,7 +187,7 @@ sig

893

+

894

+     val decode :

895

+       ?report:(location -> Error.t -> unit io) -> t ->

896

+-      (char, _) stream -> (int, async) stream

897

++      (char, _) stream -> (Uchar.t, async) stream

898

+   end

899

+

900

+   val parse_xml :

901

+Index: markup.ml-0.7.2/src/markup.mli

902

+===================================================================

903

+--- markup.ml-0.7.2.orig/src/markup.mli

904

++++ markup.ml-0.7.2/src/markup.mli

905

+@@ -194,7 +194,7 @@ sig

906

+

907

+   val decode :

908

+     ?report:(location -> Error.t -> unit) -> t ->

909

+-    (char, 's) stream -> (int, 's) stream

910

++    (char, 's) stream -> (Uchar.t, 's) stream

911

+   (** Applies a decoder to a byte stream. Illegal input byte sequences result in

912

+       calls to the error handler [~report] with error kind [`Decoding_error].

913

+       The illegal bytes are then skipped, and zero or more U+FFFD replacement

914

+@@ -764,7 +764,7 @@ sig

915

+

916

+     val decode :

917

+       ?report:(location -> Error.t -> unit io) -> Encoding.t ->

918

+-      (char, _) stream -> (int, async) stream

919

++      (char, _) stream -> (Uchar.t, async) stream

920

+   end

921

+

922

+   (** {2 XML} *)

923

+@@ -838,7 +838,7 @@ val kstream : ('a, _) stream -> 'a Kstre

924

+ val of_kstream : 'a Kstream.t -> ('a, _) stream

925

+

926

+ val preprocess_input_stream :

927

+-  (int, 's) stream -> (location * int, 's) stream * (unit -> location)

928

++  (Uchar.t, 's) stream -> (location * int, 's) stream * (unit -> location)

929

+

930

+ (**/**)

931

+

932

+Index: markup.ml-0.7.2/src/utility.ml

933

+===================================================================

934

+--- markup.ml-0.7.2.orig/src/utility.ml

935

++++ markup.ml-0.7.2/src/utility.ml

936

+@@ -346,11 +346,11 @@ let xhtml_entity name =

937

+

938

+     match lookup 0 with

939

+     | `One c ->

940

+-      add_utf_8 buffer c;

941

++      add_utf_8 buffer (Uchar.of_int c);

942

+       Some (Buffer.contents buffer)

943

+     | `Two (c, c') ->

944

+-      add_utf_8 buffer c;

945

+-      add_utf_8 buffer c';

946

++      add_utf_8 buffer (Uchar.of_int c);

947

++      add_utf_8 buffer (Uchar.of_int c');

948

+       Some (Buffer.contents buffer)

949

+

950

+   with Exit -> None

951

+Index: markup.ml-0.7.2/src/xml_tokenizer.ml

952

+===================================================================

953

+--- markup.ml-0.7.2.orig/src/xml_tokenizer.ml

954

++++ markup.ml-0.7.2/src/xml_tokenizer.ml

955

+@@ -101,7 +101,7 @@ let tokenize report resolve_reference (i

956

+               end

957

+

958

+           | _, c when filter c ->

959

+-            add_utf_8 buffer c;

960

++            add_utf_8 buffer (Uchar.of_int c);

961

+             read ()

962

+

963

+           | l, c ->

964

+@@ -133,7 +133,7 @@ let tokenize report resolve_reference (i

965

+

966

+       | _, c when is_name_start_char c ->

967

+         let buffer = Buffer.create 32 in

968

+-        add_utf_8 buffer c;

969

++        add_utf_8 buffer (Uchar.of_int c);

970

+         let rec read () =

971

+           next input !throw unexpected_eoi begin function

972

+             | _, 0x003B ->

973

+@@ -146,7 +146,7 @@ let tokenize report resolve_reference (i

974

+               end

975

+

976

+             | _, c when is_name_char c ->

977

+-              add_utf_8 buffer c;

978

++              add_utf_8 buffer (Uchar.of_int c);

979

+               read ()

980

+

981

+             | l, c ->

982

+@@ -218,7 +218,7 @@ let tokenize report resolve_reference (i

983

+           report_if (not @@ is_name_start_char c) l (fun () ->

984

+             `Bad_token (char c, "attribute", "invalid start character"))

985

+             !throw (fun () ->

986

+-          add_utf_8 name_buffer c;

987

++          add_utf_8 name_buffer (Uchar.of_int c);

988

+           name_state ())

989

+       end

990

+

991

+@@ -235,7 +235,7 @@ let tokenize report resolve_reference (i

992

+           report_if (not @@ is_name_start_char c) l (fun () ->

993

+             `Bad_token (char c, "attribute", "invalid name character"))

994

+             !throw (fun () ->

995

+-          add_utf_8 name_buffer c;

996

++          add_utf_8 name_buffer (Uchar.of_int c);

997

+           name_state ())

998

+       end

999

+

1000

+@@ -275,14 +275,14 @@ let tokenize report resolve_reference (i

1001

+           report l

1002

+             (`Bad_token ("&", "attribute", "replace with '&amp;'"))

1003

+             !throw (fun () ->

1004

+-          add_utf_8 value_buffer 0x0026;

1005

++          add_utf_8 value_buffer (Uchar.of_int 0x0026);

1006

+           state ())

1007

+       end

1008

+

1009

+     and handle_lt l state =

1010

+       report l (`Bad_token ("<", "attribute", "replace with '&lt;'")) !throw

1011

+         (fun () ->

1012

+-      add_utf_8 value_buffer 0x003C;

1013

++      add_utf_8 value_buffer (Uchar.of_int 0x003C);

1014

+       state ())

1015

+

1016

+     and quoted_value_state quote =

1017

+@@ -300,7 +300,7 @@ let tokenize report resolve_reference (i

1018

+           quoted_value_state quote)

1019

+

1020

+         | _, c ->

1021

+-          add_utf_8 value_buffer c;

1022

++          add_utf_8 value_buffer (Uchar.of_int c);

1023

+           quoted_value_state quote

1024

+       end

1025

+

1026

+@@ -317,7 +317,7 @@ let tokenize report resolve_reference (i

1027

+           handle_lt l unquoted_value_state

1028

+

1029

+         | _, c ->

1030

+-          add_utf_8 value_buffer c;

1031

++          add_utf_8 value_buffer (Uchar.of_int c);

1032

+           unquoted_value_state ()

1033

+       end

1034

+

1035

+@@ -372,7 +372,7 @@ let tokenize report resolve_reference (i

1036

+           report_if (not @@ is_name_start_char c) l (fun () ->

1037

+             `Bad_token (char c, pi, "invalid start character")) !throw

1038

+             (fun () ->

1039

+-          add_utf_8 target_buffer c;

1040

++          add_utf_8 target_buffer (Uchar.of_int c);

1041

+           target_state ())

1042

+       end

1043

+

1044

+@@ -388,13 +388,13 @@ let tokenize report resolve_reference (i

1045

+           report_if (not @@ is_name_char c) l (fun () ->

1046

+             `Bad_token (char c, pi, "invalid name character")) !throw

1047

+             (fun () ->

1048

+-          add_utf_8 target_buffer c;

1049

++          add_utf_8 target_buffer (Uchar.of_int c);

1050

+           target_state ())

1051

+       end

1052

+

1053

+     and text_state () =

1054

+       next' pi finish_pi (fun (_, c) ->

1055

+-        add_utf_8 text_buffer c;

1056

++        add_utf_8 text_buffer (Uchar.of_int c);

1057

+         text_state ())

1058

+

1059

+     and xml_declaration_state () =

1060

+@@ -572,7 +572,7 @@ let tokenize report resolve_reference (i

1061

+   and initial_state () =

1062

+     next input !throw (fun () -> emit_eoi ()) begin function

1063

+       | l, (0x005D as c) ->

1064

+-        add_character l c;

1065

++        add_character l (Uchar.of_int c);

1066

+         one_bracket_state l

1067

+

1068

+       | l, 0x003C ->

1069

+@@ -583,7 +583,7 @@ let tokenize report resolve_reference (i

1070

+           | None ->

1071

+             report l (`Bad_token (char c, "text", "replace with '&amp;'"))

1072

+               !throw (fun () ->

1073

+-            add_character l c;

1074

++            add_character l (Uchar.of_int c);

1075

+             initial_state ())

1076

+

1077

+           | Some s ->

1078

+@@ -591,14 +591,14 @@ let tokenize report resolve_reference (i

1079

+             initial_state ())

1080

+

1081

+       | l, c ->

1082

+-        add_character l c;

1083

++        add_character l (Uchar.of_int c);

1084

+         initial_state ()

1085

+     end

1086

+

1087

+   and one_bracket_state l' =

1088

+     next_option input !throw begin function

1089

+       | Some (l, (0x005D as c)) ->

1090

+-        add_character l c;

1091

++        add_character l (Uchar.of_int c);

1092

+         two_brackets_state l' l

1093

+

1094

+       | v ->

1095

+@@ -611,11 +611,11 @@ let tokenize report resolve_reference (i

1096

+       | Some (l, (0x003E as c)) ->

1097

+         report l' (`Bad_token ("]]>", "text", "must end a CDATA section"))

1098

+           !throw (fun () ->

1099

+-        add_character l c;

1100

++        add_character l (Uchar.of_int c);

1101

+         initial_state ())

1102

+

1103

+       | Some (l, (0x005D as c)) ->

1104

+-        add_character l c;

1105

++        add_character l (Uchar.of_int c);

1106

+         two_brackets_state l'' l

1107

+

1108

+       | v ->

1109

+@@ -626,7 +626,7 @@ let tokenize report resolve_reference (i

1110

+   and begin_markup_state l' =

1111

+     let recover v =

1112

+       lt_in_text l' (fun () ->

1113

+-      add_character l' 0x003C;

1114

++      add_character l' (Uchar.of_int 0x003C);

1115

+       push_option input v;

1116

+       initial_state ())

1117

+     in

1118

+@@ -648,7 +648,7 @@ let tokenize report resolve_reference (i

1119

+

1120

+       | _, c when is_name_start_char c ->

1121

+         let tag_name_buffer = Buffer.create 32 in

1122

+-        add_utf_8 tag_name_buffer c;

1123

++        add_utf_8 tag_name_buffer (Uchar.of_int c);

1124

+         start_tag_state l' tag_name_buffer

1125

+

1126

+       | l, c as v ->

1127

+@@ -660,7 +660,7 @@ let tokenize report resolve_reference (i

1128

+   and start_tag_state l' buffer =

1129

+     let recover v =

1130

+       lt_in_text l' (fun () ->

1131

+-      add_character l' 0x003C;

1132

++      add_character l' (Uchar.of_int 0x003C);

1133

+       add_string l' (Buffer.contents buffer);

1134

+       push_option input v;

1135

+       initial_state ())

1136

+@@ -680,7 +680,7 @@ let tokenize report resolve_reference (i

1137

+         attributes_state l' (Buffer.contents buffer) []

1138

+

1139

+       | _, c when is_name_char c ->

1140

+-        add_utf_8 buffer c;

1141

++        add_utf_8 buffer (Uchar.of_int c);

1142

+         start_tag_state l' buffer

1143

+

1144

+       | l, c as v ->

1145

+@@ -731,8 +731,8 @@ let tokenize report resolve_reference (i

1146

+   and end_tag_state l' =

1147

+     let recover v =

1148

+       lt_in_text l' (fun () ->

1149

+-      add_character l' 0x003C;

1150

+-      add_character l' 0x002F;

1151

++      add_character l' (Uchar.of_int 0x003C);

1152

++      add_character l' (Uchar.of_int 0x002F);

1153

+       push_option input v;

1154

+       initial_state ())

1155

+     in

1156

+@@ -743,7 +743,7 @@ let tokenize report resolve_reference (i

1157

+     begin function

1158

+       | _, c when is_name_start_char c ->

1159

+         let name_buffer = Buffer.create 32 in

1160

+-        add_utf_8 name_buffer c;

1161

++        add_utf_8 name_buffer (Uchar.of_int c);

1162

+         end_tag_name_state l' name_buffer

1163

+

1164

+       | l, c as v ->

1165

+@@ -755,8 +755,8 @@ let tokenize report resolve_reference (i

1166

+   and end_tag_name_state l' buffer =

1167

+     let recover v =

1168

+       lt_in_text l' (fun () ->

1169

+-      add_character l' 0x003C;

1170

+-      add_character l' 0x002F;

1171

++      add_character l' (Uchar.of_int 0x003C);

1172

++      add_character l' (Uchar.of_int 0x002F);

1173

+       add_string l' (Buffer.contents buffer);

1174

+       push_option input v;

1175

+       initial_state ())

1176

+@@ -773,7 +773,7 @@ let tokenize report resolve_reference (i

1177

+         end_tag_whitespace_state false l' (Buffer.contents buffer)

1178

+

1179

+       | _, c when is_name_char c ->

1180

+-        add_utf_8 buffer c;

1181

++        add_utf_8 buffer (Uchar.of_int c);

1182

+         end_tag_name_state l' buffer

1183

+

1184

+       | l, c as v ->

1185

+@@ -821,8 +821,8 @@ let tokenize report resolve_reference (i

1186

+

1187

+       | v ->

1188

+         bad_comment_start "<!" l' (fun () ->

1189

+-        add_character l' 0x003C;

1190

+-        add_character l' 0x0021;

1191

++        add_character l' (Uchar.of_int 0x003C);

1192

++        add_character l' (Uchar.of_int 0x0021);

1193

+         push_option input v;

1194

+         initial_state ())

1195

+     end

1196

+@@ -834,9 +834,9 @@ let tokenize report resolve_reference (i

1197

+

1198

+       | v ->

1199

+         bad_comment_start "<!-" l' (fun () ->

1200

+-        add_character l' 0x003C;

1201

+-        add_character l' 0x0021;

1202

+-        add_character l' 0x002D;

1203

++        add_character l' (Uchar.of_int 0x003C);

1204

++        add_character l' (Uchar.of_int 0x0021);

1205

++        add_character l' (Uchar.of_int 0x002D);

1206

+         push_option input v;

1207

+         initial_state ())

1208

+     end

1209

+@@ -852,7 +852,7 @@ let tokenize report resolve_reference (i

1210

+         comment_one_dash_state l' l buffer

1211

+

1212

+       | _, c ->

1213

+-        add_utf_8 buffer c;

1214

++        add_utf_8 buffer (Uchar.of_int c);

1215

+         comment_state l' buffer

1216

+     end

1217

+

1218

+@@ -863,8 +863,8 @@ let tokenize report resolve_reference (i

1219

+         comment_two_dashes_state false l' l'' buffer

1220

+

1221

+       | _, c ->

1222

+-        add_utf_8 buffer 0x002D;

1223

+-        add_utf_8 buffer c;

1224

++        add_utf_8 buffer (Uchar.of_int 0x002D);

1225

++        add_utf_8 buffer (Uchar.of_int c);

1226

+         comment_state l' buffer

1227

+     end

1228

+

1229

+@@ -883,14 +883,14 @@ let tokenize report resolve_reference (i

1230

+

1231

+       | _, 0x002D ->

1232

+         recover (fun () ->

1233

+-        add_utf_8 buffer 0x002D;

1234

++        add_utf_8 buffer (Uchar.of_int 0x002D);

1235

+         comment_two_dashes_state true l' l'' buffer)

1236

+

1237

+       | _, c ->

1238

+         recover (fun () ->

1239

+-        add_utf_8 buffer 0x002D;

1240

+-        add_utf_8 buffer 0x002D;

1241

+-        add_utf_8 buffer c;

1242

++        add_utf_8 buffer (Uchar.of_int 0x002D);

1243

++        add_utf_8 buffer (Uchar.of_int 0x002D);

1244

++        add_utf_8 buffer (Uchar.of_int c);

1245

+         comment_state l' buffer)

1246

+     end

1247

+

1248

+@@ -905,9 +905,9 @@ let tokenize report resolve_reference (i

1249

+           !throw (fun () ->

1250

+         lt_in_text l' (fun () ->

1251

+         push_list input cs;

1252

+-        add_character l' 0x003C;

1253

+-        add_character l' 0x0021;

1254

+-        add_character l' 0x005B;

1255

++        add_character l' (Uchar.of_int 0x003C);

1256

++        add_character l' (Uchar.of_int 0x0021);

1257

++        add_character l' (Uchar.of_int 0x005B);

1258

+         initial_state ()))

1259

+     end

1260

+

1261

+@@ -918,7 +918,7 @@ let tokenize report resolve_reference (i

1262

+         cdata_one_bracket_state l' l

1263

+

1264

+       | l, c ->

1265

+-        add_character l c;

1266

++        add_character l (Uchar.of_int c);

1267

+         cdata_state l'

1268

+     end

1269

+

1270

+@@ -929,8 +929,8 @@ let tokenize report resolve_reference (i

1271

+         cdata_two_brackets_state l' l'' l

1272

+

1273

+       | l, c ->

1274

+-        add_character l'' 0x005D;

1275

+-        add_character l c;

1276

++        add_character l'' (Uchar.of_int 0x005D);

1277

++        add_character l   (Uchar.of_int c);

1278

+         cdata_state l'

1279

+     end

1280

+

1281

+@@ -941,13 +941,13 @@ let tokenize report resolve_reference (i

1282

+         initial_state ()

1283

+

1284

+       | l, 0x005D ->

1285

+-        add_character l'' 0x005D;

1286

++        add_character l'' (Uchar.of_int 0x005D);

1287

+         cdata_two_brackets_state l' l''' l

1288

+

1289

+       | l, c ->

1290

+-        add_character l'' 0x005D;

1291

+-        add_character l''' 0x005D;

1292

+-        add_character l c;

1293

++        add_character l'' (Uchar.of_int 0x005D);

1294

++        add_character l''' (Uchar.of_int 0x005D);

1295

++        add_character l (Uchar.of_int c);

1296

+         cdata_state l'

1297

+     end

1298

+

1299

+@@ -963,9 +963,9 @@ let tokenize report resolve_reference (i

1300

+           !throw (fun () ->

1301

+         lt_in_text l' (fun () ->

1302

+         push_list input cs;

1303

+-        add_character l' 0x003C;

1304

+-        add_character l' 0x0021;

1305

+-        add_character l' 0x0044;

1306

++        add_character l' (Uchar.of_int 0x003C);

1307

++        add_character l' (Uchar.of_int 0x0021);

1308

++        add_character l' (Uchar.of_int 0x0044);

1309

+         initial_state ()))

1310

+     end

1311

+

1312

+@@ -980,15 +980,15 @@ let tokenize report resolve_reference (i

1313

+         emit_doctype l' buffer initial_state

1314

+

1315

+       | _, (0x0022 | 0x0027 as c) ->

1316

+-        add_utf_8 buffer c;

1317

++        add_utf_8 buffer (Uchar.of_int c);

1318

+         doctype_quoted_state (fun () -> doctype_state l' buffer) c l' buffer

1319

+

1320

+       | _, (0x003C as c) ->

1321

+-        add_utf_8 buffer c;

1322

++        add_utf_8 buffer (Uchar.of_int c);

1323

+         doctype_item_state (fun () -> doctype_state l' buffer) l' buffer

1324

+

1325

+       | _, c ->

1326

+-        add_utf_8 buffer c;

1327

++        add_utf_8 buffer (Uchar.of_int c);

1328

+         doctype_state l' buffer

1329

+     end

1330

+

1331

+@@ -996,11 +996,11 @@ let tokenize report resolve_reference (i

1332

+     next input !throw (fun () -> unterminated_doctype l' buffer)

1333

+     begin function

1334

+       | _, c when c = quote ->

1335

+-        add_utf_8 buffer c;

1336

++        add_utf_8 buffer (Uchar.of_int c);

1337

+         state ()

1338

+

1339

+       | _, c ->

1340

+-        add_utf_8 buffer c;

1341

++        add_utf_8 buffer (Uchar.of_int c);

1342

+         doctype_quoted_state state quote l' buffer

1343

+     end

1344

+

1345

+@@ -1008,18 +1008,18 @@ let tokenize report resolve_reference (i

1346

+     next input !throw (fun () -> unterminated_doctype l' buffer)

1347

+     begin function

1348

+       | _, (0x0021 as c) ->

1349

+-        add_utf_8 buffer c;

1350

++        add_utf_8 buffer (Uchar.of_int c);

1351

+         doctype_declaration_state state l' buffer

1352

+

1353

+       | l, (0x003F as c) ->

1354

+-        add_utf_8 buffer c;

1355

+-        let undo = tap (fun (_, c) -> add_utf_8 buffer c) input in

1356

++        add_utf_8 buffer (Uchar.of_int c);

1357

++        let undo = tap (fun (_, c) -> add_utf_8 buffer (Uchar.of_int c)) input in

1358

+         parse_declaration_or_processing_instruction l (fun _ ->

1359

+         undo ();

1360

+         state ())

1361

+

1362

+       | _, c ->

1363

+-        add_utf_8 buffer c;

1364

++        add_utf_8 buffer (Uchar.of_int c);

1365

+         state ()

1366

+     end

1367

+

1368

+@@ -1027,16 +1027,16 @@ let tokenize report resolve_reference (i

1369

+     next input !throw (fun () -> unterminated_doctype l' buffer)

1370

+     begin function

1371

+       | _, (0x003E as c) ->

1372

+-        add_utf_8 buffer c;

1373

++        add_utf_8 buffer (Uchar.of_int c);

1374

+         state ()

1375

+

1376

+       | _, (0x0022 | 0x0027 as c) ->

1377

+-        add_utf_8 buffer c;

1378

++        add_utf_8 buffer (Uchar.of_int c);

1379

+         doctype_quoted_state

1380

+           (fun () -> doctype_declaration_state state l' buffer) c l' buffer

1381

+

1382

+       | _, c ->

1383

+-        add_utf_8 buffer c;

1384

++        add_utf_8 buffer (Uchar.of_int c);

1385

+         doctype_declaration_state state l' buffer

1386

+     end

1387

+

1388

1389

diff --git a/dev-ml/markup/markup-0.7.2.ebuild b/dev-ml/markup/markup-0.7.2-r1.ebuild

1390

similarity index 82%

1391

rename from dev-ml/markup/markup-0.7.2.ebuild

1392

rename to dev-ml/markup/markup-0.7.2-r1.ebuild

1393

index 235c575..f70ac55 100644

1394

--- a/dev-ml/markup/markup-0.7.2.ebuild

1395

+++ b/dev-ml/markup/markup-0.7.2-r1.ebuild

1396

@@ -4,21 +4,21 @@

1397

1398

 EAPI=5

1399

1400

-inherit findlib

1401

+inherit findlib eutils

1402

1403

 DESCRIPTION="Error-recovering streaming HTML5 and XML parsers"

1404

 HOMEPAGE="https://github.com/aantron/markup.ml"

1405

 SRC_URI="https://github.com/aantron/markup.ml/archive/${PV}.tar.gz -> ${P}.tar.gz"

1406

1407

 LICENSE="BSD"

1408

-SLOT="0/${PV}"

1409

+SLOT="0/${PV}p1"

1410

 KEYWORDS="~amd64"

1411

 IUSE="doc test"

1412

1413

 DEPEND="

1414

 	dev-lang/ocaml:=[ocamlopt]

1415

 	dev-ml/lwt:=[ocamlopt]

1416

-	dev-ml/uutf:=[ocamlopt]

1417

+	>=dev-ml/uutf-1.0:=[ocamlopt]

1418

"

1419

 RDEPEND="${DEPEND}"

1420

 DEPEND="${DEPEND}

1421

@@ -26,6 +26,11 @@ DEPEND="${DEPEND}

1422

 	dev-ml/ocamlbuild"

1423

 S="${WORKDIR}/${PN}.ml-${PV}"

1424

1425

+src_prepare() {

1426

+	epatch "${FILESDIR}/uutf.patch" \

1427

+		"${FILESDIR}/test.patch"

1428

+}

1429

+

1430

 src_compile() {

1431

 	emake

1432

 	use doc && emake docs

Gentoo Archives: gentoo-commits