Gentoo Archives: gentoo-commits

From: "Mike Frysinger (vapier)" <vapier@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] gentoo commit in src/patchsets/coreutils/8.2: 000_all_coreutils-i18n.patch
Date: Tue, 05 Jan 2010 05:00:06
Message-Id: E1NS1WO-0005zG-Hs@stork.gentoo.org
1 vapier 10/01/05 04:59:56
2
3 Added: 000_all_coreutils-i18n.patch
4 Log:
5 grab fedora unicode patch
6
7 Revision Changes Path
8 1.1 src/patchsets/coreutils/8.2/000_all_coreutils-i18n.patch
9
10 file : http://sources.gentoo.org/viewcvs.py/gentoo/src/patchsets/coreutils/8.2/000_all_coreutils-i18n.patch?rev=1.1&view=markup
11 plain: http://sources.gentoo.org/viewcvs.py/gentoo/src/patchsets/coreutils/8.2/000_all_coreutils-i18n.patch?rev=1.1&content-type=text/plain
12
13 Index: 000_all_coreutils-i18n.patch
14 ===================================================================
15 ripped from Fedora
16
17 diff -urNp coreutils-8.0-orig/lib/linebuffer.h coreutils-8.0/lib/linebuffer.h
18 --- coreutils-8.0-orig/lib/linebuffer.h 2009-10-06 10:59:48.000000000 +0200
19 +++ coreutils-8.0/lib/linebuffer.h 2009-10-07 10:07:16.000000000 +0200
20 @@ -21,6 +21,11 @@
21
22 # include <stdio.h>
23
24 +/* Get mbstate_t. */
25 +# if HAVE_WCHAR_H
26 +# include <wchar.h>
27 +# endif
28 +
29 /* A `struct linebuffer' holds a line of text. */
30
31 struct linebuffer
32 @@ -28,6 +33,9 @@ struct linebuffer
33 size_t size; /* Allocated. */
34 size_t length; /* Used. */
35 char *buffer;
36 +# if HAVE_WCHAR_H
37 + mbstate_t state;
38 +# endif
39 };
40
41 /* Initialize linebuffer LINEBUFFER for use. */
42 diff -urNp coreutils-8.0-orig/src/cut.c coreutils-8.0/src/cut.c
43 --- coreutils-8.0-orig/src/cut.c 2009-09-23 10:25:44.000000000 +0200
44 +++ coreutils-8.0/src/cut.c 2009-10-07 10:07:16.000000000 +0200
45 @@ -28,6 +28,11 @@
46 #include <assert.h>
47 #include <getopt.h>
48 #include <sys/types.h>
49 +
50 +/* Get mbstate_t, mbrtowc(). */
51 +#if HAVE_WCHAR_H
52 +# include <wchar.h>
53 +#endif
54 #include "system.h"
55
56 #include "error.h"
57 @@ -36,6 +41,18 @@
58 #include "quote.h"
59 #include "xstrndup.h"
60
61 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
62 + installation; work around this configuration error. */
63 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
64 +# undef MB_LEN_MAX
65 +# define MB_LEN_MAX 16
66 +#endif
67 +
68 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
69 +#if HAVE_MBRTOWC && defined mbstate_t
70 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
71 +#endif
72 +
73 /* The official name of this program (e.g., no `g' prefix). */
74 #define PROGRAM_NAME "cut"
75
76 @@ -71,6 +88,52 @@
77 } \
78 while (0)
79
80 +/* Refill the buffer BUF to get a multibyte character. */
81 +#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
82 + do \
83 + { \
84 + if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
85 + { \
86 + memmove (BUF, BUFPOS, BUFLEN); \
87 + BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
88 + BUFPOS = BUF; \
89 + } \
90 + } \
91 + while (0)
92 +
93 +/* Get the wide character at BUFPOS. BUFPOS is not advanced by this macro.
94 + If the byte sequence is not a valid character, CONVFAIL is 1; otherwise 0. */
95 +#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
96 + do \
97 + { \
98 + mbstate_t state_bak; \
99 + \
100 + if (BUFLEN < 1) \
101 + { \
102 + WC = WEOF; \
103 + break; \
104 + } \
105 + \
106 + /* Get a wide character. */ \
107 + CONVFAIL = 0; \
108 + state_bak = STATE; \
109 + MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
110 + \
111 + switch (MBLENGTH) \
112 + { \
113 + case (size_t)-1: \
114 + case (size_t)-2: \
115 + CONVFAIL++; \
116 + STATE = state_bak; \
117 + /* Fall through. */ \
118 + \
119 + case 0: \
120 + MBLENGTH = 1; \
121 + break; \
122 + } \
123 + } \
124 + while (0)
125 +
126 struct range_pair
127 {
128 size_t lo;
129 @@ -89,7 +152,7 @@ static char *field_1_buffer;
130 /* The number of bytes allocated for FIELD_1_BUFFER. */
131 static size_t field_1_bufsize;
132
133 -/* The largest field or byte index used as an endpoint of a closed
134 +/* The largest byte, character or field index used as an endpoint of a closed
135 or degenerate range specification; this doesn't include the starting
136 index of right-open-ended ranges. For example, with either range spec
137 `2-5,9-', `2-3,5,9-' this variable would be set to 5. */
138 @@ -101,10 +164,11 @@ static size_t eol_range_start;
139
140 /* This is a bit vector.
141 In byte mode, which bytes to output.
142 + In character mode, which characters to output.
143 In field mode, which DELIM-separated fields to output.
144 - Both bytes and fields are numbered starting with 1,
145 + Bytes, characters and fields are numbered starting with 1,
146 so the zeroth bit of this array is unused.
147 - A field or byte K has been selected if
148 + A byte, character or field K has been selected if
149 (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
150 || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
151 static unsigned char *printable_field;
152 @@ -113,15 +177,25 @@ enum operating_mode
153 {
154 undefined_mode,
155
156 - /* Output characters that are in the given bytes. */
157 + /* Output bytes that are at the given positions. */
158 byte_mode,
159
160 + /* Output characters that are at the given positions. */
161 + character_mode,
162 +
163 /* Output the given delimeter-separated fields. */
164 field_mode
165 };
166
167 static enum operating_mode operating_mode;
168
169 +/* If nonzero, when in byte mode, don't split multibyte characters. */
170 +static int byte_mode_character_aware;
171 +
172 +/* If nonzero, use the single-byte functions even when
173 + this program runs in a multibyte locale. */
174 +static int force_singlebyte_mode;
175 +
176 /* If true do not output lines containing no delimeter characters.
177 Otherwise, all such lines are printed. This option is valid only
178 with field mode. */
179 @@ -133,6 +207,9 @@ static bool complement;
180
181 /* The delimeter character for field mode. */
182 static unsigned char delim;
183 +#if HAVE_WCHAR_H
184 +static wchar_t wcdelim;
185 +#endif
186
187 /* True if the --output-delimiter=STRING option was specified. */
188 static bool output_delimiter_specified;
189 @@ -206,7 +283,7 @@ Mandatory arguments to long options are
190 -f, --fields=LIST select only these fields; also print any line\n\
191 that contains no delimiter character, unless\n\
192 the -s option is specified\n\
193 - -n (ignored)\n\
194 + -n with -b: don't split multibyte characters\n\
195 "), stdout);
196 fputs (_("\
197 --complement complement the set of selected bytes, characters\n\
198 @@ -365,7 +442,7 @@ set_fields (const char *fieldstr)
199 in_digits = false;
200 /* Starting a range. */
201 if (dash_found)
202 - FATAL_ERROR (_("invalid byte or field list"));
203 + FATAL_ERROR (_("invalid byte, character or field list"));
204 dash_found = true;
205 fieldstr++;
206
207 @@ -389,14 +466,16 @@ set_fields (const char *fieldstr)
208 if (!rhs_specified)
209 {
210 /* `n-'. From `initial' to end of line. */
211 - eol_range_start = initial;
212 + if (eol_range_start == 0 ||
213 + (eol_range_start != 0 && eol_range_start > initial))
214 + eol_range_start = initial;
215 field_found = true;
216 }
217 else
218 {
219 /* `m-n' or `-n' (1-n). */
220 if (value < initial)
221 - FATAL_ERROR (_("invalid decreasing range"));
222 + FATAL_ERROR (_("invalid byte, character or field list"));
223
224 /* Is there already a range going to end of line? */
225 if (eol_range_start != 0)
226 @@ -476,6 +555,9 @@ set_fields (const char *fieldstr)
227 if (operating_mode == byte_mode)
228 error (0, 0,
229 _("byte offset %s is too large"), quote (bad_num));
230 + else if (operating_mode == character_mode)
231 + error (0, 0,
232 + _("character offset %s is too large"), quote (bad_num));
233 else
234 error (0, 0,
235 _("field number %s is too large"), quote (bad_num));
236 @@ -486,7 +568,7 @@ set_fields (const char *fieldstr)
237 fieldstr++;
238 }
239 else
240 - FATAL_ERROR (_("invalid byte or field list"));
241 + FATAL_ERROR (_("invalid byte, character or field list"));
242 }
243
244 max_range_endpoint = 0;
245 @@ -579,6 +661,63 @@ cut_bytes (FILE *stream)
246 }
247 }
248
249 +#if HAVE_MBRTOWC
250 +/* This function is used in the following cases.
251 +
252 + 1. Read from the stream STREAM, printing to standard output any selected
253 + characters.
254 +
255 + 2. Read from stream STREAM, printing to standard output any selected bytes,
256 + without splitting multibyte characters. */
257 +
258 +static void
259 +cut_characters_or_cut_bytes_no_split (FILE *stream)
260 +{
261 + int idx; /* number of bytes or characters in the line so far. */
262 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
263 + char *bufpos; /* Next read position of BUF. */
264 + size_t buflen; /* The length of the byte sequence in buf. */
265 + wint_t wc; /* A gotten wide character. */
266 + size_t mblength; /* The byte size of a multibyte character which shows
267 + as same character as WC. */
268 + mbstate_t state; /* State of the stream. */
269 + int convfail; /* 1, when conversion is failed. Otherwise 0. */
270 +
271 + idx = 0;
272 + buflen = 0;
273 + bufpos = buf;
274 + memset (&state, '\0', sizeof(mbstate_t));
275 +
276 + while (1)
277 + {
278 + REFILL_BUFFER (buf, bufpos, buflen, stream);
279 +
280 + GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
281 +
282 + if (wc == WEOF)
283 + {
284 + if (idx > 0)
285 + putchar ('\n');
286 + break;
287 + }
288 + else if (wc == L'\n')
289 + {
290 + putchar ('\n');
291 + idx = 0;
292 + }
293 + else
294 + {
295 + idx += (operating_mode == byte_mode) ? mblength : 1;
296 + if (print_kth (idx, NULL))
297 + fwrite (bufpos, mblength, sizeof(char), stdout);
298 + }
299 +
300 + buflen -= mblength;
301 + bufpos += mblength;
302 + }
303 +}
304 +#endif
305 +
306 /* Read from stream STREAM, printing to standard output any selected fields. */
307
308 static void
309 @@ -701,13 +840,192 @@ cut_fields (FILE *stream)
310 }
311 }
312
313 +#if HAVE_MBRTOWC
314 +static void
315 +cut_fields_mb (FILE *stream)
316 +{
317 + int c;
318 + unsigned int field_idx;
319 + int found_any_selected_field;
320 + int buffer_first_field;
321 + int empty_input;
322 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
323 + char *bufpos; /* Next read position of BUF. */
324 + size_t buflen; /* The length of the byte sequence in buf. */
325 + wint_t wc = 0; /* A gotten wide character. */
326 + size_t mblength; /* The byte size of a multibyte character which shows
327 + as same character as WC. */
328 + mbstate_t state; /* State of the stream. */
329 + int convfail; /* 1, when conversion is failed. Otherwise 0. */
330 +
331 + found_any_selected_field = 0;
332 + field_idx = 1;
333 + bufpos = buf;
334 + buflen = 0;
335 + memset (&state, '\0', sizeof(mbstate_t));
336 +
337 + c = getc (stream);
338 + empty_input = (c == EOF);
339 + if (c != EOF)
340 + ungetc (c, stream);
341 + else
342 + wc = WEOF;
343 +
344 + /* To support the semantics of the -s flag, we may have to buffer
345 + all of the first field to determine whether it is `delimited.'
346 + But that is unnecessary if all non-delimited lines must be printed
347 + and the first field has been selected, or if non-delimited lines
348 + must be suppressed and the first field has *not* been selected.
349 + That is because a non-delimited line has exactly one field. */
350 + buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL));
351 +
352 + while (1)
353 + {
354 + if (field_idx == 1 && buffer_first_field)
355 + {
356 + int len = 0;
357 +
358 + while (1)
359 + {
360 + REFILL_BUFFER (buf, bufpos, buflen, stream);
361 +
362 + GET_NEXT_WC_FROM_BUFFER
363 + (wc, bufpos, buflen, mblength, state, convfail);
364 +
365 + if (wc == WEOF)
366 + break;
367 +
368 + field_1_buffer = xrealloc (field_1_buffer, len + mblength);
369 + memcpy (field_1_buffer + len, bufpos, mblength);
370 + len += mblength;
371 + buflen -= mblength;
372 + bufpos += mblength;
373 +
374 + if (!convfail && (wc == L'\n' || wc == wcdelim))
375 + break;
376 + }
377 +
378 + if (wc == WEOF)
379 + break;
380 +
381 + /* If the first field extends to the end of line (it is not
382 + delimited) and we are printing all non-delimited lines,
383 + print this one. */
384 + if (convfail || (!convfail && wc != wcdelim))
385 + {
386 + if (suppress_non_delimited)
387 + {
388 + /* Empty. */
389 + }
390 + else
391 + {
392 + fwrite (field_1_buffer, sizeof (char), len, stdout);
393 + /* Make sure the output line is newline terminated. */
394 + if (convfail || (!convfail && wc != L'\n'))
395 + putchar ('\n');
396 + }
397 + continue;
398 + }
399 +
400 + if (print_kth (1, NULL))
401 + {
402 + /* Print the field, but not the trailing delimiter. */
403 + fwrite (field_1_buffer, sizeof (char), len - 1, stdout);
404 + found_any_selected_field = 1;
405 + }
406 + ++field_idx;
407 + }
408 +
409 + if (wc != WEOF)
410 + {
411 + if (print_kth (field_idx, NULL))
412 + {
413 + if (found_any_selected_field)
414 + {
415 + fwrite (output_delimiter_string, sizeof (char),
416 + output_delimiter_length, stdout);
417 + }
418 + found_any_selected_field = 1;
419 + }
420 +
421 + while (1)
422 + {
423 + REFILL_BUFFER (buf, bufpos, buflen, stream);
424 +
425 + GET_NEXT_WC_FROM_BUFFER
426 + (wc, bufpos, buflen, mblength, state, convfail);
427 +
428 + if (wc == WEOF)
429 + break;
430 + else if (!convfail && (wc == wcdelim || wc == L'\n'))
431 + {
432 + buflen -= mblength;
433 + bufpos += mblength;
434 + break;
435 + }
436 +
437 + if (print_kth (field_idx, NULL))
438 + fwrite (bufpos, mblength, sizeof(char), stdout);
439 +
440 + buflen -= mblength;
441 + bufpos += mblength;
442 + }
443 + }
444 +
445 + if ((!convfail || wc == L'\n') && buflen < 1)
446 + wc = WEOF;
447 +
448 + if (!convfail && wc == wcdelim)
449 + ++field_idx;
450 + else if (wc == WEOF || (!convfail && wc == L'\n'))
451 + {
452 + if (found_any_selected_field
453 + || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
454 + putchar ('\n');
455 + if (wc == WEOF)
456 + break;
457 + field_idx = 1;
458 + found_any_selected_field = 0;
459 + }
460 + }
461 +}
462 +#endif
463 +
464 static void
465 cut_stream (FILE *stream)
466 {
467 - if (operating_mode == byte_mode)
468 - cut_bytes (stream);
469 +#if HAVE_MBRTOWC
470 + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
471 + {
472 + switch (operating_mode)
473 + {
474 + case byte_mode:
475 + if (byte_mode_character_aware)
476 + cut_characters_or_cut_bytes_no_split (stream);
477 + else
478 + cut_bytes (stream);
479 + break;
480 +
481 + case character_mode:
482 + cut_characters_or_cut_bytes_no_split (stream);
483 + break;
484 +
485 + case field_mode:
486 + cut_fields_mb (stream);
487 + break;
488 +
489 + default:
490 + abort ();
491 + }
492 + }
493 else
494 - cut_fields (stream);
495 +#endif
496 + {
497 + if (operating_mode == field_mode)
498 + cut_fields (stream);
499 + else
500 + cut_bytes (stream);
501 + }
502 }
503
504 /* Process file FILE to standard output.
505 @@ -757,6 +1075,8 @@ main (int argc, char **argv)
506 bool ok;
507 bool delim_specified = false;
508 char *spec_list_string IF_LINT(= NULL);
509 + char mbdelim[MB_LEN_MAX + 1];
510 + size_t delimlen = 0;
511
512 initialize_main (&argc, &argv);
513 set_program_name (argv[0]);
514 @@ -779,7 +1099,6 @@ main (int argc, char **argv)
515 switch (optc)
516 {
517 case 'b':
518 - case 'c':
519 /* Build the byte list. */
520 if (operating_mode != undefined_mode)
521 FATAL_ERROR (_("only one type of list may be specified"));
522 @@ -787,6 +1106,14 @@ main (int argc, char **argv)
523 spec_list_string = optarg;
524 break;
525
526 + case 'c':
527 + /* Build the character list. */
528 + if (operating_mode != undefined_mode)
529 + FATAL_ERROR (_("only one type of list may be specified"));
530 + operating_mode = character_mode;
531 + spec_list_string = optarg;
532 + break;
533 +
534 case 'f':
535 /* Build the field list. */
536 if (operating_mode != undefined_mode)
537 @@ -798,10 +1125,35 @@ main (int argc, char **argv)
538 case 'd':
539 /* New delimiter. */
540 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
541 - if (optarg[0] != '\0' && optarg[1] != '\0')
542 - FATAL_ERROR (_("the delimiter must be a single character"));
543 - delim = optarg[0];
544 - delim_specified = true;
545 + {
546 +#if HAVE_MBRTOWC
547 + if(MB_CUR_MAX > 1)
548 + {
549 + mbstate_t state;
550 +
551 + memset (&state, '\0', sizeof(mbstate_t));
552 + delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
553 +
554 + if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
555 + ++force_singlebyte_mode;
556 + else
557 + {
558 + delimlen = (delimlen < 1) ? 1 : delimlen;
559 + if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
560 + FATAL_ERROR (_("the delimiter must be a single character"));
561 + memcpy (mbdelim, optarg, delimlen);
562 + }
563 + }
564 +
565 + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
566 +#endif
567 + {
568 + if (optarg[0] != '\0' && optarg[1] != '\0')
569 + FATAL_ERROR (_("the delimiter must be a single character"));
570 + delim = (unsigned char) optarg[0];
571 + }
572 + delim_specified = true;
573 + }
574 break;
575
576 case OUTPUT_DELIMITER_OPTION:
577 @@ -814,6 +1166,7 @@ main (int argc, char **argv)
578 break;
579
580 case 'n':
581 + byte_mode_character_aware = 1;
582 break;
583
584 case 's':
585 @@ -836,7 +1189,7 @@ main (int argc, char **argv)
586 if (operating_mode == undefined_mode)
587 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
588
589 - if (delim != '\0' && operating_mode != field_mode)
590 + if (delim_specified && operating_mode != field_mode)
591 FATAL_ERROR (_("an input delimiter may be specified only\
592 when operating on fields"));
593
594 @@ -863,15 +1216,34 @@ main (int argc, char **argv)
595 }
596
597 if (!delim_specified)
598 - delim = '\t';
599 + {
600 + delim = '\t';
601 +#ifdef HAVE_MBRTOWC
602 + wcdelim = L'\t';
603 + mbdelim[0] = '\t';
604 + mbdelim[1] = '\0';
605 + delimlen = 1;
606 +#endif
607 + }
608
609 if (output_delimiter_string == NULL)
610 {
611 - static char dummy[2];
612 - dummy[0] = delim;
613 - dummy[1] = '\0';
614 - output_delimiter_string = dummy;
615 - output_delimiter_length = 1;
616 +#ifdef HAVE_MBRTOWC
617 + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
618 + {
619 + output_delimiter_string = xstrdup(mbdelim);
620 + output_delimiter_length = delimlen;
621 + }
622 +
623 + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
624 +#endif
625 + {
626 + static char dummy[2];
627 + dummy[0] = delim;
628 + dummy[1] = '\0';
629 + output_delimiter_string = dummy;
630 + output_delimiter_length = 1;
631 + }
632 }
633
634 if (optind == argc)
635 diff -urNp coreutils-8.0-orig/src/expand.c coreutils-8.0/src/expand.c
636 --- coreutils-8.0-orig/src/expand.c 2009-09-29 15:27:54.000000000 +0200
637 +++ coreutils-8.0/src/expand.c 2009-10-07 10:07:16.000000000 +0200
638 @@ -37,11 +37,28 @@
639 #include <stdio.h>
640 #include <getopt.h>
641 #include <sys/types.h>
642 +
643 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
644 +#if HAVE_WCHAR_H
645 +# include <wchar.h>
646 +#endif
647 +
648 #include "system.h"
649 #include "error.h"
650 #include "quote.h"
651 #include "xstrndup.h"
652
653 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
654 + installation; work around this configuration error. */
655 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
656 +# define MB_LEN_MAX 16
657 +#endif
658 +
659 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
660 +#if HAVE_MBRTOWC && defined mbstate_t
661 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
662 +#endif
663 +
664 /* The official name of this program (e.g., no `g' prefix). */
665 #define PROGRAM_NAME "expand"
666
667 @@ -357,6 +374,142 @@ expand (void)
668 }
669 }
670
671 +#if HAVE_MBRTOWC
672 +static void
673 +expand_multibyte (void)
674 +{
675 + FILE *fp; /* Input stream. */
676 + mbstate_t i_state; /* Current shift state of the input stream. */
677 + mbstate_t i_state_bak; /* Back up the I_STATE. */
678 + mbstate_t o_state; /* Current shift state of the output stream. */
679 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
680 + char *bufpos; /* Next read position of BUF. */
681 + size_t buflen = 0; /* The length of the byte sequence in buf. */
682 + wchar_t wc; /* A gotten wide character. */
683 + size_t mblength; /* The byte size of a multibyte character
684 + which shows as same character as WC. */
685 + int tab_index = 0; /* Index in `tab_list' of next tabstop. */
686 + int column = 0; /* Column on screen of the next char. */
687 + int next_tab_column; /* Column the next tab stop is on. */
688 + int convert = 1; /* If nonzero, perform translations. */
689 +
690 + fp = next_file ((FILE *) NULL);
691 + if (fp == NULL)
692 + return;
693 +
694 + memset (&o_state, '\0', sizeof(mbstate_t));
695 + memset (&i_state, '\0', sizeof(mbstate_t));
696 +
697 + for (;;)
698 + {
699 + /* Refill the buffer BUF. */
700 + if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
701 + {
702 + memmove (buf, bufpos, buflen);
703 + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
704 + bufpos = buf;
705 + }
706 +
707 + /* No character is left in BUF. */
708 + if (buflen < 1)
709 + {
710 + fp = next_file (fp);
711 +
712 + if (fp == NULL)
713 + break; /* No more files. */
714 + else
715 + {
716 + memset (&i_state, '\0', sizeof(mbstate_t));
717 + continue;
718 + }
719 + }
720 +
721 + /* Get a wide character. */
722 + i_state_bak = i_state;
723 + mblength = mbrtowc (&wc, bufpos, buflen, &i_state);
724 +
725 + switch (mblength)
726 + {
727 + case (size_t)-1: /* illegal byte sequence. */
728 + case (size_t)-2:
729 + mblength = 1;
730 + i_state = i_state_bak;
731 + if (convert)
732 + {
733 + ++column;
734 + if (convert_entire_line == 0)
735 + convert = 0;
736 + }
737 + putchar (*bufpos);
738 + break;
739 +
740 + case 0: /* null. */
741 + mblength = 1;
742 + if (convert && convert_entire_line == 0)
743 + convert = 0;
744 + putchar ('\0');
745 + break;
746 +
747 + default:
748 + if (wc == L'\n') /* LF. */
749 + {
750 + tab_index = 0;
751 + column = 0;
752 + convert = 1;
753 + putchar ('\n');
754 + }
755 + else if (wc == L'\t' && convert) /* Tab. */
756 + {
757 + if (tab_size == 0)
758 + {
759 + /* Do not let tab_index == first_free_tab;
760 + stop when it is 1 less. */
761 + while (tab_index < first_free_tab - 1
762 + && column >= tab_list[tab_index])
763 + tab_index++;
764 + next_tab_column = tab_list[tab_index];
765 + if (tab_index < first_free_tab - 1)
766 + tab_index++;
767 + if (column >= next_tab_column)
768 + next_tab_column = column + 1;
769 + }
770 + else
771 + next_tab_column = column + tab_size - column % tab_size;
772 +
773 + while (column < next_tab_column)
774 + {
775 + putchar (' ');
776 + ++column;
777 + }
778 + }
779 + else /* Others. */
780 + {
781 + if (convert)
782 + {
783 + if (wc == L'\b')
784 + {
785 + if (column > 0)
786 + --column;
787 + }
788 + else
789 + {
790 + int width; /* The width of WC. */
791 +
792 + width = wcwidth (wc);
793 + column += (width > 0) ? width : 0;
794 + if (convert_entire_line == 0)
795 + convert = 0;
796 + }
797 + }
798 + fwrite (bufpos, sizeof(char), mblength, stdout);
799 + }
800 + }
801 + buflen -= mblength;
802 + bufpos += mblength;
803 + }
804 +}
805 +#endif
806 +
807 int
808 main (int argc, char **argv)
809 {
810 @@ -421,7 +574,12 @@ main (int argc, char **argv)
811
812 file_list = (optind < argc ? &argv[optind] : stdin_argv);
813
814 - expand ();
815 +#if HAVE_MBRTOWC
816 + if (MB_CUR_MAX > 1)
817 + expand_multibyte ();
818 + else
819 +#endif
820 + expand ();
821
822 if (have_read_stdin && fclose (stdin) != 0)
823 error (EXIT_FAILURE, errno, "-");
824 diff -urNp coreutils-8.0-orig/src/fold.c coreutils-8.0/src/fold.c
825 --- coreutils-8.0-orig/src/fold.c 2009-09-23 10:25:44.000000000 +0200
826 +++ coreutils-8.0/src/fold.c 2009-10-07 10:07:16.000000000 +0200
827 @@ -22,11 +22,33 @@
828 #include <getopt.h>
829 #include <sys/types.h>
830
831 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
832 +#if HAVE_WCHAR_H
833 +# include <wchar.h>
834 +#endif
835 +
836 +/* Get iswprint(), iswblank(), wcwidth(). */
837 +#if HAVE_WCTYPE_H
838 +# include <wctype.h>
839 +#endif
840 +
841 #include "system.h"
842 #include "error.h"
843 #include "quote.h"
844 #include "xstrtol.h"
845
846 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
847 + installation; work around this configuration error. */
848 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
849 +# undef MB_LEN_MAX
850 +# define MB_LEN_MAX 16
851 +#endif
852 +
853 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
854 +#if HAVE_MBRTOWC && defined mbstate_t
855 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
856 +#endif
857 +
858 #define TAB_WIDTH 8
859
860 /* The official name of this program (e.g., no `g' prefix). */
861 @@ -34,20 +56,41 @@
862
863 #define AUTHORS proper_name ("David MacKenzie")
864
865 +#define FATAL_ERROR(Message) \
866 + do \
867 + { \
868 + error (0, 0, (Message)); \
869 + usage (2); \
870 + } \
871 + while (0)
872 +
873 +enum operating_mode
874 +{
875 + /* Fold texts by columns that are at the given positions. */
876 + column_mode,
877 +
878 + /* Fold texts by bytes that are at the given positions. */
879 + byte_mode,
880 +
881 + /* Fold texts by characters that are at the given positions. */
882 + character_mode,
883 +};
884 +
885 +/* The argument shows current mode. (Default: column_mode) */
886 +static enum operating_mode operating_mode;
887 +
888 /* If nonzero, try to break on whitespace. */
889 static bool break_spaces;
890
891 -/* If nonzero, count bytes, not column positions. */
892 -static bool count_bytes;
893 -
894 /* If nonzero, at least one of the files we read was standard input. */
895 static bool have_read_stdin;
896
897 -static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
898 +static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
899
900 static struct option const longopts[] =
901 {
902 {"bytes", no_argument, NULL, 'b'},
903 + {"characters", no_argument, NULL, 'c'},
904 {"spaces", no_argument, NULL, 's'},
905 {"width", required_argument, NULL, 'w'},
906 {GETOPT_HELP_OPTION_DECL},
907 @@ -77,6 +120,7 @@ Mandatory arguments to long options are
908 "), stdout);
909 fputs (_("\
910 -b, --bytes count bytes rather than columns\n\
911 + -c, --characters count characters rather than columns\n\
912 -s, --spaces break at spaces\n\
913 -w, --width=WIDTH use WIDTH columns instead of 80\n\
914 "), stdout);
915 @@ -94,7 +138,7 @@ Mandatory arguments to long options are
916 static size_t
917 adjust_column (size_t column, char c)
918 {
919 - if (!count_bytes)
920 + if (operating_mode != byte_mode)
921 {
922 if (c == '\b')
923 {
924 @@ -117,30 +161,14 @@ adjust_column (size_t column, char c)
925 to stdout, with maximum line length WIDTH.
926 Return true if successful. */
927
928 -static bool
929 -fold_file (char const *filename, size_t width)
930 +static void
931 +fold_text (FILE *istream, size_t width, int *saved_errno)
932 {
933 - FILE *istream;
934 int c;
935 size_t column = 0; /* Screen column where next char will go. */
936 size_t offset_out = 0; /* Index in `line_out' for next char. */
937 static char *line_out = NULL;
938 static size_t allocated_out = 0;
939 - int saved_errno;
940 -
941 - if (STREQ (filename, "-"))
942 - {
943 - istream = stdin;
944 - have_read_stdin = true;
945 - }
946 - else
947 - istream = fopen (filename, "r");
948 -
949 - if (istream == NULL)
950 - {
951 - error (0, errno, "%s", filename);
952 - return false;
953 - }
954
955 while ((c = getc (istream)) != EOF)
956 {
957 @@ -168,6 +196,15 @@ fold_file (char const *filename, size_t
958 bool found_blank = false;
959 size_t logical_end = offset_out;
960
961 + /* If LINE_OUT has no wide character,
962 + put a new wide character in LINE_OUT
963 + if column is bigger than width. */
964 + if (offset_out == 0)
965 + {
966 + line_out[offset_out++] = c;
967 + continue;
968 + }
969 +
970 /* Look for the last blank. */
971 while (logical_end)
972 {
973 @@ -214,11 +251,222 @@ fold_file (char const *filename, size_t
974 line_out[offset_out++] = c;
975 }
976
977 - saved_errno = errno;
978 + *saved_errno = errno;
979
980 if (offset_out)
981 fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
982
983 +}
984 +
985 +#if HAVE_MBRTOWC
986 +static void
987 +fold_multibyte_text (FILE *istream, size_t width, int *saved_errno)
988 +{
989 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
990 + size_t buflen = 0; /* The length of the byte sequence in buf. */
991 + char *bufpos = NULL; /* Next read position of BUF. */
992 + wint_t wc; /* A gotten wide character. */
993 + size_t mblength; /* The byte size of a multibyte character which shows
994 + as same character as WC. */
995 + mbstate_t state, state_bak; /* State of the stream. */
996 + int convfail; /* 1, when conversion is failed. Otherwise 0. */
997 +
998 + static char *line_out = NULL;
999 + size_t offset_out = 0; /* Index in `line_out' for next char. */
1000 + static size_t allocated_out = 0;
1001 +
1002 + int increment;
1003 + size_t column = 0;
1004 +
1005 + size_t last_blank_pos;
1006 + size_t last_blank_column;
1007 + int is_blank_seen;
1008 + int last_blank_increment = 0;
1009 + int is_bs_following_last_blank;
1010 + size_t bs_following_last_blank_num;
1011 + int is_cr_after_last_blank;
1012 +
1013 +#define CLEAR_FLAGS \
1014 + do \
1015 + { \
1016 + last_blank_pos = 0; \
1017 + last_blank_column = 0; \
1018 + is_blank_seen = 0; \
1019 + is_bs_following_last_blank = 0; \
1020 + bs_following_last_blank_num = 0; \
1021 + is_cr_after_last_blank = 0; \
1022 + } \
1023 + while (0)
1024 +
1025 +#define START_NEW_LINE \
1026 + do \
1027 + { \
1028 + putchar ('\n'); \
1029 + column = 0; \
1030 + offset_out = 0; \
1031 + CLEAR_FLAGS; \
1032 + } \
1033 + while (0)
1034 +
1035 + CLEAR_FLAGS;
1036 + memset (&state, '\0', sizeof(mbstate_t));
1037 +
1038 + for (;; bufpos += mblength, buflen -= mblength)
1039 + {
1040 + if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
1041 + {
1042 + memmove (buf, bufpos, buflen);
1043 + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
1044 + bufpos = buf;
1045 + }
1046 +
1047 + if (buflen < 1)
1048 + break;
1049 +
1050 + /* Get a wide character. */
1051 + convfail = 0;
1052 + state_bak = state;
1053 + mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
1054 +
1055 + switch (mblength)
1056 + {
1057 + case (size_t)-1:
1058 + case (size_t)-2:
1059 + convfail++;
1060 + state = state_bak;
1061 + /* Fall through. */
1062 +
1063 + case 0:
1064 + mblength = 1;
1065 + break;
1066 + }
1067 +
1068 +rescan:
1069 + if (operating_mode == byte_mode) /* byte mode */
1070 + increment = mblength;
1071 + else if (operating_mode == character_mode) /* character mode */
1072 + increment = 1;
1073 + else /* column mode */
1074 + {
1075 + if (convfail)
1076 + increment = 1;
1077 + else
1078 + {
1079 + switch (wc)
1080 + {
1081 + case L'\n':
1082 + fwrite (line_out, sizeof(char), offset_out, stdout);
1083 + START_NEW_LINE;
1084 + continue;
1085 +
1086 + case L'\b':
1087 + increment = (column > 0) ? -1 : 0;
1088 + break;
1089 +
1090 + case L'\r':
1091 + increment = -1 * column;
1092 + break;
1093 +
1094 + case L'\t':
1095 + increment = 8 - column % 8;
1096 + break;
1097 +
1098 + default:
1099 + increment = wcwidth (wc);
1100 + increment = (increment < 0) ? 0 : increment;
1101 + }
1102 + }
1103 + }
1104 +
1105 + if (column + increment > width && break_spaces && last_blank_pos)
1106 + {
1107 + fwrite (line_out, sizeof(char), last_blank_pos, stdout);
1108 + putchar ('\n');
1109 +
1110 + offset_out = offset_out - last_blank_pos;
1111 + column = column - last_blank_column + ((is_cr_after_last_blank)
1112 + ? last_blank_increment : bs_following_last_blank_num);
1113 + memmove (line_out, line_out + last_blank_pos, offset_out);
1114 + CLEAR_FLAGS;
1115 + goto rescan;
1116 + }
1117 +
1118 + if (column + increment > width && column != 0)
1119 + {
1120 + fwrite (line_out, sizeof(char), offset_out, stdout);
1121 + START_NEW_LINE;
1122 + goto rescan;
1123 + }
1124 +
1125 + if (allocated_out < offset_out + mblength)
1126 + {
1127 + line_out = X2REALLOC (line_out, &allocated_out);
1128 + }
1129 +
1130 + memcpy (line_out + offset_out, bufpos, mblength);
1131 + offset_out += mblength;
1132 + column += increment;
1133 +
1134 + if (is_blank_seen && !convfail && wc == L'\r')
1135 + is_cr_after_last_blank = 1;
1136 +
1137 + if (is_bs_following_last_blank && !convfail && wc == L'\b')
1138 + ++bs_following_last_blank_num;
1139 + else
1140 + is_bs_following_last_blank = 0;
1141 +
1142 + if (break_spaces && !convfail && iswblank (wc))
1143 + {
1144 + last_blank_pos = offset_out;
1145 + last_blank_column = column;
1146 + is_blank_seen = 1;
1147 + last_blank_increment = increment;
1148 + is_bs_following_last_blank = 1;
1149 + bs_following_last_blank_num = 0;
1150 + is_cr_after_last_blank = 0;
1151 + }
1152 + }
1153 +
1154 + *saved_errno = errno;
1155 +
1156 + if (offset_out)
1157 + fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1158 +
1159 +}
1160 +#endif
1161 +
1162 +/* Fold file FILENAME, or standard input if FILENAME is "-",
1163 + to stdout, with maximum line length WIDTH.
1164 + Return 0 if successful, 1 if an error occurs. */
1165 +
1166 +static bool
1167 +fold_file (char *filename, size_t width)
1168 +{
1169 + FILE *istream;
1170 + int saved_errno;
1171 +
1172 + if (STREQ (filename, "-"))
1173 + {
1174 + istream = stdin;
1175 + have_read_stdin = 1;
1176 + }
1177 + else
1178 + istream = fopen (filename, "r");
1179 +
1180 + if (istream == NULL)
1181 + {
1182 + error (0, errno, "%s", filename);
1183 + return 1;
1184 + }
1185 +
1186 + /* Define how ISTREAM is being folded. */
1187 +#if HAVE_MBRTOWC
1188 + if (MB_CUR_MAX > 1)
1189 + fold_multibyte_text (istream, width, &saved_errno);
1190 + else
1191 +#endif
1192 + fold_text (istream, width, &saved_errno);
1193 +
1194 if (ferror (istream))
1195 {
1196 error (0, saved_errno, "%s", filename);
1197 @@ -251,7 +499,8 @@ main (int argc, char **argv)
1198
1199 atexit (close_stdout);
1200
1201 - break_spaces = count_bytes = have_read_stdin = false;
1202 + operating_mode = column_mode;
1203 + break_spaces = have_read_stdin = false;
1204
1205 while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
1206 {
1207 @@ -260,7 +509,15 @@ main (int argc, char **argv)
1208 switch (optc)
1209 {
1210 case 'b': /* Count bytes rather than columns. */
1211 - count_bytes = true;
1212 + if (operating_mode != column_mode)
1213 + FATAL_ERROR (_("only one way of folding may be specified"));
1214 + operating_mode = byte_mode;
1215 + break;
1216 +
1217 + case 'c':
1218 + if (operating_mode != column_mode)
1219 + FATAL_ERROR (_("only one way of folding may be specified"));
1220 + operating_mode = character_mode;
1221 break;
1222
1223 case 's': /* Break at word boundaries. */
1224 diff -urNp coreutils-8.0-orig/src/join.c coreutils-8.0/src/join.c
1225 --- coreutils-8.0-orig/src/join.c 2009-09-23 10:25:44.000000000 +0200
1226 +++ coreutils-8.0/src/join.c 2009-10-07 10:07:16.000000000 +0200
1227 @@ -22,17 +22,31 @@
1228 #include <sys/types.h>
1229 #include <getopt.h>
1230
1231 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
1232 +#if HAVE_WCHAR_H
1233 +# include <wchar.h>
1234 +#endif
1235 +
1236 +/* Get iswblank(), towupper(). */
1237 +#if HAVE_WCTYPE_H
1238 +# include <wctype.h>
1239 +#endif
1240 +
1241 #include "system.h"
1242 #include "error.h"
1243 #include "hard-locale.h"
1244 #include "linebuffer.h"
1245 -#include "memcasecmp.h"
1246 #include "quote.h"
1247 #include "stdio--.h"
1248 #include "xmemcoll.h"
1249 #include "xstrtol.h"
1250 #include "argmatch.h"
1251
1252 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1253 +#if HAVE_MBRTOWC && defined mbstate_t
1254 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1255 +#endif
1256 +
1257 /* The official name of this program (e.g., no `g' prefix). */
1258 #define PROGRAM_NAME "join"
1259
1260 @@ -121,10 +135,12 @@ static struct outlist outlist_head;
1261 /* Last element in `outlist', where a new element can be added. */
1262 static struct outlist *outlist_end = &outlist_head;
1263
1264 -/* Tab character separating fields. If negative, fields are separated
1265 - by any nonempty string of blanks, otherwise by exactly one
1266 - tab character whose value (when cast to unsigned char) equals TAB. */
1267 -static int tab = -1;
1268 +/* Tab character separating fields. If NULL, fields are separated
1269 + by any nonempty string of blanks. */
1270 +static char *tab = NULL;
1271 +
1272 +/* The number of bytes used for tab. */
1273 +static size_t tablen = 0;
1274
1275 /* If nonzero, check that the input is correctly ordered. */
1276 static enum
1277 @@ -239,10 +255,11 @@ xfields (struct line *line)
1278 if (ptr == lim)
1279 return;
1280
1281 - if (0 <= tab)
1282 + if (tab != NULL)
1283 {
1284 + unsigned char t = tab[0];
1285 char *sep;
1286 - for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
1287 + for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
1288 extract_field (line, ptr, sep - ptr);
1289 }
1290 else
1291 @@ -269,6 +286,148 @@ xfields (struct line *line)
1292 extract_field (line, ptr, lim - ptr);
1293 }
1294
1295 +#if HAVE_MBRTOWC
1296 +static void
1297 +xfields_multibyte (struct line *line)
1298 +{
1299 + char *ptr = line->buf.buffer;
1300 + char const *lim = ptr + line->buf.length - 1;
1301 + wchar_t wc = 0;
1302 + size_t mblength = 1;
1303 + mbstate_t state, state_bak;
1304 +
1305 + memset (&state, 0, sizeof (mbstate_t));
1306 +
1307 + if (ptr >= lim)
1308 + return;
1309 +
1310 + if (tab != NULL)
1311 + {
1312 + unsigned char t = tab[0];
1313 + char *sep = ptr;
1314 + for (; ptr < lim; ptr = sep + mblength)
1315 + {
1316 + sep = ptr;
1317 + while (sep < lim)
1318 + {
1319 + state_bak = state;
1320 + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1321 +
1322 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1323 + {
1324 + mblength = 1;
1325 + state = state_bak;
1326 + }
1327 + mblength = (mblength < 1) ? 1 : mblength;
1328 +
1329 + if (mblength == tablen && !memcmp (sep, tab, mblength))
1330 + break;
1331 + else
1332 + {
1333 + sep += mblength;
1334 + continue;
1335 + }
1336 + }
1337 +
1338 + if (sep >= lim)
1339 + break;
1340 +
1341 + extract_field (line, ptr, sep - ptr);
1342 + }
1343 + }
1344 + else
1345 + {
1346 + /* Skip leading blanks before the first field. */
1347 + while(ptr < lim)
1348 + {
1349 + state_bak = state;
1350 + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1351 +
1352 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1353 + {
1354 + mblength = 1;
1355 + state = state_bak;
1356 + break;
1357 + }
1358 + mblength = (mblength < 1) ? 1 : mblength;
1359 +
1360 + if (!iswblank(wc))
1361 + break;
1362 + ptr += mblength;
1363 + }
1364 +
1365 + do
1366 + {
1367 + char *sep;
1368 + state_bak = state;
1369 + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1370 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1371 + {
1372 + mblength = 1;
1373 + state = state_bak;
1374 + break;
1375 + }
1376 + mblength = (mblength < 1) ? 1 : mblength;
1377 +
1378 + sep = ptr + mblength;
1379 + while (sep < lim)
1380 + {
1381 + state_bak = state;
1382 + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1383 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1384 + {
1385 + mblength = 1;
1386 + state = state_bak;
1387 + break;
1388 + }
1389 + mblength = (mblength < 1) ? 1 : mblength;
1390 +
1391 + if (iswblank (wc))
1392 + break;
1393 +
1394 + sep += mblength;
1395 + }
1396 +
1397 + extract_field (line, ptr, sep - ptr);
1398 + if (sep >= lim)
1399 + return;
1400 +
1401 + state_bak = state;
1402 + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1403 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1404 + {
1405 + mblength = 1;
1406 + state = state_bak;
1407 + break;
1408 + }
1409 + mblength = (mblength < 1) ? 1 : mblength;
1410 +
1411 + ptr = sep + mblength;
1412 + while (ptr < lim)
1413 + {
1414 + state_bak = state;
1415 + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1416 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1417 + {
1418 + mblength = 1;
1419 + state = state_bak;
1420 + break;
1421 + }
1422 + mblength = (mblength < 1) ? 1 : mblength;
1423 +
1424 + if (!iswblank (wc))
1425 + break;
1426 +
1427 + ptr += mblength;
1428 + }
1429 + }
1430 + while (ptr < lim);
1431 + }
1432 +
1433 + extract_field (line, ptr, lim - ptr);
1434 +}
1435 +#endif
1436 +
1437 static void
1438 freeline (struct line *line)
1439 {
1440 @@ -287,56 +446,115 @@ keycmp (struct line const *line1, struct
1441 size_t jf_1, size_t jf_2)
1442 {
1443 /* Start of field to compare in each file. */
1444 - char *beg1;
1445 - char *beg2;
1446 -
1447 - size_t len1;
1448 - size_t len2; /* Length of fields to compare. */
1449 + char *beg[2];
1450 + char *copy[2];
1451 + size_t len[2]; /* Length of fields to compare. */
1452 int diff;
1453 + int i, j;
1454
1455 if (jf_1 < line1->nfields)
1456 {
1457 - beg1 = line1->fields[jf_1].beg;
1458 - len1 = line1->fields[jf_1].len;
1459 + beg[0] = line1->fields[jf_1].beg;
1460 + len[0] = line1->fields[jf_1].len;
1461 }
1462 else
1463 {
1464 - beg1 = NULL;
1465 - len1 = 0;
1466 + beg[0] = NULL;
1467 + len[0] = 0;
1468 }
1469
1470 if (jf_2 < line2->nfields)
1471 {
1472 - beg2 = line2->fields[jf_2].beg;
1473 - len2 = line2->fields[jf_2].len;
1474 + beg[1] = line2->fields[jf_2].beg;
1475 + len[1] = line2->fields[jf_2].len;
1476 }
1477 else
1478 {
1479 - beg2 = NULL;
1480 - len2 = 0;
1481 + beg[1] = NULL;
1482 + len[1] = 0;
1483 }
1484
1485 - if (len1 == 0)
1486 - return len2 == 0 ? 0 : -1;
1487 - if (len2 == 0)
1488 + if (len[0] == 0)
1489 + return len[1] == 0 ? 0 : -1;
1490 + if (len[1] == 0)
1491 return 1;
1492
1493 if (ignore_case)
1494 {
1495 - /* FIXME: ignore_case does not work with NLS (in particular,
1496 - with multibyte chars). */
1497 - diff = memcasecmp (beg1, beg2, MIN (len1, len2));
1498 +#ifdef HAVE_MBRTOWC
1499 + if (MB_CUR_MAX > 1)
1500 + {
1501 + size_t mblength;
1502 + wchar_t wc, uwc;
1503 + mbstate_t state, state_bak;
1504 +
1505 + memset (&state, '\0', sizeof (mbstate_t));
1506 +
1507 + for (i = 0; i < 2; i++)
1508 + {
1509 + copy[i] = alloca (len[i] + 1);
1510 +
1511 + for (j = 0; j < MIN (len[0], len[1]);)
1512 + {
1513 + state_bak = state;
1514 + mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
1515 +
1516 + switch (mblength)
1517 + {
1518 + case (size_t) -1:
1519 + case (size_t) -2:
1520 + state = state_bak;
1521 + /* Fall through */
1522 + case 0:
1523 + mblength = 1;
1524 + break;
1525 +
1526 + default:
1527 + uwc = towupper (wc);
1528 +
1529 + if (uwc != wc)
1530 + {
1531 + mbstate_t state_wc;
1532 +
1533 + memset (&state_wc, '\0', sizeof (mbstate_t));
1534 + wcrtomb (copy[i] + j, uwc, &state_wc);
1535 + }
1536 + else
1537 + memcpy (copy[i] + j, beg[i] + j, mblength);
1538 + }
1539 + j += mblength;
1540 + }
1541 + copy[i][j] = '\0';
1542 + }
1543 + }
1544 + else
1545 +#endif
1546 + {
1547 + for (i = 0; i < 2; i++)
1548 + {
1549 + copy[i] = alloca (len[i] + 1);
1550 +
1551 + for (j = 0; j < MIN (len[0], len[1]); j++)
1552 + copy[i][j] = toupper (beg[i][j]);
1553 +
1554 + copy[i][j] = '\0';
1555 + }
1556 + }
1557 }
1558 else
1559 {
1560 - if (hard_LC_COLLATE)
1561 - return xmemcoll (beg1, len1, beg2, len2);
1562 - diff = memcmp (beg1, beg2, MIN (len1, len2));
1563 + copy[0] = (unsigned char *) beg[0];
1564 + copy[1] = (unsigned char *) beg[1];
1565 }
1566
1567 + if (hard_LC_COLLATE)
1568 + return xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
1569 + diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
1570 +
1571 +
1572 if (diff)
1573 return diff;
1574 - return len1 < len2 ? -1 : len1 != len2;
1575 + return len[0] - len[1];
1576 }
1577
1578 /* Check that successive input lines PREV and CURRENT from input file
1579 @@ -417,6 +635,11 @@ get_line (FILE *fp, struct line **linep,
1580 return false;
1581 }
1582
1583 +#if HAVE_MBRTOWC
1584 + if (MB_CUR_MAX > 1)
1585 + xfields_multibyte (line);
1586 + else
1587 +#endif
1588 xfields (line);
1589
1590 if (prevline[which - 1])
1591 @@ -518,11 +741,18 @@ prfield (size_t n, struct line const *li
1592
1593 /* Print the join of LINE1 and LINE2. */
1594
1595 +#define PUT_TAB_CHAR \
1596 + do \
1597 + { \
1598 + (tab != NULL) ? \
1599 + fwrite(tab, sizeof(char), tablen, stdout) : putchar (' '); \
1600 + } \
1601 + while (0)
1602 +
1603 static void
1604 prjoin (struct line const *line1, struct line const *line2)
1605 {
1606 const struct outlist *outlist;
1607 - char output_separator = tab < 0 ? ' ' : tab;
1608
1609 outlist = outlist_head.next;
1610 if (outlist)
1611 @@ -557,7 +787,7 @@ prjoin (struct line const *line1, struct
1612 o = o->next;
1613 if (o == NULL)
1614 break;
1615 - putchar (output_separator);
1616 + PUT_TAB_CHAR;
1617 }
1618 putchar ('\n');
1619 }
1620 @@ -575,23 +805,23 @@ prjoin (struct line const *line1, struct
1621 prfield (join_field_1, line1);
1622 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
1623 {
1624 - putchar (output_separator);
1625 + PUT_TAB_CHAR;
1626 prfield (i, line1);
1627 }
1628 for (i = join_field_1 + 1; i < line1->nfields; ++i)
1629 {
1630 - putchar (output_separator);
1631 + PUT_TAB_CHAR;
1632 prfield (i, line1);
1633 }
1634
1635 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
1636 {
1637 - putchar (output_separator);
1638 + PUT_TAB_CHAR;
1639 prfield (i, line2);
1640 }
1641 for (i = join_field_2 + 1; i < line2->nfields; ++i)
1642 {
1643 - putchar (output_separator);
1644 + PUT_TAB_CHAR;
1645 prfield (i, line2);
1646 }
1647 putchar ('\n');
1648 @@ -1022,20 +1252,41 @@ main (int argc, char **argv)
1649
1650 case 't':
1651 {
1652 - unsigned char newtab = optarg[0];
1653 - if (! newtab)
1654 + char *newtab;
1655 + size_t newtablen;
1656 + if (! optarg[0])
1657 error (EXIT_FAILURE, 0, _("empty tab"));
1658 - if (optarg[1])
1659 + newtab = xstrdup (optarg);
1660 +#if HAVE_MBRTOWC
1661 + if (MB_CUR_MAX > 1)
1662 + {
1663 + mbstate_t state;
1664 +
1665 + memset (&state, 0, sizeof (mbstate_t));
1666 + newtablen = mbrtowc (NULL, newtab,
1667 + strnlen (newtab, MB_LEN_MAX),
1668 + &state);
1669 + if (newtablen == (size_t) 0
1670 + || newtablen == (size_t) -1
1671 + || newtablen == (size_t) -2)
1672 + newtablen = 1;
1673 + }
1674 + else
1675 +#endif
1676 + newtablen = 1;
1677 +
1678 + if (newtablen == 1 && newtab[1])
1679 + {
1680 + if (STREQ (newtab, "\\0"))
1681 + newtab[0] = '\0';
1682 + }
1683 + if (tab != NULL && strcmp (tab, newtab))
1684 {
1685 - if (STREQ (optarg, "\\0"))
1686 - newtab = '\0';
1687 - else
1688 - error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1689 - quote (optarg));
1690 + free (newtab);
1691 + error (EXIT_FAILURE, 0, _("incompatible tabs"));
1692 }
1693 - if (0 <= tab && tab != newtab)
1694 - error (EXIT_FAILURE, 0, _("incompatible tabs"));
1695 tab = newtab;
1696 + tablen = newtablen;
1697 }
1698 break;
1699
1700 diff -urNp coreutils-8.0-orig/src/pr.c coreutils-8.0/src/pr.c
1701 --- coreutils-8.0-orig/src/pr.c 2009-09-29 15:27:54.000000000 +0200
1702 +++ coreutils-8.0/src/pr.c 2009-10-07 10:07:16.000000000 +0200
1703 @@ -312,6 +312,32 @@
1704
1705 #include <getopt.h>
1706 #include <sys/types.h>
1707 +
1708 +/* Get MB_LEN_MAX. */
1709 +#include <limits.h>
1710 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
1711 + installation; work around this configuration error. */
1712 +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
1713 +# define MB_LEN_MAX 16
1714 +#endif
1715 +
1716 +/* Get MB_CUR_MAX. */
1717 +#include <stdlib.h>
1718 +
1719 +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
1720 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
1721 +#if HAVE_WCHAR_H
1722 +# include <wchar.h>
1723 +#endif
1724 +
1725 +/* Get iswprint(). -- for wcwidth(). */
1726 +#if HAVE_WCTYPE_H
1727 +# include <wctype.h>
1728 +#endif
1729 +#if !defined iswprint && !HAVE_ISWPRINT
1730 +# define iswprint(wc) 1
1731 +#endif
1732 +
1733 #include "system.h"
1734 #include "error.h"
1735 #include "hard-locale.h"
1736 @@ -322,6 +348,18 @@
1737 #include "strftime.h"
1738 #include "xstrtol.h"
1739
1740 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1741 +#if HAVE_MBRTOWC && defined mbstate_t
1742 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1743 +#endif
1744 +
1745 +#ifndef HAVE_DECL_WCWIDTH
1746 +"this configure-time declaration test was not run"
1747 +#endif
1748 +#if !HAVE_DECL_WCWIDTH
1749 +extern int wcwidth ();
1750 +#endif
1751 +
1752 /* The official name of this program (e.g., no `g' prefix). */
1753 #define PROGRAM_NAME "pr"
1754
1755 @@ -414,7 +452,20 @@ struct COLUMN
1756
1757 typedef struct COLUMN COLUMN;
1758
1759 -static int char_to_clump (char c);
1760 +/* Function pointers to switch functions for single byte locale or for
1761 + multibyte locale. If multibyte functions do not exist in your system,
1762 + these pointers always point to the function for single byte locale. */
1763 +static void (*print_char) (char c);
1764 +static int (*char_to_clump) (char c);
1765 +
1766 +/* Functions for single byte locale. */
1767 +static void print_char_single (char c);
1768 +static int char_to_clump_single (char c);
1769 +
1770 +/* Functions for multibyte locale. */
1771 +static void print_char_multi (char c);
1772 +static int char_to_clump_multi (char c);
1773 +
1774 static bool read_line (COLUMN *p);
1775 static bool print_page (void);
1776 static bool print_stored (COLUMN *p);
1777 @@ -424,6 +475,7 @@ static void print_header (void);
1778 static void pad_across_to (int position);
1779 static void add_line_number (COLUMN *p);
1780 static void getoptarg (char *arg, char switch_char, char *character,
1781 + int *character_length, int *character_width,
1782 int *number);
1783 void usage (int status);
1784 static void print_files (int number_of_files, char **av);
1785 @@ -438,7 +490,6 @@ static void store_char (char c);
1786 static void pad_down (int lines);
1787 static void read_rest_of_line (COLUMN *p);
1788 static void skip_read (COLUMN *p, int column_number);
1789 -static void print_char (char c);
1790 static void cleanup (void);
1791 static void print_sep_string (void);
1792 static void separator_string (const char *optarg_S);
1793 @@ -450,7 +501,7 @@ static COLUMN *column_vector;
1794 we store the leftmost columns contiguously in buff.
1795 To print a line from buff, get the index of the first character
1796 from line_vector[i], and print up to line_vector[i + 1]. */
1797 -static char *buff;
1798 +static unsigned char *buff;
1799
1800 /* Index of the position in buff where the next character
1801 will be stored. */
1802 @@ -554,7 +605,7 @@ static int chars_per_column;
1803 static bool untabify_input = false;
1804
1805 /* (-e) The input tab character. */
1806 -static char input_tab_char = '\t';
1807 +static char input_tab_char[MB_LEN_MAX] = "\t";
1808
1809 /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
1810 where the leftmost column is 1. */
1811 @@ -564,7 +615,10 @@ static int chars_per_input_tab = 8;
1812 static bool tabify_output = false;
1813
1814 /* (-i) The output tab character. */
1815 -static char output_tab_char = '\t';
1816 +static char output_tab_char[MB_LEN_MAX] = "\t";
1817 +
1818 +/* (-i) The byte length of output tab character. */
1819 +static int output_tab_char_length = 1;
1820
1821 /* (-i) The width of the output tab. */
1822 static int chars_per_output_tab = 8;
1823 @@ -638,7 +692,13 @@ static int power_10;
1824 static bool numbered_lines = false;
1825
1826 /* (-n) Character which follows each line number. */
1827 -static char number_separator = '\t';
1828 +static char number_separator[MB_LEN_MAX] = "\t";
1829 +
1830 +/* (-n) The byte length of the character which follows each line number. */
1831 +static int number_separator_length = 1;
1832 +
1833 +/* (-n) The character width of the character which follows each line number. */
1834 +static int number_separator_width = 0;
1835
1836 /* (-n) line counting starts with 1st line of input file (not with 1st
1837 line of 1st page printed). */
1838 @@ -691,6 +751,7 @@ static bool use_col_separator = false;
1839 -a|COLUMN|-m is a `space' and with the -J option a `tab'. */
1840 static char *col_sep_string = (char *) "";
1841 static int col_sep_length = 0;
1842 +static int col_sep_width = 0;
1843 static char *column_separator = (char *) " ";
1844 static char *line_separator = (char *) "\t";
1845
1846 @@ -847,6 +908,13 @@ separator_string (const char *optarg_S)
1847 col_sep_length = (int) strlen (optarg_S);
1848 col_sep_string = xmalloc (col_sep_length + 1);
1849 strcpy (col_sep_string, optarg_S);
1850 +
1851 +#if HAVE_MBRTOWC
1852 + if (MB_CUR_MAX > 1)
1853 + col_sep_width = mbswidth (col_sep_string, 0);
1854 + else
1855 +#endif
1856 + col_sep_width = col_sep_length;
1857 }
1858
1859 int
1860 @@ -871,6 +939,21 @@ main (int argc, char **argv)
1861
1862 atexit (close_stdout);
1863
1864 +/* Define which functions are used, the ones for single byte locale or the ones
1865 + for multibyte locale. */
1866 +#if HAVE_MBRTOWC
1867 + if (MB_CUR_MAX > 1)
1868 + {
1869 + print_char = print_char_multi;
1870 + char_to_clump = char_to_clump_multi;
1871 + }
1872 + else
1873 +#endif
1874 + {
1875 + print_char = print_char_single;
1876 + char_to_clump = char_to_clump_single;
1877 + }
1878 +
1879 n_files = 0;
1880 file_names = (argc > 1
1881 ? xmalloc ((argc - 1) * sizeof (char *))
1882 @@ -947,8 +1030,12 @@ main (int argc, char **argv)
1883 break;
1884 case 'e':
1885 if (optarg)
1886 - getoptarg (optarg, 'e', &input_tab_char,
1887 - &chars_per_input_tab);
1888 + {
1889 + int dummy_length, dummy_width;
1890 +
1891 + getoptarg (optarg, 'e', input_tab_char, &dummy_length,
1892 + &dummy_width, &chars_per_input_tab);
1893 + }
1894 /* Could check tab width > 0. */
1895 untabify_input = true;
1896 break;
1897 @@ -961,8 +1048,12 @@ main (int argc, char **argv)
1898 break;
1899 case 'i':
1900 if (optarg)
1901 - getoptarg (optarg, 'i', &output_tab_char,
1902 - &chars_per_output_tab);
1903 + {
1904 + int dummy_width;
1905 +
1906 + getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
1907 + &dummy_width, &chars_per_output_tab);
1908 + }
1909 /* Could check tab width > 0. */
1910 tabify_output = true;
1911 break;
1912 @@ -989,8 +1080,8 @@ main (int argc, char **argv)
1913 case 'n':
1914 numbered_lines = true;
1915 if (optarg)
1916 - getoptarg (optarg, 'n', &number_separator,
1917 - &chars_per_number);
1918 + getoptarg (optarg, 'n', number_separator, &number_separator_length,
1919 + &number_separator_width, &chars_per_number);
1920 break;
1921 case 'N':
1922 skip_count = false;
1923 @@ -1029,7 +1120,7 @@ main (int argc, char **argv)
1924 old_s = false;
1925 /* Reset an additional input of -s, -S dominates -s */
1926 col_sep_string = bad_cast ("");
1927 - col_sep_length = 0;
1928 + col_sep_length = col_sep_width = 0;
1929 use_col_separator = true;
1930 if (optarg)
1931 separator_string (optarg);
1932 @@ -1186,10 +1277,45 @@ main (int argc, char **argv)
1933 a number. */
1934
1935 static void
1936 -getoptarg (char *arg, char switch_char, char *character, int *number)
1937 +getoptarg (char *arg, char switch_char, char *character, int *character_length,
1938 + int *character_width, int *number)
1939 {
1940 if (!ISDIGIT (*arg))
1941 - *character = *arg++;
1942 + {
1943 +#ifdef HAVE_MBRTOWC
1944 + if (MB_CUR_MAX > 1) /* for multibyte locale. */
1945 + {
1946 + wchar_t wc;
1947 + size_t mblength;
1948 + int width;
1949 + mbstate_t state = {'\0'};
1950 +
1951 + mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
1952 +
1953 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1954 + {
1955 + *character_length = 1;
1956 + *character_width = 1;
1957 + }
1958 + else
1959 + {
1960 + *character_length = (mblength < 1) ? 1 : mblength;
1961 + width = wcwidth (wc);
1962 + *character_width = (width < 0) ? 0 : width;
1963 + }
1964 +
1965 + strncpy (character, arg, *character_length);
1966 + arg += *character_length;
1967 + }
1968 + else /* for single byte locale. */
1969 +#endif
1970 + {
1971 + *character = *arg++;
1972 + *character_length = 1;
1973 + *character_width = 1;
1974 + }
1975 + }
1976 +
1977 if (*arg)
1978 {
1979 long int tmp_long;
1980 @@ -1248,7 +1374,7 @@ init_parameters (int number_of_files)
1981 else
1982 col_sep_string = column_separator;
1983
1984 - col_sep_length = 1;
1985 + col_sep_length = col_sep_width = 1;
1986 use_col_separator = true;
1987 }
1988 /* It's rather pointless to define a TAB separator with column
1989 @@ -1279,11 +1405,11 @@ init_parameters (int number_of_files)
1990 TAB_WIDTH (chars_per_input_tab, chars_per_number); */
1991
1992 /* Estimate chars_per_text without any margin and keep it constant. */
1993 - if (number_separator == '\t')
1994 + if (number_separator[0] == '\t')
1995 number_width = chars_per_number +
1996 TAB_WIDTH (chars_per_default_tab, chars_per_number);
1997 else
1998 - number_width = chars_per_number + 1;
1999 + number_width = chars_per_number + number_separator_width;
2000
2001 /* The number is part of the column width unless we are
2002 printing files in parallel. */
2003 @@ -1298,7 +1424,7 @@ init_parameters (int number_of_files)
2004 }
2005
2006 chars_per_column = (chars_per_line - chars_used_by_number -
2007 - (columns - 1) * col_sep_length) / columns;
2008 + (columns - 1) * col_sep_width) / columns;
2009
2010 if (chars_per_column < 1)
2011 error (EXIT_FAILURE, 0, _("page width too narrow"));
2012 @@ -1423,7 +1549,7 @@ init_funcs (void)
2013
2014 /* Enlarge p->start_position of first column to use the same form of
2015 padding_not_printed with all columns. */
2016 - h = h + col_sep_length;
2017 + h = h + col_sep_width;
2018
2019 /* This loop takes care of all but the rightmost column. */
2020
2021 @@ -1457,7 +1583,7 @@ init_funcs (void)
2022 }
2023 else
2024 {
2025 - h = h_next + col_sep_length;
2026 + h = h_next + col_sep_width;
2027 h_next = h + chars_per_column;
2028 }
2029 }
2030 @@ -1747,9 +1873,9 @@ static void
2031 align_column (COLUMN *p)
2032 {
2033 padding_not_printed = p->start_position;
2034 - if (padding_not_printed - col_sep_length > 0)
2035 + if (padding_not_printed - col_sep_width > 0)
2036 {
2037 - pad_across_to (padding_not_printed - col_sep_length);
2038 + pad_across_to (padding_not_printed - col_sep_width);
2039 padding_not_printed = ANYWHERE;
2040 }
2041
2042 @@ -2020,13 +2146,13 @@ store_char (char c)
2043 /* May be too generous. */
2044 buff = X2REALLOC (buff, &buff_allocated);
2045 }
2046 - buff[buff_current++] = c;
2047 + buff[buff_current++] = (unsigned char) c;
2048 }
2049
2050 static void
2051 add_line_number (COLUMN *p)
2052 {
2053 - int i;
2054 + int i, j;
2055 char *s;
2056 int left_cut;
2057
2058 @@ -2049,22 +2175,24 @@ add_line_number (COLUMN *p)
2059 /* Tabification is assumed for multiple columns, also for n-separators,
2060 but `default n-separator = TAB' hasn't been given priority over
2061 equal column_width also specified by POSIX. */
2062 - if (number_separator == '\t')
2063 + if (number_separator[0] == '\t')
2064 {
2065 i = number_width - chars_per_number;
2066 while (i-- > 0)
2067 (p->char_func) (' ');
2068 }
2069 else
2070 - (p->char_func) (number_separator);
2071 + for (j = 0; j < number_separator_length; j++)
2072 + (p->char_func) (number_separator[j]);
2073 }
2074 else
2075 /* To comply with POSIX, we avoid any expansion of default TAB
2076 separator with a single column output. No column_width requirement
2077 has to be considered. */
2078 {
2079 - (p->char_func) (number_separator);
2080 - if (number_separator == '\t')
2081 + for (j = 0; j < number_separator_length; j++)
2082 + (p->char_func) (number_separator[j]);
2083 + if (number_separator[0] == '\t')
2084 output_position = POS_AFTER_TAB (chars_per_output_tab,
2085 output_position);
2086 }
2087 @@ -2225,7 +2353,7 @@ print_white_space (void)
2088 while (goal - h_old > 1
2089 && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2090 {
2091 - putchar (output_tab_char);
2092 + fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
2093 h_old = h_new;
2094 }
2095 while (++h_old <= goal)
2096 @@ -2245,6 +2373,7 @@ print_sep_string (void)
2097 {
2098 char *s;
2099 int l = col_sep_length;
2100 + int not_space_flag;
2101
2102 s = col_sep_string;
2103
2104 @@ -2258,6 +2387,7 @@ print_sep_string (void)
2105 {
2106 for (; separators_not_printed > 0; --separators_not_printed)
2107 {
2108 + not_space_flag = 0;
2109 while (l-- > 0)
2110 {
2111 /* 3 types of sep_strings: spaces only, spaces and chars,
2112 @@ -2271,12 +2401,15 @@ print_sep_string (void)
2113 }
2114 else
2115 {
2116 + not_space_flag = 1;
2117 if (spaces_not_printed > 0)
2118 print_white_space ();
2119 putchar (*s++);
2120 - ++output_position;
2121 }
2122 }
2123 + if (not_space_flag)
2124 + output_position += col_sep_width;
2125 +
2126 /* sep_string ends with some spaces */
2127 if (spaces_not_printed > 0)
2128 print_white_space ();
2129 @@ -2304,7 +2437,7 @@ print_clump (COLUMN *p, int n, char *clu
2130 required number of tabs and spaces. */
2131
2132 static void
2133 -print_char (char c)
2134 +print_char_single (char c)
2135 {
2136 if (tabify_output)
2137 {
2138 @@ -2328,6 +2461,74 @@ print_char (char c)
2139 putchar (c);
2140 }
2141
2142 +#ifdef HAVE_MBRTOWC
2143 +static void
2144 +print_char_multi (char c)
2145 +{
2146 + static size_t mbc_pos = 0;
2147 + static char mbc[MB_LEN_MAX] = {'\0'};
2148 + static mbstate_t state = {'\0'};
2149 + mbstate_t state_bak;
2150 + wchar_t wc;
2151 + size_t mblength;
2152 + int width;
2153 +
2154 + if (tabify_output)
2155 + {
2156 + state_bak = state;
2157 + mbc[mbc_pos++] = c;
2158 + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2159 +
2160 + while (mbc_pos > 0)
2161 + {
2162 + switch (mblength)
2163 + {
2164 + case (size_t)-2:
2165 + state = state_bak;
2166 + return;
2167 +
2168 + case (size_t)-1:
2169 + state = state_bak;
2170 + ++output_position;
2171 + putchar (mbc[0]);
2172 + memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
2173 + --mbc_pos;
2174 + break;
2175 +
2176 + case 0:
2177 + mblength = 1;
2178 +
2179 + default:
2180 + if (wc == L' ')
2181 + {
2182 + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2183 + --mbc_pos;
2184 + ++spaces_not_printed;
2185 + return;
2186 + }
2187 + else if (spaces_not_printed > 0)
2188 + print_white_space ();
2189 +
2190 + /* Nonprintables are assumed to have width 0, except L'\b'. */
2191 + if ((width = wcwidth (wc)) < 1)
2192 + {
2193 + if (wc == L'\b')
2194 + --output_position;
2195 + }
2196 + else
2197 + output_position += width;
2198 +
2199 + fwrite (mbc, sizeof(char), mblength, stdout);
2200 + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2201 + mbc_pos -= mblength;
2202 + }
2203 + }
2204 + return;
2205 + }
2206 + putchar (c);
2207 +}
2208 +#endif
2209 +
2210 /* Skip to page PAGE before printing.
2211 PAGE may be larger than total number of pages. */
2212
2213 @@ -2507,9 +2708,9 @@ read_line (COLUMN *p)
2214 align_empty_cols = false;
2215 }
2216
2217 - if (padding_not_printed - col_sep_length > 0)
2218 + if (padding_not_printed - col_sep_width > 0)
2219 {
2220 - pad_across_to (padding_not_printed - col_sep_length);
2221 + pad_across_to (padding_not_printed - col_sep_width);
2222 padding_not_printed = ANYWHERE;
2223 }
2224
2225 @@ -2610,9 +2811,9 @@ print_stored (COLUMN *p)
2226 }
2227 }
2228
2229 - if (padding_not_printed - col_sep_length > 0)
2230 + if (padding_not_printed - col_sep_width > 0)
2231 {
2232 - pad_across_to (padding_not_printed - col_sep_length);
2233 + pad_across_to (padding_not_printed - col_sep_width);
2234 padding_not_printed = ANYWHERE;
2235 }
2236
2237 @@ -2625,8 +2826,8 @@ print_stored (COLUMN *p)
2238 if (spaces_not_printed == 0)
2239 {
2240 output_position = p->start_position + end_vector[line];
2241 - if (p->start_position - col_sep_length == chars_per_margin)
2242 - output_position -= col_sep_length;
2243 + if (p->start_position - col_sep_width == chars_per_margin)
2244 + output_position -= col_sep_width;
2245 }
2246
2247 return true;
2248 @@ -2645,7 +2846,7 @@ print_stored (COLUMN *p)
2249 number of characters is 1.) */
2250
2251 static int
2252 -char_to_clump (char c)
2253 +char_to_clump_single (char c)
2254 {
2255 unsigned char uc = c;
2256 char *s = clump_buff;
2257 @@ -2655,10 +2856,10 @@ char_to_clump (char c)
2258 int chars;
2259 int chars_per_c = 8;
2260
2261 - if (c == input_tab_char)
2262 + if (c == input_tab_char[0])
2263 chars_per_c = chars_per_input_tab;
2264
2265 - if (c == input_tab_char || c == '\t')
2266 + if (c == input_tab_char[0] || c == '\t')
2267 {
2268 width = TAB_WIDTH (chars_per_c, input_position);
2269
2270 @@ -2739,6 +2940,154 @@ char_to_clump (char c)
2271 return chars;
2272 }
2273
2274 +#ifdef HAVE_MBRTOWC
2275 +static int
2276 +char_to_clump_multi (char c)
2277 +{
2278 + static size_t mbc_pos = 0;
2279 + static char mbc[MB_LEN_MAX] = {'\0'};
2280 + static mbstate_t state = {'\0'};
2281 + mbstate_t state_bak;
2282 + wchar_t wc;
2283 + size_t mblength;
2284 + int wc_width;
2285 + register char *s = clump_buff;
2286 + register int i, j;
2287 + char esc_buff[4];
2288 + int width;
2289 + int chars;
2290 + int chars_per_c = 8;
2291 +
2292 + state_bak = state;
2293 + mbc[mbc_pos++] = c;
2294 + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2295 +
2296 + width = 0;
2297 + chars = 0;
2298 + while (mbc_pos > 0)
2299 + {
2300 + switch (mblength)
2301 + {
2302 + case (size_t)-2:
2303 + state = state_bak;
2304 + return 0;
2305 +
2306 + case (size_t)-1:
2307 + state = state_bak;
2308 + mblength = 1;
2309 +
2310 + if (use_esc_sequence || use_cntrl_prefix)
2311 + {
2312 + width = +4;
2313 + chars = +4;
2314 + *s++ = '\\';
2315 + sprintf (esc_buff, "%03o", mbc[0]);
2316 + for (i = 0; i <= 2; ++i)
2317 + *s++ = (int) esc_buff[i];
2318 + }
2319 + else
2320 + {
2321 + width += 1;
2322 + chars += 1;
2323 + *s++ = mbc[0];
2324 + }
2325 + break;
2326 +
2327 + case 0:
2328 + mblength = 1;
2329 + /* Fall through */
2330 +
2331 + default:
2332 + if (memcmp (mbc, input_tab_char, mblength) == 0)
2333 + chars_per_c = chars_per_input_tab;
2334 +
2335 + if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
2336 + {
2337 + int width_inc;
2338 +
2339 + width_inc = TAB_WIDTH (chars_per_c, input_position);
2340 + width += width_inc;
2341 +
2342 + if (untabify_input)
2343 + {
2344 + for (i = width_inc; i; --i)
2345 + *s++ = ' ';
2346 + chars += width_inc;
2347 + }
2348 + else
2349 + {
2350 + for (i = 0; i < mblength; i++)
2351 + *s++ = mbc[i];
2352 + chars += mblength;
2353 + }
2354 + }
2355 + else if ((wc_width = wcwidth (wc)) < 1)
2356 + {
2357 + if (use_esc_sequence)
2358 + {
2359 + for (i = 0; i < mblength; i++)
2360 + {
2361 + width += 4;
2362 + chars += 4;
2363 + *s++ = '\\';
2364 + sprintf (esc_buff, "%03o", c);
2365 + for (j = 0; j <= 2; ++j)
2366 + *s++ = (int) esc_buff[j];
2367 + }
2368 + }
2369 + else if (use_cntrl_prefix)
2370 + {
2371 + if (wc < 0200)
2372 + {
2373 + width += 2;
2374 + chars += 2;
2375 + *s++ = '^';
2376 + *s++ = wc ^ 0100;
2377 + }
2378 + else
2379 + {
2380 + for (i = 0; i < mblength; i++)
2381 + {
2382 + width += 4;
2383 + chars += 4;
2384 + *s++ = '\\';
2385 + sprintf (esc_buff, "%03o", c);
2386 + for (j = 0; j <= 2; ++j)
2387 + *s++ = (int) esc_buff[j];
2388 + }
2389 + }
2390 + }
2391 + else if (wc == L'\b')
2392 + {
2393 + width += -1;
2394 + chars += 1;
2395 + *s++ = c;
2396 + }
2397 + else
2398 + {
2399 + width += 0;
2400 + chars += mblength;
2401 + for (i = 0; i < mblength; i++)
2402 + *s++ = mbc[i];
2403 + }
2404 + }
2405 + else
2406 + {
2407 + width += wc_width;
2408 + chars += mblength;
2409 + for (i = 0; i < mblength; i++)
2410 + *s++ = mbc[i];
2411 + }
2412 + }
2413 + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2414 + mbc_pos -= mblength;
2415 + }
2416 +
2417 + input_position += width;
2418 + return chars;
2419 +}
2420 +#endif
2421 +
2422 /* We've just printed some files and need to clean up things before
2423 looking for more options and printing the next batch of files.
2424
2425 diff -urNp coreutils-8.0-orig/src/sort.c coreutils-8.0/src/sort.c
2426 --- coreutils-8.0-orig/src/sort.c 2009-09-29 15:27:54.000000000 +0200
2427 +++ coreutils-8.0/src/sort.c 2009-10-07 10:07:16.000000000 +0200
2428 @@ -22,10 +22,19 @@
2429
2430 #include <config.h>
2431
2432 +#include <assert.h>
2433 #include <getopt.h>
2434 #include <sys/types.h>
2435 #include <sys/wait.h>
2436 #include <signal.h>
2437 +#if HAVE_WCHAR_H
2438 +# include <wchar.h>
2439 +#endif
2440 +/* Get isw* functions. */
2441 +#if HAVE_WCTYPE_H
2442 +# include <wctype.h>
2443 +#endif
2444 +
2445 #include "system.h"
2446 #include "argmatch.h"
2447 #include "error.h"
2448 @@ -122,14 +131,38 @@ static int decimal_point;
2449 /* Thousands separator; if -1, then there isn't one. */
2450 static int thousands_sep;
2451
2452 +static int force_general_numcompare = 0;
2453 +
2454 /* Nonzero if the corresponding locales are hard. */
2455 static bool hard_LC_COLLATE;
2456 -#if HAVE_NL_LANGINFO
2457 +#if HAVE_LANGINFO_CODESET
2458 static bool hard_LC_TIME;
2459 #endif
2460
2461 #define NONZERO(x) ((x) != 0)
2462
2463 +/* Get the byte length of a multibyte character. */
2464 +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
2465 + do \
2466 + { \
2467 + wchar_t wc; \
2468 + mbstate_t state_bak; \
2469 + \
2470 + state_bak = STATE; \
2471 + mblength = mbrtowc (&wc, PTR, LIM - PTR, &STATE); \
2472 + \
2473 + switch (MBLENGTH) \
2474 + { \
2475 + case (size_t)-1: \
2476 + case (size_t)-2: \
2477 + STATE = state_bak; \
2478 + /* Fall through. */ \
2479 + case 0: \
2480 + MBLENGTH = 1; \
2481 + } \
2482 + } \
2483 + while (0)
2484 +
2485 /* The kind of blanks for '-b' to skip in various options. */
2486 enum blanktype { bl_start, bl_end, bl_both };
2487
2488 @@ -268,13 +301,11 @@ static bool reverse;
2489 they were read if all keys compare equal. */
2490 static bool stable;
2491
2492 -/* If TAB has this value, blanks separate fields. */
2493 -enum { TAB_DEFAULT = CHAR_MAX + 1 };
2494 -
2495 -/* Tab character separating fields. If TAB_DEFAULT, then fields are
2496 +/* Tab character separating fields. If tab_length is 0, then fields are
2497 separated by the empty string between a non-blank character and a blank
2498 character. */
2499 -static int tab = TAB_DEFAULT;
2500 +static char tab[MB_LEN_MAX + 1];
2501 +static size_t tab_length = 0;
2502
2503 /* Flag to remove consecutive duplicate lines from the output.
2504 Only the last of a sequence of equal lines will be output. */
2505 @@ -712,6 +743,44 @@ reap_some (void)
2506 update_proc (pid);
2507 }
2508
2509 +/* Function pointers. */
2510 +static void
2511 +(*inittables) (void);
2512 +static char *
2513 +(*begfield) (const struct line*, const struct keyfield *);
2514 +static char *
2515 +(*limfield) (const struct line*, const struct keyfield *);
2516 +static int
2517 +(*getmonth) (char const *, size_t);
2518 +static int
2519 +(*keycompare) (const struct line *, const struct line *);
2520 +static int
2521 +(*numcompare) (const char *, const char *);
2522 +
2523 +/* Test for white space multibyte character.
2524 + Set LENGTH to the byte length of the investigated multibyte character. */
2525 +#if HAVE_MBRTOWC
2526 +static int
2527 +ismbblank (const char *str, size_t len, size_t *length)
2528 +{
2529 + size_t mblength;
2530 + wchar_t wc;
2531 + mbstate_t state;
2532 +
2533 + memset (&state, '\0', sizeof(mbstate_t));
2534 + mblength = mbrtowc (&wc, str, len, &state);
2535 +
2536 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
2537 + {
2538 + *length = 1;
2539 + return 0;
2540 + }
2541 +
2542 + *length = (mblength < 1) ? 1 : mblength;
2543 + return iswblank (wc);
2544 +}
2545 +#endif
2546 +
2547 /* Clean up any remaining temporary files. */
2548
2549 static void
2550 @@ -1093,7 +1162,7 @@ zaptemp (const char *name)
2551 free (node);
2552 }
2553
2554 -#if HAVE_NL_LANGINFO
2555 +#if HAVE_LANGINFO_CODESET
2556
2557 static int
2558 struct_month_cmp (const void *m1, const void *m2)
2559 @@ -1108,7 +1177,7 @@ struct_month_cmp (const void *m1, const
2560 /* Initialize the character class tables. */
2561
2562 static void
2563 -inittables (void)
2564 +inittables_uni (void)
2565 {
2566 size_t i;
2567
2568 @@ -1120,7 +1189,7 @@ inittables (void)
2569 fold_toupper[i] = toupper (i);
2570 }
2571
2572 -#if HAVE_NL_LANGINFO
2573 +#if HAVE_LANGINFO_CODESET
2574 /* If we're not in the "C" locale, read different names for months. */
2575 if (hard_LC_TIME)
2576 {
2577 @@ -1202,6 +1271,64 @@ specify_nmerge (int oi, char c, char con
2578 xstrtol_fatal (e, oi, c, long_options, s);
2579 }
2580
2581 +#if HAVE_MBRTOWC
2582 +static void
2583 +inittables_mb (void)
2584 +{
2585 + int i, j, k, l;
2586 + char *name, *s;
2587 + size_t s_len, mblength;
2588 + char mbc[MB_LEN_MAX];
2589 + wchar_t wc, pwc;
2590 + mbstate_t state_mb, state_wc;
2591 +
2592 + for (i = 0; i < MONTHS_PER_YEAR; i++)
2593 + {
2594 + s = (char *) nl_langinfo (ABMON_1 + i);
2595 + s_len = strlen (s);
2596 + monthtab[i].name = name = (char *) xmalloc (s_len + 1);
2597 + monthtab[i].val = i + 1;
2598 +
2599 + memset (&state_mb, '\0', sizeof (mbstate_t));
2600 + memset (&state_wc, '\0', sizeof (mbstate_t));
2601 +
2602 + for (j = 0; j < s_len;)
2603 + {
2604 + if (!ismbblank (s + j, s_len - j, &mblength))
2605 + break;
2606 + j += mblength;
2607 + }
2608 +
2609 + for (k = 0; j < s_len;)
2610 + {
2611 + mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
2612 + assert (mblength != (size_t)-1 && mblength != (size_t)-2);
2613 + if (mblength == 0)
2614 + break;
2615 +
2616 + pwc = towupper (wc);
2617 + if (pwc == wc)
2618 + {
2619 + memcpy (mbc, s + j, mblength);
2620 + j += mblength;
2621 + }
2622 + else
2623 + {
2624 + j += mblength;
2625 + mblength = wcrtomb (mbc, pwc, &state_wc);
2626 + assert (mblength != (size_t)0 && mblength != (size_t)-1);
2627 + }
2628 +
2629 + for (l = 0; l < mblength; l++)
2630 + name[k++] = mbc[l];
2631 + }
2632 + name[k] = '\0';
2633 + }
2634 + qsort ((void *) monthtab, MONTHS_PER_YEAR,
2635 + sizeof (struct month), struct_month_cmp);
2636 +}
2637 +#endif
2638 +
2639 /* Specify the amount of main memory to use when sorting. */
2640 static void
2641 specify_sort_size (int oi, char c, char const *s)
2642 @@ -1412,7 +1539,7 @@ buffer_linelim (struct buffer const *buf
2643 by KEY in LINE. */
2644
2645 static char *
2646 -begfield (const struct line *line, const struct keyfield *key)
2647 +begfield_uni (const struct line *line, const struct keyfield *key)
2648 {
2649 char *ptr = line->text, *lim = ptr + line->length - 1;
2650 size_t sword = key->sword;
2651 @@ -1421,10 +1548,10 @@ begfield (const struct line *line, const
2652 /* The leading field separator itself is included in a field when -t
2653 is absent. */
2654
2655 - if (tab != TAB_DEFAULT)
2656 + if (tab_length)
2657 while (ptr < lim && sword--)
2658 {
2659 - while (ptr < lim && *ptr != tab)
2660 + while (ptr < lim && *ptr != tab[0])
2661 ++ptr;
2662 if (ptr < lim)
2663 ++ptr;
2664 @@ -1450,11 +1577,70 @@ begfield (const struct line *line, const
2665 return ptr;
2666 }
2667
2668 +#if HAVE_MBRTOWC
2669 +static char *
2670 +begfield_mb (const struct line *line, const struct keyfield *key)
2671 +{
2672 + int i;
2673 + char *ptr = line->text, *lim = ptr + line->length - 1;
2674 + size_t sword = key->sword;
2675 + size_t schar = key->schar;
2676 + size_t mblength;
2677 + mbstate_t state;
2678 +
2679 + memset (&state, '\0', sizeof(mbstate_t));
2680 +
2681 + if (tab_length)
2682 + while (ptr < lim && sword--)
2683 + {
2684 + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2685 + {
2686 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2687 + ptr += mblength;
2688 + }
2689 + if (ptr < lim)
2690 + {
2691 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2692 + ptr += mblength;
2693 + }
2694 + }
2695 + else
2696 + while (ptr < lim && sword--)
2697 + {
2698 + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2699 + ptr += mblength;
2700 + if (ptr < lim)
2701 + {
2702 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2703 + ptr += mblength;
2704 + }
2705 + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2706 + ptr += mblength;
2707 + }
2708 +
2709 + if (key->skipsblanks)
2710 + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2711 + ptr += mblength;
2712 +
2713 + for (i = 0; i < schar; i++)
2714 + {
2715 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2716 +
2717 + if (ptr + mblength > lim)
2718 + break;
2719 + else
2720 + ptr += mblength;
2721 + }
2722 +
2723 + return ptr;
2724 +}
2725 +#endif
2726 +
2727 /* Return the limit of (a pointer to the first character after) the field
2728 in LINE specified by KEY. */
2729
2730 static char *
2731 -limfield (const struct line *line, const struct keyfield *key)
2732 +limfield_uni (const struct line *line, const struct keyfield *key)
2733 {
2734 char *ptr = line->text, *lim = ptr + line->length - 1;
2735 size_t eword = key->eword, echar = key->echar;
2736 @@ -1469,10 +1655,10 @@ limfield (const struct line *line, const
2737 `beginning' is the first character following the delimiting TAB.
2738 Otherwise, leave PTR pointing at the first `blank' character after
2739 the preceding field. */
2740 - if (tab != TAB_DEFAULT)
2741 + if (tab_length)
2742 while (ptr < lim && eword--)
2743 {
2744 - while (ptr < lim && *ptr != tab)
2745 + while (ptr < lim && *ptr != tab[0])
2746 ++ptr;
2747 if (ptr < lim && (eword || echar))
2748 ++ptr;
2749 @@ -1518,10 +1704,10 @@ limfield (const struct line *line, const
2750 */
2751
2752 /* Make LIM point to the end of (one byte past) the current field. */
2753 - if (tab != TAB_DEFAULT)
2754 + if (tab_length)
2755 {
2756 char *newlim;
2757 - newlim = memchr (ptr, tab, lim - ptr);
2758 + newlim = memchr (ptr, tab[0], lim - ptr);
2759 if (newlim)
2760 lim = newlim;
2761 }
2762 @@ -1552,6 +1738,113 @@ limfield (const struct line *line, const
2763 return ptr;
2764 }
2765
2766 +#if HAVE_MBRTOWC
2767 +static char *
2768 +limfield_mb (const struct line *line, const struct keyfield *key)
2769 +{
2770 + char *ptr = line->text, *lim = ptr + line->length - 1;
2771 + size_t eword = key->eword, echar = key->echar;
2772 + int i;
2773 + size_t mblength;
2774 + mbstate_t state;
2775 +
2776 + if (echar == 0)
2777 + eword++; /* skip all of end field. */
2778 +
2779 + memset (&state, '\0', sizeof(mbstate_t));
2780 +
2781 + if (tab_length)
2782 + while (ptr < lim && eword--)
2783 + {
2784 + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2785 + {
2786 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2787 + ptr += mblength;
2788 + }
2789 + if (ptr < lim && (eword | echar))
2790 + {
2791 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2792 + ptr += mblength;
2793 + }
2794 + }
2795 + else
2796 + while (ptr < lim && eword--)
2797 + {
2798 + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2799 + ptr += mblength;
2800 + if (ptr < lim)
2801 + {
2802 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2803 + ptr += mblength;
2804 + }
2805 + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2806 + ptr += mblength;
2807 + }
2808 +
2809 +
2810 +# ifdef POSIX_UNSPECIFIED
2811 + /* Make LIM point to the end of (one byte past) the current field. */
2812 + if (tab_length)
2813 + {
2814 + char *newlim, *p;
2815 +
2816 + newlim = NULL;
2817 + for (p = ptr; p < lim;)
2818 + {
2819 + if (memcmp (p, tab, tab_length) == 0)
2820 + {
2821 + newlim = p;
2822 + break;
2823 + }
2824 +
2825 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2826 + p += mblength;
2827 + }
2828 + }
2829 + else
2830 + {
2831 + char *newlim;
2832 + newlim = ptr;
2833 +
2834 + while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
2835 + newlim += mblength;
2836 + if (ptr < lim)
2837 + {
2838 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2839 + ptr += mblength;
2840 + }
2841 + while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
2842 + newlim += mblength;
2843 + lim = newlim;
2844 + }
2845 +# endif
2846 +
2847 + if (echar != 0)
2848 + {
2849 + /* If we're skipping leading blanks, don't start counting characters
2850 + * until after skipping past any leading blanks. */
2851 + if (key->skipsblanks)
2852 + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2853 + ptr += mblength;
2854 +
2855 + memset (&state, '\0', sizeof(mbstate_t));
2856 +
2857 + /* Advance PTR by ECHAR (if possible), but no further than LIM. */
2858 + for (i = 0; i < echar; i++)
2859 + {
2860 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2861 +
2862 + if (ptr + mblength > lim)
2863 + break;
2864 + else
2865 + ptr += mblength;
2866 + }
2867 + }
2868 +
2869 + return ptr;
2870 +}
2871 +#endif
2872 +
2873 /* Fill BUF reading from FP, moving buf->left bytes from the end
2874 of buf->buf to the beginning first. If EOF is reached and the
2875 file wasn't terminated by a newline, supply one. Set up BUF's line
2876 @@ -1634,8 +1927,24 @@ fillbuf (struct buffer *buf, FILE *fp, c
2877 else
2878 {
2879 if (key->skipsblanks)
2880 - while (blanks[to_uchar (*line_start)])
2881 - line_start++;
2882 + {
2883 +#if HAVE_MBRTOWC
2884 + if (MB_CUR_MAX > 1)
2885 + {
2886 + size_t mblength;
2887 + mbstate_t state;
2888 + memset (&state, '\0', sizeof(mbstate_t));
2889 + while (line_start < line->keylim &&
2890 + ismbblank (line_start,
2891 + line->keylim - line_start,
2892 + &mblength))
2893 + line_start += mblength;
2894 + }
2895 + else
2896 +#endif
2897 + while (blanks[to_uchar (*line_start)])
2898 + line_start++;
2899 + }
2900 line->keybeg = line_start;
2901 }
2902 }
2903 @@ -1673,7 +1982,7 @@ fillbuf (struct buffer *buf, FILE *fp, c
2904 hideously fast. */
2905
2906 static int
2907 -numcompare (const char *a, const char *b)
2908 +numcompare_uni (const char *a, const char *b)
2909 {
2910 while (blanks[to_uchar (*a)])
2911 a++;
2912 @@ -1782,6 +2091,25 @@ human_numcompare (const char *a, const c
2913 : strnumcmp (a, b, decimal_point, thousands_sep));
2914 }
2915
2916 +#if HAVE_MBRTOWC
2917 +static int
2918 +numcompare_mb (const char *a, const char *b)
2919 +{
2920 + size_t mblength, len;
2921 + len = strlen (a); /* okay for UTF-8 */
2922 + while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
2923 + {
2924 + a += mblength;
2925 + len -= mblength;
2926 + }
2927 + len = strlen (b); /* okay for UTF-8 */
2928 + while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
2929 + b += mblength;
2930 +
2931 + return strnumcmp (a, b, decimal_point, thousands_sep);
2932 +}
2933 +#endif /* HAVE_MBRTOWC */
2934 +
2935 static int
2936 general_numcompare (const char *sa, const char *sb)
2937 {
2938 @@ -1815,7 +2143,7 @@ general_numcompare (const char *sa, cons
2939 Return 0 if the name in S is not recognized. */
2940
2941 static int
2942 -getmonth (char const *month, size_t len)
2943 +getmonth_uni (char const *month, size_t len)
2944 {
2945 size_t lo = 0;
2946 size_t hi = MONTHS_PER_YEAR;
2947 @@ -1996,11 +2324,79 @@ compare_version (char *restrict texta, s
2948 return diff;
2949 }
2950
2951 +#if HAVE_MBRTOWC
2952 +static int
2953 +getmonth_mb (const char *s, size_t len)
2954 +{
2955 + char *month;
2956 + register size_t i;
2957 + register int lo = 0, hi = MONTHS_PER_YEAR, result;
2958 + char *tmp;
2959 + size_t wclength, mblength;
2960 + const char **pp;
2961 + const wchar_t **wpp;
2962 + wchar_t *month_wcs;
2963 + mbstate_t state;
2964 +
2965 + while (len > 0 && ismbblank (s, len, &mblength))
2966 + {
2967 + s += mblength;
2968 + len -= mblength;
2969 + }
2970 +
2971 + if (len == 0)
2972 + return 0;
2973 +
2974 + month = (char *) alloca (len + 1);
2975 +
2976 + tmp = (char *) alloca (len + 1);
2977 + memcpy (tmp, s, len);
2978 + tmp[len] = '\0';
2979 + pp = (const char **)&tmp;
2980 + month_wcs = (wchar_t *) alloca ((len + 1) * sizeof (wchar_t));
2981 + memset (&state, '\0', sizeof(mbstate_t));
2982 +
2983 + wclength = mbsrtowcs (month_wcs, pp, len + 1, &state);
2984 + assert (wclength != (size_t)-1 && *pp == NULL);
2985 +
2986 + for (i = 0; i < wclength; i++)
2987 + {
2988 + month_wcs[i] = towupper(month_wcs[i]);
2989 + if (iswblank (month_wcs[i]))
2990 + {
2991 + month_wcs[i] = L'\0';
2992 + break;
2993 + }
2994 + }
2995 +
2996 + wpp = (const wchar_t **)&month_wcs;
2997 +
2998 + mblength = wcsrtombs (month, wpp, len + 1, &state);
2999 + assert (mblength != (-1) && *wpp == NULL);
3000 +
3001 + do
3002 + {
3003 + int ix = (lo + hi) / 2;
3004 +
3005 + if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
3006 + hi = ix;
3007 + else
3008 + lo = ix;
3009 + }
3010 + while (hi - lo > 1);
3011 +
3012 + result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
3013 + ? monthtab[lo].val : 0);
3014 +
3015 + return result;
3016 +}
3017 +#endif
3018 +
3019 /* Compare two lines A and B trying every key in sequence until there
3020 are no more keys or a difference is found. */
3021
3022 static int
3023 -keycompare (const struct line *a, const struct line *b)
3024 +keycompare_uni (const struct line *a, const struct line *b)
3025 {
3026 struct keyfield *key = keylist;
3027
3028 @@ -2180,6 +2576,179 @@ keycompare (const struct line *a, const
3029 return key->reverse ? -diff : diff;
3030 }
3031
3032 +#if HAVE_MBRTOWC
3033 +static int
3034 +keycompare_mb (const struct line *a, const struct line *b)
3035 +{
3036 + struct keyfield *key = keylist;
3037 +
3038 + /* For the first iteration only, the key positions have been
3039 + precomputed for us. */
3040 + char *texta = a->keybeg;
3041 + char *textb = b->keybeg;
3042 + char *lima = a->keylim;
3043 + char *limb = b->keylim;
3044 +
3045 + size_t mblength_a, mblength_b;
3046 + wchar_t wc_a, wc_b;
3047 + mbstate_t state_a, state_b;
3048 +
3049 + int diff;
3050 +
3051 + memset (&state_a, '\0', sizeof(mbstate_t));
3052 + memset (&state_b, '\0', sizeof(mbstate_t));
3053 +
3054 + for (;;)
3055 + {
3056 + char const *translate = key->translate;
3057 + bool const *ignore = key->ignore;
3058 +
3059 + /* Find the lengths. */
3060 + size_t lena = lima <= texta ? 0 : lima - texta;
3061 + size_t lenb = limb <= textb ? 0 : limb - textb;
3062 +
3063 + /* Actually compare the fields. */
3064 + if (key->random)
3065 + diff = compare_random (texta, lena, textb, lenb);
3066 + else if (key->numeric | key->general_numeric | key->human_numeric)
3067 + {
3068 + char savea = *lima, saveb = *limb;
3069 +
3070 + *lima = *limb = '\0';
3071 + diff = (key->numeric ? numcompare (texta, textb)
3072 + : key->general_numeric ? general_numcompare (texta, textb)
3073 + : human_numcompare (texta, textb, key));
3074 + *lima = savea, *limb = saveb;
3075 + }
3076 + else if (key->version)
3077 + diff = compare_version (texta, lena, textb, lenb);
3078 + else if (key->month)
3079 + diff = getmonth (texta, lena) - getmonth (textb, lenb);
3080 + else
3081 + {
3082 + if (ignore || translate)
3083 + {
3084 + char *copy_a = (char *) alloca (lena + 1 + lenb + 1);
3085 + char *copy_b = copy_a + lena + 1;
3086 + size_t new_len_a, new_len_b;
3087 + size_t i, j;
3088 +
3089 + /* Ignore and/or translate chars before comparing. */
3090 +# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
3091 + do \
3092 + { \
3093 + wchar_t uwc; \
3094 + char mbc[MB_LEN_MAX]; \
3095 + mbstate_t state_wc; \
3096 + \
3097 + for (NEW_LEN = i = 0; i < LEN;) \
3098 + { \
3099 + mbstate_t state_bak; \
3100 + \
3101 + state_bak = STATE; \
3102 + MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
3103 + \
3104 + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
3105 + || MBLENGTH == 0) \
3106 + { \
3107 + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
3108 + STATE = state_bak; \
3109 + if (!ignore) \
3110 + COPY[NEW_LEN++] = TEXT[i++]; \
3111 + continue; \
3112 + } \
3113 + \
3114 + if (ignore) \
3115 + { \
3116 + if ((ignore == nonprinting && !iswprint (WC)) \
3117 + || (ignore == nondictionary \
3118 + && !iswalnum (WC) && !iswblank (WC))) \
3119 + { \
3120 + i += MBLENGTH; \
3121 + continue; \
3122 + } \
3123 + } \
3124 + \
3125 + if (translate) \
3126 + { \
3127 + \
3128 + uwc = towupper(WC); \
3129 + if (WC == uwc) \
3130 + { \
3131 + memcpy (mbc, TEXT + i, MBLENGTH); \
3132 + i += MBLENGTH; \
3133 + } \
3134 + else \
3135 + { \
3136 + i += MBLENGTH; \
3137 + WC = uwc; \
3138 + memset (&state_wc, '\0', sizeof (mbstate_t)); \
3139 + \
3140 + MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
3141 + assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
3142 + } \
3143 + \
3144 + for (j = 0; j < MBLENGTH; j++) \
3145 + COPY[NEW_LEN++] = mbc[j]; \
3146 + } \
3147 + else \
3148 + for (j = 0; j < MBLENGTH; j++) \
3149 + COPY[NEW_LEN++] = TEXT[i++]; \
3150 + } \
3151 + COPY[NEW_LEN] = '\0'; \
3152 + } \
3153 + while (0)
3154 + IGNORE_CHARS (new_len_a, lena, texta, copy_a,
3155 + wc_a, mblength_a, state_a);
3156 + IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
3157 + wc_b, mblength_b, state_b);
3158 + diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
3159 + }
3160 + else if (lena == 0)
3161 + diff = - NONZERO (lenb);
3162 + else if (lenb == 0)
3163 + goto greater;
3164 + else
3165 + diff = xmemcoll (texta, lena, textb, lenb);
3166 + }
3167 +
3168 + if (diff)
3169 + goto not_equal;
3170 +
3171 + key = key->next;
3172 + if (! key)
3173 + break;
3174 +
3175 + /* Find the beginning and limit of the next field. */
3176 + if (key->eword != -1)
3177 + lima = limfield (a, key), limb = limfield (b, key);
3178 + else
3179 + lima = a->text + a->length - 1, limb = b->text + b->length - 1;
3180 +
3181 + if (key->sword != -1)
3182 + texta = begfield (a, key), textb = begfield (b, key);
3183 + else
3184 + {
3185 + texta = a->text, textb = b->text;
3186 + if (key->skipsblanks)
3187 + {
3188 + while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
3189 + texta += mblength_a;
3190 + while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
3191 + textb += mblength_b;
3192 + }
3193 + }
3194 + }
3195 +
3196 + return 0;
3197 +
3198 +greater:
3199 + diff = 1;
3200 +not_equal:
3201 + return key->reverse ? -diff : diff;
3202 +}
3203 +#endif
3204 +
3205 /* Compare two lines A and B, returning negative, zero, or positive
3206 depending on whether A compares less than, equal to, or greater than B. */
3207
3208 @@ -3178,7 +3747,7 @@ main (int argc, char **argv)
3209 initialize_exit_failure (SORT_FAILURE);
3210
3211 hard_LC_COLLATE = hard_locale (LC_COLLATE);
3212 -#if HAVE_NL_LANGINFO
3213 +#if HAVE_LANGINFO_CODESET
3214 hard_LC_TIME = hard_locale (LC_TIME);
3215 #endif
3216
3217 @@ -3199,6 +3768,27 @@ main (int argc, char **argv)
3218 thousands_sep = -1;
3219 }
3220
3221 +#if HAVE_MBRTOWC
3222 + if (MB_CUR_MAX > 1)
3223 + {
3224 + inittables = inittables_mb;
3225 + begfield = begfield_mb;
3226 + limfield = limfield_mb;
3227 + getmonth = getmonth_mb;
3228 + keycompare = keycompare_mb;
3229 + numcompare = numcompare_mb;
3230 + }
3231 + else
3232 +#endif
3233 + {
3234 + inittables = inittables_uni;
3235 + begfield = begfield_uni;
3236 + limfield = limfield_uni;
3237 + getmonth = getmonth_uni;
3238 + keycompare = keycompare_uni;
3239 + numcompare = numcompare_uni;
3240 + }
3241 +
3242 have_read_stdin = false;
3243 inittables ();
3244
3245 @@ -3459,13 +4049,35 @@ main (int argc, char **argv)
3246
3247 case 't':
3248 {
3249 - char newtab = optarg[0];
3250 - if (! newtab)
3251 + char newtab[MB_LEN_MAX + 1];
3252 + size_t newtab_length = 1;
3253 + strncpy (newtab, optarg, MB_LEN_MAX);
3254 + if (! newtab[0])
3255 error (SORT_FAILURE, 0, _("empty tab"));
3256 - if (optarg[1])
3257 +#if HAVE_MBRTOWC
3258 + if (MB_CUR_MAX > 1)
3259 + {
3260 + wchar_t wc;
3261 + mbstate_t state;
3262 + size_t i;
3263 +
3264 + memset (&state, '\0', sizeof (mbstate_t));
3265 + newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
3266 + MB_LEN_MAX),
3267 + &state);
3268 + switch (newtab_length)
3269 + {
3270 + case (size_t) -1:
3271 + case (size_t) -2:
3272 + case 0:
3273 + newtab_length = 1;
3274 + }
3275 + }
3276 +#endif
3277 + if (newtab_length == 1 && optarg[1])
3278 {
3279 if (STREQ (optarg, "\\0"))
3280 - newtab = '\0';
3281 + newtab[0] = '\0';
3282 else
3283 {
3284 /* Provoke with `sort -txx'. Complain about
3285 @@ -3476,9 +4088,12 @@ main (int argc, char **argv)
3286 quote (optarg));
3287 }
3288 }
3289 - if (tab != TAB_DEFAULT && tab != newtab)
3290 + if (tab_length
3291 + && (tab_length != newtab_length
3292 + || memcmp (tab, newtab, tab_length) != 0))
3293 error (SORT_FAILURE, 0, _("incompatible tabs"));
3294 - tab = newtab;
3295 + memcpy (tab, newtab, newtab_length);
3296 + tab_length = newtab_length;
3297 }
3298 break;
3299
3300 diff -urNp coreutils-8.0-orig/src/unexpand.c coreutils-8.0/src/unexpand.c
3301 --- coreutils-8.0-orig/src/unexpand.c 2009-09-29 15:27:54.000000000 +0200
3302 +++ coreutils-8.0/src/unexpand.c 2009-10-07 10:07:16.000000000 +0200
3303 @@ -38,11 +38,28 @@
3304 #include <stdio.h>
3305 #include <getopt.h>
3306 #include <sys/types.h>
3307 +
3308 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
3309 +#if HAVE_WCHAR_H
3310 +# include <wchar.h>
3311 +#endif
3312 +
3313 #include "system.h"
3314 #include "error.h"
3315 #include "quote.h"
3316 #include "xstrndup.h"
3317
3318 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3319 + installation; work around this configuration error. */
3320 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3321 +# define MB_LEN_MAX 16
3322 +#endif
3323 +
3324 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3325 +#if HAVE_MBRTOWC && defined mbstate_t
3326 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3327 +#endif
3328 +
3329 /* The official name of this program (e.g., no `g' prefix). */
3330 #define PROGRAM_NAME "unexpand"
3331
3332 @@ -102,6 +119,208 @@ static struct option const longopts[] =
3333 {NULL, 0, NULL, 0}
3334 };
3335
3336 +static FILE *next_file (FILE *fp);
3337 +
3338 +#if HAVE_MBRTOWC
3339 +static void
3340 +unexpand_multibyte (void)
3341 +{
3342 + FILE *fp; /* Input stream. */
3343 + mbstate_t i_state; /* Current shift state of the input stream. */
3344 + mbstate_t i_state_bak; /* Back up the I_STATE. */
3345 + mbstate_t o_state; /* Current shift state of the output stream. */
3346 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
3347 + char *bufpos; /* Next read position of BUF. */
3348 + size_t buflen = 0; /* The length of the byte sequence in buf. */
3349 + wint_t wc; /* The wide character just read. */
3350 + size_t mblength; /* The byte size of the multibyte character
3351 + that represents the same character as WC. */
3352 +
3353 + /* Index in `tab_list' of next tabstop: */
3354 + int tab_index = 0; /* For calculating width of pending tabs. */
3355 + int print_tab_index = 0; /* For printing as many tabs as possible. */
3356 + unsigned int column = 0; /* Column on screen of next char. */
3357 + int next_tab_column; /* Column the next tab stop is on. */
3358 + int convert = 1; /* If nonzero, perform translations. */
3359 + unsigned int pending = 0; /* Pending columns of blanks. */
3360 +
3361 + fp = next_file ((FILE *) NULL);
3362 + if (fp == NULL)
3363 + return;
3364 +
3365 + memset (&o_state, '\0', sizeof(mbstate_t));
3366 + memset (&i_state, '\0', sizeof(mbstate_t));
3367 +
3368 + for (;;)
3369 + {
3370 + if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
3371 + {
3372 + memmove (buf, bufpos, buflen);
3373 + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
3374 + bufpos = buf;
3375 + }
3376 +
3377 + /* Get a wide character. */
3378 + if (buflen < 1)
3379 + {
3380 + mblength = 1;
3381 + wc = WEOF;
3382 + }
3383 + else
3384 + {
3385 + i_state_bak = i_state;
3386 + mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &i_state);
3387 + }
3388 +
3389 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
3390 + {
3391 + i_state = i_state_bak;
3392 + wc = L'\0';
3393 + }
3394 +
3395 + if (wc == L' ' && convert && column < INT_MAX)
3396 + {
3397 + ++pending;
3398 + ++column;
3399 + }
3400 + else if (wc == L'\t' && convert)
3401 + {
3402 + if (tab_size == 0)
3403 + {
3404 + /* Do not let tab_index == first_free_tab;
3405 + stop when it is 1 less. */
3406 + while (tab_index < first_free_tab - 1
3407 + && column >= tab_list[tab_index])
3408 + tab_index++;
3409 + next_tab_column = tab_list[tab_index];
3410 + if (tab_index < first_free_tab - 1)
3411 + tab_index++;
3412 + if (column >= next_tab_column)
3413 + {
3414 + convert = 0; /* Ran out of tab stops. */
3415 + goto flush_pend_mb;
3416 + }
3417 + }
3418 + else
3419 + {
3420 + next_tab_column = column + tab_size - column % tab_size;
3421 + }
3422 + pending += next_tab_column - column;
3423 + column = next_tab_column;
3424 + }
3425 + else
3426 + {
3427 +flush_pend_mb:
3428 + /* Flush pending spaces. Print as many tabs as possible,
3429 + then print the rest as spaces. */
3430 + if (pending == 1)
3431 + {
3432 + putchar (' ');
3433 + pending = 0;
3434 + }
3435 + column -= pending;
3436 + while (pending > 0)
3437 + {
3438 + if (tab_size == 0)
3439 + {
3440 + /* Do not let print_tab_index == first_free_tab;
3441 + stop when it is 1 less. */
3442 + while (print_tab_index < first_free_tab - 1
3443 + && column >= tab_list[print_tab_index])
3444 + print_tab_index++;
3445 + next_tab_column = tab_list[print_tab_index];
3446 + if (print_tab_index < first_free_tab - 1)
3447 + print_tab_index++;
3448 + }
3449 + else
3450 + {
3451 + next_tab_column =
3452 + column + tab_size - column % tab_size;
3453 + }
3454 + if (next_tab_column - column <= pending)
3455 + {
3456 + putchar ('\t');
3457 + pending -= next_tab_column - column;
3458 + column = next_tab_column;
3459 + }
3460 + else
3461 + {
3462 + --print_tab_index;
3463 + column += pending;
3464 + while (pending != 0)
3465 + {
3466 + putchar (' ');
3467 + pending--;
3468 + }
3469 + }
3470 + }
3471 +
3472 + if (wc == WEOF)
3473 + {
3474 + fp = next_file (fp);
3475 + if (fp == NULL)
3476 + break; /* No more files. */
3477 + else
3478 + {
3479 + memset (&i_state, '\0', sizeof(mbstate_t));
3480 + continue;
3481 + }
3482 + }
3483 +
3484 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
3485 + {
3486 + if (convert)
3487 + {
3488 + ++column;
3489 + if (convert_entire_line == 0)
3490 + convert = 0;
3491 + }
3492 + mblength = 1;
3493 + putchar (buf[0]);
3494 + }
3495 + else if (mblength == 0)
3496 + {
3497 + if (convert && convert_entire_line == 0)
3498 + convert = 0;
3499 + mblength = 1;
3500 + putchar ('\0');
3501 + }
3502 + else
3503 + {
3504 + if (convert)
3505 + {
3506 + if (wc == L'\b')
3507 + {
3508 + if (column > 0)
3509 + --column;
3510 + }
3511 + else
3512 + {
3513 + int width; /* The width of WC. */
3514 +
3515 + width = wcwidth (wc);
3516 + column += (width > 0) ? width : 0;
3517 + if (convert_entire_line == 0)
3518 + convert = 0;
3519 + }
3520 + }
3521 +
3522 + if (wc == L'\n')
3523 + {
3524 + tab_index = print_tab_index = 0;
3525 + column = pending = 0;
3526 + convert = 1;
3527 + }
3528 + fwrite (bufpos, sizeof(char), mblength, stdout);
3529 + }
3530 + }
3531 + buflen -= mblength;
3532 + bufpos += mblength;
3533 + }
3534 +}
3535 +#endif
3536 +
3537 +
3538 void
3539 usage (int status)
3540 {
3541 @@ -523,7 +742,12 @@ main (int argc, char **argv)
3542
3543 file_list = (optind < argc ? &argv[optind] : stdin_argv);
3544
3545 - unexpand ();
3546 +#if HAVE_MBRTOWC
3547 + if (MB_CUR_MAX > 1)
3548 + unexpand_multibyte ();
3549 + else
3550 +#endif
3551 + unexpand ();
3552
3553 if (have_read_stdin && fclose (stdin) != 0)
3554 error (EXIT_FAILURE, errno, "-");
3555 diff -urNp coreutils-8.0-orig/src/uniq.c coreutils-8.0/src/uniq.c
3556 --- coreutils-8.0-orig/src/uniq.c 2009-09-23 10:25:44.000000000 +0200
3557 +++ coreutils-8.0/src/uniq.c 2009-10-07 10:07:16.000000000 +0200
3558 @@ -22,6 +22,16 @@
3559 #include <getopt.h>
3560 #include <sys/types.h>
3561
3562 +/* Get mbstate_t, mbrtowc(). */
3563 +#if HAVE_WCHAR_H
3564 +# include <wchar.h>
3565 +#endif
3566 +
3567 +/* Get isw* functions. */
3568 +#if HAVE_WCTYPE_H
3569 +# include <wctype.h>
3570 +#endif
3571 +
3572 #include "system.h"
3573 #include "argmatch.h"
3574 #include "linebuffer.h"
3575 @@ -31,7 +41,19 @@
3576 #include "stdio--.h"
3577 #include "xmemcoll.h"
3578 #include "xstrtol.h"
3579 -#include "memcasecmp.h"
3580 +#include "xmemcoll.h"
3581 +
3582 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3583 + installation; work around this configuration error. */
3584 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3585 +# define MB_LEN_MAX 16
3586 +#endif
3587 +
3588 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3589 +#if HAVE_MBRTOWC && defined mbstate_t
3590 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3591 +#endif
3592 +
3593
3594 /* The official name of this program (e.g., no `g' prefix). */
3595 #define PROGRAM_NAME "uniq"
3596 @@ -107,6 +129,10 @@ static enum delimit_method const delimit
3597 /* Select whether/how to delimit groups of duplicate lines. */
3598 static enum delimit_method delimit_groups;
3599
3600 +/* Function pointers. */
3601 +static char *
3602 +(*find_field) (struct linebuffer *line);
3603 +
3604 static struct option const longopts[] =
3605 {
3606 {"count", no_argument, NULL, 'c'},
3607 @@ -206,7 +232,7 @@ size_opt (char const *opt, char const *m
3608 return a pointer to the beginning of the line's field to be compared. */
3609
3610 static char *
3611 -find_field (struct linebuffer const *line)
3612 +find_field_uni (struct linebuffer *line)
3613 {
3614 size_t count;
3615 char const *lp = line->buffer;
3616 @@ -227,6 +253,83 @@ find_field (struct linebuffer const *lin
3617 return line->buffer + i;
3618 }
3619
3620 +#if HAVE_MBRTOWC
3621 +
3622 +/* Decode the multibyte character at LP + POS into WC and its byte length into
3623 + MBLENGTH. On failure restore *STATEP and set CONVFAIL; length is never below 1. */
3624 +# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
3625 + do \
3626 + { \
3627 + mbstate_t state_bak = *(STATEP); \
3628 + \
3629 + (CONVFAIL) = 0; \
3630 + (MBLENGTH) = mbrtowc (&(WC), (LP) + (POS), (SIZE) - (POS), (STATEP)); \
3631 + \
3632 + switch (MBLENGTH) \
3633 + { \
3634 + case (size_t)-2: \
3635 + case (size_t)-1: \
3636 + *(STATEP) = state_bak; \
3637 + (CONVFAIL)++; \
3638 + /* Fall through */ \
3639 + case 0: \
3640 + (MBLENGTH) = 1; \
3641 + } \
3642 + } \
3643 + while (0)
3644 +
3645 +static char *
3646 +find_field_multi (struct linebuffer *line) /* Multibyte variant of find_field_uni. */
3647 +{
3648 + size_t count;
3649 + char *lp = line->buffer;
3650 + size_t size = line->length - 1; /* Last byte is excluded (presumably the line delimiter). */
3651 + size_t pos;
3652 + size_t mblength;
3653 + wchar_t wc;
3654 + mbstate_t *statep;
3655 + int convfail;
3656 +
3657 + pos = 0;
3658 + statep = &(line->state);
3659 +
3660 + /* Skip SKIP_FIELDS fields: each is a run of blanks followed by non-blanks. */
3661 + for (count = 0; count < skip_fields && pos < size; count++)
3662 + {
3663 + while (pos < size) /* Consume blanks up to and including the first non-blank. */
3664 + {
3665 + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3666 +
3667 + if (convfail || !iswblank (wc)) /* An undecodable byte counts as non-blank. */
3668 + {
3669 + pos += mblength;
3670 + break;
3671 + }
3672 + pos += mblength;
3673 + }
3674 +
3675 + while (pos < size) /* Consume the rest of the field; stop at the next blank. */
3676 + {
3677 + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3678 +
3679 + if (!convfail && iswblank (wc))
3680 + break;
3681 +
3682 + pos += mblength;
3683 + }
3684 + }
3685 +
3686 + /* Then skip SKIP_CHARS characters; an undecodable byte counts as one. */
3687 + for (count = 0; count < skip_chars && pos < size; count++)
3688 + {
3689 + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3690 + pos += mblength;
3691 + }
3692 +
3693 + return lp + pos;
3694 +}
3695 +#endif
3696 +
3697 /* Return false if two strings OLD and NEW match, true if not.
3698 OLD and NEW point not to the beginnings of the lines
3699 but rather to the beginnings of the fields to compare.
3700 @@ -235,6 +338,8 @@ find_field (struct linebuffer const *lin
3701 static bool
3702 different (char *old, char *new, size_t oldlen, size_t newlen)
3703 {
3704 + char *copy_old, *copy_new;
3705 +
3706 if (check_chars < oldlen)
3707 oldlen = check_chars;
3708 if (check_chars < newlen)
3709 @@ -242,14 +347,92 @@ different (char *old, char *new, size_t
3710
3711 if (ignore_case)
3712 {
3713 - /* FIXME: This should invoke strcoll somehow. */
3714 - return oldlen != newlen || memcasecmp (old, new, oldlen);
3715 + size_t i;
3716 +
3717 + copy_old = alloca (oldlen + 1);
3718 + copy_new = alloca (newlen + 1);
3719 +
3720 + /* OLDLEN and NEWLEN may differ, so fold each buffer over its own length. */
3721 + for (i = 0; i < oldlen; i++)
3722 + copy_old[i] = toupper ((unsigned char) old[i]);
3723 + for (i = 0; i < newlen; i++)
3724 + copy_new[i] = toupper ((unsigned char) new[i]);
3725 }
3726 - else if (hard_LC_COLLATE)
3727 - return xmemcoll (old, oldlen, new, newlen) != 0;
3728 else
3729 - return oldlen != newlen || memcmp (old, new, oldlen);
3730 + {
3731 + copy_old = (char *)old;
3732 + copy_new = (char *)new;
3733 + }
3734 +
3735 + return xmemcoll (copy_old, oldlen, copy_new, newlen);
3736 +}
3737 +
3738 +#if HAVE_MBRTOWC
3739 +/* Multibyte variant of different(): compare at most CHECK_CHARS characters of
3740 + OLD and NEW with xmemcoll, upper-casing them first when IGNORE_CASE is set. */
3741 +static int
3742 +different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
3743 +{
3744 + size_t i, j, chars, mblength;
3745 + const char *str[2];
3746 + char *copy[2];
3747 + size_t len[2];
3748 + mbstate_t state[2], state_bak;
3749 + wchar_t wc, uwc;
3750 +
3751 + str[0] = old;
3752 + str[1] = new;
3753 + len[0] = oldlen;
3754 + len[1] = newlen;
3755 + state[0] = oldstate;
3756 + state[1] = newstate;
3757 +
3758 + for (i = 0; i < 2; i++)
3759 + {
3760 + copy[i] = alloca (len[i] + 1);
3761 +
3762 + for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
3763 + {
3764 + state_bak = state[i];
3765 + mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
3766 +
3767 + switch (mblength)
3768 + {
3769 + case (size_t)-1:
3770 + case (size_t)-2:
3771 + state[i] = state_bak;
3772 + /* Fall through */
3773 + case 0:
3774 + mblength = 1;
3775 + uwc = wc = L'\0'; /* Nothing to case-fold for a raw byte. */
3776 + break;
3777 + default:
3778 + uwc = ignore_case ? towupper (wc) : wc;
3779 + }
3780 +
3781 + /* Always copy the source bytes first, so invalid bytes and NULs
3782 + are never left uninitialized in COPY. */
3783 + memcpy (copy[i] + j, str[i] + j, mblength);
3784 + if (uwc != wc)
3785 + {
3786 + char ubuf[MB_LEN_MAX];
3787 + mbstate_t state_wc;
3788 +
3789 + /* Substitute the upper-case form only if it has the same byte
3790 + length; a longer one could overrun COPY, a shorter one leaves a gap. */
3791 + memset (&state_wc, '\0', sizeof(mbstate_t));
3792 + if (wcrtomb (ubuf, uwc, &state_wc) == mblength)
3793 + memcpy (copy[i] + j, ubuf, mblength);
3794 + }
3795 + j += mblength;
3796 + }
3797 + copy[i][j] = '\0';
3798 + len[i] = j;
3799 + }
3800 +
3801 + return xmemcoll (copy[0], len[0], copy[1], len[1]);
3802 }
3803 +#endif
3804
3805 /* Output the line in linebuffer LINE to standard output
3806 provided that the switches say it should be output.
3807 @@ -303,15 +486,43 @@ check_file (const char *infile, const ch
3808 {
3809 char *prevfield IF_LINT (= NULL);
3810 size_t prevlen IF_LINT (= 0);
3811 +#if HAVE_MBRTOWC
3812 + mbstate_t prevstate;
3813 +
3814 + memset (&prevstate, '\0', sizeof (mbstate_t));
3815 +#endif
3816
3817 while (!feof (stdin))
3818 {
3819 char *thisfield;
3820 size_t thislen;
3821 +#if HAVE_MBRTOWC
3822 + mbstate_t thisstate;
3823 +#endif
3824 +
3825 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
3826 break;
3827 thisfield = find_field (thisline);
3828 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3829 +#if HAVE_MBRTOWC
3830 + if (MB_CUR_MAX > 1)
3831 + {
3832 + thisstate = thisline->state;
3833 +
3834 + if (prevline->length == 0 || different_multi
3835 + (thisfield, prevfield, thislen, prevlen, thisstate, prevstate))
3836 + {
3837 + fwrite (thisline->buffer, sizeof (char),
3838 + thisline->length, stdout);
3839 +
3840 + SWAP_LINES (prevline, thisline);
3841 + prevfield = thisfield;
3842 + prevlen = thislen;
3843 + prevstate = thisstate;
3844 + }
3845 + }
3846 + else
3847 +#endif
3848 if (prevline->length == 0
3849 || different (thisfield, prevfield, thislen, prevlen))
3850 {
3851 @@ -330,17 +541,26 @@ check_file (const char *infile, const ch
3852 size_t prevlen;
3853 uintmax_t match_count = 0;
3854 bool first_delimiter = true;
3855 +#if HAVE_MBRTOWC
3856 + mbstate_t prevstate;
3857 +#endif
3858
3859 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
3860 goto closefiles;
3861 prevfield = find_field (prevline);
3862 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
3863 +#if HAVE_MBRTOWC
3864 + prevstate = prevline->state;
3865 +#endif
3866
3867 while (!feof (stdin))
3868 {
3869 bool match;
3870 char *thisfield;
3871 size_t thislen;
3872 +#if HAVE_MBRTOWC
3873 + mbstate_t thisstate;
3874 +#endif
3875 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
3876 {
3877 if (ferror (stdin))
3878 @@ -349,6 +569,15 @@ check_file (const char *infile, const ch
3879 }
3880 thisfield = find_field (thisline);
3881 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3882 +#if HAVE_MBRTOWC
3883 + if (MB_CUR_MAX > 1)
3884 + {
3885 + thisstate = thisline->state;
3886 + match = !different_multi (thisfield, prevfield,
3887 + thislen, prevlen, thisstate, prevstate);
3888 + }
3889 + else
3890 +#endif
3891 match = !different (thisfield, prevfield, thislen, prevlen);
3892 match_count += match;
3893
3894 @@ -381,6 +610,9 @@ check_file (const char *infile, const ch
3895 SWAP_LINES (prevline, thisline);
3896 prevfield = thisfield;
3897 prevlen = thislen;
3898 +#if HAVE_MBRTOWC
3899 + prevstate = thisstate;
3900 +#endif
3901 if (!match)
3902 match_count = 0;
3903 }
3904 @@ -426,6 +658,19 @@ main (int argc, char **argv)
3905
3906 atexit (close_stdout);
3907
3908 +#if HAVE_MBRTOWC
3909 + if (MB_CUR_MAX > 1)
3910 + {
3911 + find_field = find_field_multi;
3912 + }
3913 + else
3914 +#endif
3915 + {
3916 + find_field = find_field_uni;
3917 + }
3918 +
3919 +
3920 +
3921 skip_chars = 0;
3922 skip_fields = 0;
3923 check_chars = SIZE_MAX;
3924 diff -urNp coreutils-8.0-orig/tests/Makefile.in coreutils-8.0/tests/Makefile.in
3925 --- coreutils-8.0-orig/tests/Makefile.in 2009-09-29 16:25:44.000000000 +0200
3926 +++ coreutils-8.0/tests/Makefile.in 2009-10-07 10:07:16.000000000 +0200
3927 @@ -1126,6 +1126,7 @@ TESTS = \
3928 misc/sort-compress \
3929 misc/sort-continue \
3930 misc/sort-files0-from \
3931 + misc/sort-mb-tests \
3932 misc/sort-merge \
3933 misc/sort-merge-fdlimit \
3934 misc/sort-rand \
3935 @@ -1582,6 +1582,10 @@ TESTS = \
3936 $(root_tests)
3937
3938 pr_data = \
3939 + misc/mb1.X \
3940 + misc/mb1.I \
3941 + misc/mb2.X \
3942 + misc/mb2.I \
3943 pr/0F \
3944 pr/0FF \
3945 pr/0FFnt \
3946 diff -urNp coreutils-8.0-orig/tests/Makefile.am coreutils-8.0/tests/Makefile.am
3947 --- coreutils-8.0-orig/tests/Makefile.am 2009-09-29 16:25:44.000000000 +0200
3948 +++ coreutils-8.0/tests/Makefile.am 2009-10-07 10:07:16.000000000 +0200
3949 @@ -208,6 +208,7 @@ TESTS = \
3950 misc/sort-compress \
3951 misc/sort-continue \
3952 misc/sort-files0-from \
3953 + misc/sort-mb-tests \
3954 misc/sort-merge \
3955 misc/sort-merge-fdlimit \
3956 misc/sort-rand \
3957 @@ -452,6 +453,10 @@ TESTS = \
3958 $(root_tests)
3959
3960 pr_data = \
3961 + misc/mb1.X \
3962 + misc/mb1.I \
3963 + misc/mb2.X \
3964 + misc/mb2.I \
3965 pr/0F \
3966 pr/0FF \
3967 pr/0FFnt \
3968 diff -urNp coreutils-8.0-orig/tests/misc/cut coreutils-8.0/tests/misc/cut
3969 --- coreutils-8.0-orig/tests/misc/cut 2009-09-21 14:29:33.000000000 +0200
3970 +++ coreutils-8.0/tests/misc/cut 2009-10-07 10:07:16.000000000 +0200
3971 @@ -26,7 +26,7 @@ use strict;
3972 my $prog = 'cut';
3973 my $try = "Try \`$prog --help' for more information.\n";
3974 my $from_1 = "$prog: fields and positions are numbered from 1\n$try";
3975 -my $inval = "$prog: invalid byte or field list\n$try";
3976 +my $inval = "$prog: invalid byte, character or field list\n$try";
3977 my $no_endpoint = "$prog: invalid range with no endpoint: -\n$try";
3978
3979 my @Tests =
3980 @@ -141,7 +141,7 @@ my @Tests =
3981
3982 # None of the following invalid ranges provoked an error up to coreutils-6.9.
3983 ['inval1', qw(-f 2-0), {IN=>''}, {OUT=>''}, {EXIT=>1},
3984 - {ERR=>"$prog: invalid decreasing range\n$try"}],
3985 + {ERR=>"$prog: invalid byte, character or field list\n$try"}],
3986 ['inval2', qw(-f -), {IN=>''}, {OUT=>''}, {EXIT=>1}, {ERR=>$no_endpoint}],
3987 ['inval3', '-f', '4,-', {IN=>''}, {OUT=>''}, {EXIT=>1}, {ERR=>$no_endpoint}],
3988 ['inval4', '-f', '1-2,-', {IN=>''}, {OUT=>''}, {EXIT=>1}, {ERR=>$no_endpoint}],
3989 diff -urNp coreutils-8.0-orig/tests/misc/mb1.I coreutils-8.0/tests/misc/mb1.I
3990 --- coreutils-8.0-orig/tests/misc/mb1.I 1970-01-01 01:00:00.000000000 +0100
3991 +++ coreutils-8.0/tests/misc/mb1.I 2009-10-07 10:07:16.000000000 +0200
3992 @@ -0,0 +1,4 @@
3993 +Apple@10
3994 +Banana@5
3995 +Citrus@20
3996 +Cherry@30
3997 diff -urNp coreutils-8.0-orig/tests/misc/mb1.X coreutils-8.0/tests/misc/mb1.X
3998 --- coreutils-8.0-orig/tests/misc/mb1.X 1970-01-01 01:00:00.000000000 +0100
3999 +++ coreutils-8.0/tests/misc/mb1.X 2009-10-07 10:07:16.000000000 +0200
4000 @@ -0,0 +1,4 @@
4001 +Banana@5
4002 +Apple@10
4003 +Citrus@20
4004 +Cherry@30
4005 diff -urNp coreutils-8.0-orig/tests/misc/mb2.I coreutils-8.0/tests/misc/mb2.I
4006 --- coreutils-8.0-orig/tests/misc/mb2.I 1970-01-01 01:00:00.000000000 +0100
4007 +++ coreutils-8.0/tests/misc/mb2.I 2009-10-07 10:07:16.000000000 +0200
4008 @@ -0,0 +1,4 @@
4009 +Apple@AA10@@20
4010 +Banana@AA5@@30
4011 +Citrus@AA20@@5
4012 +Cherry@AA30@@10
4013 diff -urNp coreutils-8.0-orig/tests/misc/mb2.X coreutils-8.0/tests/misc/mb2.X
4014 --- coreutils-8.0-orig/tests/misc/mb2.X 1970-01-01 01:00:00.000000000 +0100
4015 +++ coreutils-8.0/tests/misc/mb2.X 2009-10-07 10:07:16.000000000 +0200
4016 @@ -0,0 +1,4 @@
4017 +Citrus@AA20@@5
4018 +Cherry@AA30@@10
4019 +Apple@AA10@@20
4020 +Banana@AA5@@30
4021 diff -urNp coreutils-8.0-orig/tests/misc/sort-mb-tests coreutils-8.0/tests/misc/sort-mb-tests
4022 --- coreutils-8.0-orig/tests/misc/sort-mb-tests 1970-01-01 01:00:00.000000000 +0100
4023 +++ coreutils-8.0/tests/misc/sort-mb-tests 2009-10-07 10:07:16.000000000 +0200
4024 @@ -0,0 +1,58 @@
4025 +#! /bin/sh
4026 +case $# in
4027 + 0) xx='../src/sort';;
4028 + *) xx="$1";;
4029 +esac
4030 +test "$VERBOSE" && echo=echo || echo=:
4031 +$echo testing program: $xx
4032 +errors=0
4033 +test "$srcdir" || srcdir=.
4034 +test "$VERBOSE" && $xx --version 2> /dev/null
4035 +
4036 +# Skip (exit 77) unless the requested locale really provides a UTF-8 charmap.
4037 +export LC_ALL=en_US.UTF-8
4038 +locale -k LC_CTYPE 2>&1 | grep -q 'charmap.*UTF-8' || exit 77
4039 +
4040 +$xx -t @ -k2 -n $srcdir/misc/mb1.I > misc/mb1.O
4041 +code=$?
4042 +if test $code != 0; then
4043 + $echo "Test mb1 failed: $xx return code $code differs from expected value 0" 1>&2
4044 + errors=`expr $errors + 1`
4045 +else
4046 + cmp misc/mb1.O $srcdir/misc/mb1.X > /dev/null 2>&1
4047 + case $? in
4048 + 0) if test "$VERBOSE"; then $echo "passed mb1"; fi;;
4049 + 1) $echo "Test mb1 failed: files misc/mb1.O and $srcdir/misc/mb1.X differ" 1>&2
4050 + (diff -c misc/mb1.O $srcdir/misc/mb1.X) 2> /dev/null
4051 + errors=`expr $errors + 1`;;
4052 + 2) $echo "Test mb1 may have failed." 1>&2
4053 + $echo The command "cmp misc/mb1.O $srcdir/misc/mb1.X" failed. 1>&2
4054 + errors=`expr $errors + 1`;;
4055 + esac
4056 +fi
4057 +
4058 +$xx -t @ -k4 -n $srcdir/misc/mb2.I > misc/mb2.O
4059 +code=$?
4060 +if test $code != 0; then
4061 + $echo "Test mb2 failed: $xx return code $code differs from expected value 0" 1>&2
4062 + errors=`expr $errors + 1`
4063 +else
4064 + cmp misc/mb2.O $srcdir/misc/mb2.X > /dev/null 2>&1
4065 + case $? in
4066 + 0) if test "$VERBOSE"; then $echo "passed mb2"; fi;;
4067 + 1) $echo "Test mb2 failed: files misc/mb2.O and $srcdir/misc/mb2.X differ" 1>&2
4068 + (diff -c misc/mb2.O $srcdir/misc/mb2.X) 2> /dev/null
4069 + errors=`expr $errors + 1`;;
4070 + 2) $echo "Test mb2 may have failed." 1>&2
4071 + $echo The command "cmp misc/mb2.O $srcdir/misc/mb2.X" failed. 1>&2
4072 + errors=`expr $errors + 1`;;
4073 + esac
4074 +fi
4075 +
4076 +if test $errors = 0; then
4077 + $echo Passed all 2 tests. 1>&2
4078 +else
4079 + $echo Failed $errors tests. 1>&2
4080 +fi
4081 +test $errors = 0 || errors=1
4082 +exit $errors