Bug Summary

File:Source/Additions/Unicode.m
Location:line 2554, column 11
Description:Potential leak of an object allocated on line 2553
Code is compiled to use reference counts

Annotated Source Code

1/** Support functions for Unicode implementation
2 Function to determine default c string encoding for
3 GNUstep based on GNUSTEP_STRING_ENCODING environment variable.
4
5 Copyright (C) 1997 Free Software Foundation, Inc.
6
7 Written by: Stevo Crvenkovski < stevo@btinternet.com >
8 Date: March 1997
9 Merged with GetDefEncoding.m and iconv by: Fred Kiefer <fredkiefer@gmx.de>
10 Date: September 2000
11 Rewrite by: Richard Frith-Macdonald <rfm@gnu.org>
12
13 This file is part of the GNUstep Base Library.
14
15 This library is free software; you can redistribute it and/or
16 modify it under the terms of the GNU Lesser General Public
17 License as published by the Free Software Foundation; either
18 version 2 of the License, or (at your option) any later version.
19
20 This library is distributed in the hope that it will be useful,
21 but WITHOUT ANY WARRANTY; without even the implied warranty of
22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 Library General Public License for more details.
24
25 You should have received a copy of the GNU Lesser General Public
26 License along with this library; if not, write to the Free
27 Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
28 Boston, MA 02111 USA.
29*/
30
31
32#import "common.h"
33#if !defined(NeXT_Foundation_LIBRARY)
34#import "Foundation/NSArray.h"
35#import "Foundation/NSDictionary.h"
36#import "Foundation/NSError.h"
37#import "Foundation/NSException.h"
38#import "Foundation/NSLock.h"
39#import "Foundation/NSPathUtilities.h"
40#endif
41
42#import "GNUstepBase/GSLock.h"
43#import "GNUstepBase/GSMime.h"
44#import "GNUstepBase/NSLock+GNUstepBase.h"
45#import "GNUstepBase/Unicode.h"
46
47#import "../GSPrivate.h"
48
49#include <stdio.h>
50#include <string.h>
51
52#if HAVE_LANGINFO_CODESET1
53#include <langinfo.h>
54#endif
55
56typedef struct {unichar from; unsigned char to;} _ucc_;
57
58#include "unicode/cyrillic.h"
59#include "unicode/latin2.h"
60#include "unicode/latin9.h"
61#include "unicode/nextstep.h"
62#include "unicode/caseconv.h"
63#include "unicode/cop.h"
64#include "unicode/decomp.h"
65#include "unicode/gsm0338.h"
66#include "unicode/thai.h"
67
68#ifdef HAVE_ICONV1
69#ifdef HAVE_GICONV_H
70#include <giconv.h>
71#else
72#include <iconvlibiconv.h>
73#endif
74
75/*
76 * The whole of the GNUstep code stores UNICODE in internal byte order,
77 * so we do the same. We have switched to using UTF16 so the defines here
78 * recognise this. We use the endian specific versions of UTF16 so that
79 * iconv does not introduce a BOM where we do not want it.
80 * If UTF16 does not work, we revert to UCS-2-INTERNAL.
81 */
82#ifdef WORDS_BIGENDIAN
83#define UNICODE_UTF16"UTF-16LE" "UTF-16BE"
84#define UNICODE_UTF32"UTF-32LE" "UTF-32BE"
85#define UNICODE_INT"UNICODELITTLE" "UNICODEBIG"
86#else
87#define UNICODE_UTF16"UTF-16LE" "UTF-16LE"
88#define UNICODE_UTF32"UTF-32LE" "UTF-32LE"
89#define UNICODE_INT"UNICODELITTLE" "UNICODELITTLE"
90#endif
91
92#define UNICODE_ENC((unicode_enc) ? unicode_enc : internal_unicode_enc()) ((unicode_enc) ? unicode_enc : internal_unicode_enc())
93
94static const char *unicode_enc = NULL((void *)0);
95
96/* Check to see what type of internal unicode format the library supports */
97static const char *
98internal_unicode_enc(void)
99{
100 iconv_tlibiconv_t conv;
101
102 unicode_enc = UNICODE_UTF16"UTF-16LE";
103 conv = iconv_openlibiconv_open(unicode_enc, "ASCII");
104 if (conv != (iconv_tlibiconv_t)-1)
105 {
106 iconv_closelibiconv_close(conv);
107 return unicode_enc;
108 }
109 fprintf(stderr__stderrp, "Could not initialise iconv() for UTF16, using UCS-2\n");
110 fprintf(stderr__stderrp, "Using characters outside 16 bits may give bad results.\n");
111
112 unicode_enc = UNICODE_INT"UNICODELITTLE";
113 conv = iconv_openlibiconv_open(unicode_enc, "ASCII");
114 if (conv != (iconv_tlibiconv_t)-1)
115 {
116 iconv_closelibiconv_close(conv);
117 return unicode_enc;
118 }
119 unicode_enc = "UCS-2-INTERNAL";
120 conv = iconv_openlibiconv_open(unicode_enc, "ASCII");
121 if (conv != (iconv_tlibiconv_t)-1)
122 {
123 iconv_closelibiconv_close(conv);
124 return unicode_enc;
125 }
126 unicode_enc = "UCS-2";
127 /* This had better work */
128 return unicode_enc;
129}
130
131#else
132#define UNICODE_UTF32"UTF-32LE" ""
133#endif
134
135static GSLazyLock *local_lock = nil((id)((void*)0));
136
137typedef unsigned char unc;
138static NSStringEncoding defEnc = GSUndefinedEncoding;
139static NSStringEncoding natEnc = GSUndefinedEncoding;
140static NSStringEncoding *_availableEncodings = 0;
141
142struct _strenc_ {
143 NSStringEncoding enc; // Constant representing the encoding.
144 const char *ename; // ASCII string representation of name.
145 const char *iconvlibiconv; /* Iconv name of encoding. If this
146 * is the empty string, we cannot use
147 * iconv perform conversions to/from
148 * this encoding.
149 * NB. do not put a null pointer in this
150 * field in the table, use "" instread.
151 */
152 BOOL eightBit; /* Flag to say whether this encoding
153 * can be stored in a byte array ...
154 * ie whether the encoding consists
155 * entirely of single byte characters
156 * and the first 128 are identical to
157 * the ASCII character set.
158 */
159 char supported; /* Is this supported? Some encodings
160 * have builtin conversion to/from
161 * unicode, but for others we must
162 * check with iconv to see if it
163 * supports them on this platform.
164 * A one means supported.
165 * A negative means unsupported.
166 * A zero means not yet checked.
167 */
168 const char *lossy; /* Iconv name for lossy encoding */
169};
170
171/*
172 * The str_encoding_table is a compact representation of all the string
173 * encoding information we might need. It gets modified at runtime.
174 */
175static struct _strenc_ str_encoding_table[] = {
176 {NSASCIIStringEncoding,
177 "NSASCIIStringEncoding","ASCII",1,1,0},
178 {NSNEXTSTEPStringEncoding,
179 "NSNEXTSTEPStringEncoding","NEXTSTEP",1,1,0},
180 {NSJapaneseEUCStringEncoding,
181 "NSJapaneseEUCStringEncoding","EUC-JP",0,0,0},
182 {NSUTF8StringEncoding,
183 "NSUTF8StringEncoding","UTF-8",0,1,0},
184 {NSISOLatin1StringEncoding,
185 "NSISOLatin1StringEncoding","ISO-8859-1",1,1,0},
186 {NSSymbolStringEncoding,
187 "NSSymbolStringEncoding","",0,0,0},
188 {NSNonLossyASCIIStringEncoding,
189 "NSNonLossyASCIIStringEncoding","",1,1,0},
190 {NSShiftJISStringEncoding,
191 "NSShiftJISStringEncoding","SHIFT-JIS",0,0,0},
192 {NSISOLatin2StringEncoding,
193 "NSISOLatin2StringEncoding","ISO-8859-2",1,1,0},
194 {NSUnicodeStringEncoding,
195 "NSUnicodeStringEncoding","",0,1,0},
196 {NSWindowsCP1251StringEncoding,
197 "NSWindowsCP1251StringEncoding","CP1251",0,0,0},
198 {NSWindowsCP1252StringEncoding,
199 "NSWindowsCP1252StringEncoding","CP1252",0,0,0},
200 {NSWindowsCP1253StringEncoding,
201 "NSWindowsCP1253StringEncoding","CP1253",0,0,0},
202 {NSWindowsCP1254StringEncoding,
203 "NSWindowsCP1254StringEncoding","CP1254",0,0,0},
204 {NSWindowsCP1250StringEncoding,
205 "NSWindowsCP1250StringEncoding","CP1250",0,0,0},
206 {NSISO2022JPStringEncoding,
207 "NSISO2022JPStringEncoding","ISO-2022-JP",0,0,0},
208 {NSMacOSRomanStringEncoding,
209 "NSMacOSRomanStringEncoding","MACINTOSH",0,0,0},
210#if defined(GNUSTEP1)
211 {NSProprietaryStringEncoding,
212 "NSProprietaryStringEncoding","",0,0,0},
213#endif
214
215// GNUstep additions
216 {NSISOCyrillicStringEncoding,
217 "NSISOCyrillicStringEncoding","ISO-8859-5",0,1,0},
218 {NSKOI8RStringEncoding,
219 "NSKOI8RStringEncoding","KOI8-R",0,0,0},
220 {NSISOLatin3StringEncoding,
221 "NSISOLatin3StringEncoding","ISO-8859-3",0,0,0},
222 {NSISOLatin4StringEncoding,
223 "NSISOLatin4StringEncoding","ISO-8859-4",0,0,0},
224 {NSISOArabicStringEncoding,
225 "NSISOArabicStringEncoding","ISO-8859-6",0,0,0},
226 {NSISOGreekStringEncoding,
227 "NSISOGreekStringEncoding","ISO-8859-7",0,0,0},
228 {NSISOHebrewStringEncoding,
229 "NSISOHebrewStringEncoding","ISO-8859-8",0,0,0},
230 {NSISOLatin5StringEncoding,
231 "NSISOLatin5StringEncoding","ISO-8859-9",0,0,0},
232 {NSISOLatin6StringEncoding,
233 "NSISOLatin6StringEncoding","ISO-8859-10",0,0,0},
234 {NSISOThaiStringEncoding,
235 "NSISOThaiStringEncoding","ISO-8859-11",1,1,0},
236 {NSISOLatin7StringEncoding,
237 "NSISOLatin7StringEncoding","ISO-8859-13",0,0,0},
238 {NSISOLatin8StringEncoding,
239 "NSISOLatin8StringEncoding","ISO-8859-14",0,0,0},
240 {NSISOLatin9StringEncoding,
241 "NSISOLatin9StringEncoding","ISO-8859-15",1,1,0},
242 {NSUTF7StringEncoding,
243 "NSUTF7StringEncoding","UTF-7",0,0,0},
244 {NSGB2312StringEncoding,
245 "NSGB2312StringEncoding","EUC-CN",0,0,0},
246 {NSGSM0338StringEncoding,
247 "NSGSM0338StringEncoding","",0,1,0},
248 {NSBIG5StringEncoding,
249 "NSBIG5StringEncoding","BIG5",0,0,0},
250 {NSKoreanEUCStringEncoding,
251 "NSKoreanEUCStringEncoding","EUC-KR",0,0,0},
252
253/* Now Apple encodings which have high numeric values.
254 */
255 {NSUTF16BigEndianStringEncoding,
256 "NSUTF16BigEndianStringEncoding","UTF-16BE",0,0,0},
257 {NSUTF16LittleEndianStringEncoding,
258 "NSUTF16LittleEndianStringEncoding","UTF-16LE",0,0,0},
259 {NSUTF32StringEncoding,
260 "NSUTF32StringEncoding",UNICODE_UTF32"UTF-32LE",0,0,0},
261 {NSUTF32BigEndianStringEncoding,
262 "NSUTF32BigEndianStringEncoding","UTF-32BE",0,0,0},
263 {NSUTF32LittleEndianStringEncoding,
264 "NSUTF32LittleEndianStringEncoding","UTF-32LE",0,0,0},
265
266 {0,"Unknown encoding","",0,0,0}
267};
268
269static struct _strenc_ **encodingTable = 0;
270static unsigned encTableSize = 0;
271
272static void GSSetupEncodingTable(void)
273{
274 if (encodingTable == 0)
275 {
276 [GS_INITIALIZED_LOCK(local_lock, GSLazyLock)(local_lock != ((id)((void*)0)) ? (id)local_lock : (id)[GSLazyLock
newLockAt: &local_lock])
lock];
277 if (encodingTable == 0)
278 {
279 static struct _strenc_ **encTable = 0;
280 unsigned count;
281 unsigned i;
282
283 /*
284 * We want to store pointers to our string encoding info in a
285 * large table so we can do efficient lookup by encoding value.
286 */
287#define MAX_ENCODING128 128
288 count = sizeof(str_encoding_table) / sizeof(struct _strenc_);
289
290 /*
291 * First determine the largest encoding value and create a
292 * large enough table of pointers.
293 */
294 encTableSize = 0;
295 for (i = 0; i < count; i++)
296 {
297 unsigned tmp = str_encoding_table[i].enc;
298
299 if (tmp > encTableSize)
300 {
301 if (tmp < MAX_ENCODING128)
302 {
303 encTableSize = tmp;
304 }
305 }
306 }
307 encTable = malloc(
308 (encTableSize+1)*sizeof(struct _strenc_ *));
309 memset(encTable, 0, (encTableSize+1)*sizeof(struct _strenc_ *));
310
311 /*
312 * Now set up the pointers at the correct location in the table.
313 */
314 for (i = 0; i < count; i++)
315 {
316 struct _strenc_ *entry = &str_encoding_table[i];
317 unsigned tmp = entry->enc;
318
319 if (tmp < MAX_ENCODING128)
320 {
321 encTable[tmp] = entry;
322 }
323#ifdef HAVE_ICONV1
324 if (entry->iconvlibiconv != 0 && *(entry->iconvlibiconv) != 0)
325 {
326 iconv_tlibiconv_t c;
327 int l;
328 char *lossy;
329
330 /*
331 * See if we can do a lossy conversion.
332 */
333 l = strlen(entry->iconvlibiconv);
334 lossy = malloc(l + 11);
335 strncpy(lossy, entry->iconvlibiconv, l);
336 strncpy(lossy + l, "//TRANSLIT", 11);
337 c = iconv_openlibiconv_open(lossy, UNICODE_ENC((unicode_enc) ? unicode_enc : internal_unicode_enc()));
338 if (c == (iconv_tlibiconv_t)-1)
339 {
340 free(lossy);
341 }
342 else
343 {
344 entry->lossy = lossy;
345 iconv_closelibiconv_close(c);
346 }
347 }
348#endif
349 }
350 encodingTable = encTable;
351 }
352 [local_lock unlock];
353 }
354}
355
356static struct _strenc_ *
357EntryForEncoding(NSStringEncoding enc)
358{
359 struct _strenc_ *entry = 0;
360
361 if (enc > 0)
362 {
363 GSSetupEncodingTable();
364 if (enc <= encTableSize)
365 {
366 entry = encodingTable[enc];
367 }
368 else
369 {
370 unsigned i = 0;
371
372 while (i < sizeof(str_encoding_table) / sizeof(struct _strenc_))
373 {
374 if (str_encoding_table[i].enc == enc)
375 {
376 entry = &str_encoding_table[i];
377 break;
378 }
379 i++;
380 }
381 }
382 }
383 return entry;
384}
385
386static struct _strenc_ *
387EntrySupported(NSStringEncoding enc)
388{
389 struct _strenc_ *entry = EntryForEncoding(enc);
390
391 if (entry == 0)
392 {
393 return NO((BOOL)0);
394 }
395#ifdef HAVE_ICONV1
396 if (entry->iconvlibiconv != 0 && entry->supported == 0)
397 {
398 if (enc == NSUnicodeStringEncoding)
399 {
400 entry->iconvlibiconv = UNICODE_ENC((unicode_enc) ? unicode_enc : internal_unicode_enc());
401 entry->supported = 1;
402 }
403 else if (entry->iconvlibiconv[0] == 0)
404 {
405 /* explicitly check for empty encoding name since some systems
406 * have buggy iconv_open() code which succeeds on an empty name.
407 */
408 entry->supported = -1;
409 }
410 else
411 {
412 iconv_tlibiconv_t c;
413
414 c = iconv_openlibiconv_open(UNICODE_ENC((unicode_enc) ? unicode_enc : internal_unicode_enc()), entry->iconvlibiconv);
415 if (c == (iconv_tlibiconv_t)-1)
416 {
417 entry->supported = -1;
418 }
419 else
420 {
421 iconv_closelibiconv_close(c);
422 c = iconv_openlibiconv_open(entry->iconvlibiconv, UNICODE_ENC((unicode_enc) ? unicode_enc : internal_unicode_enc()));
423 if (c == (iconv_tlibiconv_t)-1)
424 {
425 entry->supported = -1;
426 }
427 else
428 {
429 iconv_closelibiconv_close(c);
430 entry->supported = 1;
431 }
432 }
433 }
434 }
435#endif
436 if (entry->supported == 1)
437 {
438 return entry;
439 }
440 return 0;
441}
442
443BOOL
444GSPrivateIsEncodingSupported(NSStringEncoding enc)
445{
446 if (EntrySupported(enc) == 0)
447 {
448 return NO((BOOL)0);
449 }
450 return YES((BOOL)1);
451}
452
453/** Returns the NSStringEncoding that matches the specified
454 * character set registry and encoding information. For instance,
455 * for the iso8859-5 character set, the registry is iso8859 and
456 * the encoding is 5, and the returned NSStringEncoding is
457 * NSISOCyrillicStringEncoding. If there is no specific encoding,
458 * use @"0". Returns GSUndefinedEncoding if there is no match.
459 */
460NSStringEncoding
461GSEncodingForRegistry (NSString *registry, NSString *encoding)
462{
463 NSString *charset = registry;
464
465 if ([encoding length] > 0 && [encoding isEqualToString: @"0"] == NO((BOOL)0))
466 {
467 charset = [NSString stringWithFormat: @"%@-%@", registry, encoding];
468 }
469 return [GSMimeDocument encodingFromCharset: charset];
470}
471
472/** Try to deduce the string encoding from the locale string
473 * clocale. This function looks in the Locale.encodings file
474 * installed as part of GNUstep Base if the encoding cannot be
475 * deduced from the clocale string itself. If clocale isn't set or
476 * no match can be found, returns GSUndefinedEncoding.
477 */
478/* It would be really nice if this could be used in +defaultCStringEncoding,
479 * but there are too many dependancies on other parts of the library to
480 * make this practical (even if everything possible was written in C,
481 * we'd still need some way to find the Locale.encodings file).
482 */
483NSStringEncoding
484GSEncodingFromLocale(const char *clocale)
485{
486 NSStringEncoding encoding = GSUndefinedEncoding;
487 NSString *encodstr;
488
489 if (clocale == NULL((void *)0) || strcmp(clocale, "C") == 0
490 || strcmp(clocale, "POSIX") == 0)
491 {
492 /* Don't make any assumptions. Let caller handle that */
493 return encoding;
494 }
495
496 if (strchr (clocale, '.') != NULL((void *)0))
497 {
498 /* Locale contains the 'codeset' section. Parse it and see
499 if we know what encoding this cooresponds to */
500 NSString *registry;
501 NSString *charset;
502 NSArray *array;
503 char *s;
504
505 s = strchr (clocale, '.');
506 registry = [[NSString stringWithUTF8String: s+1] lowercaseString];
507 array = [registry componentsSeparatedByString: @"-"];
508 registry = [array objectAtIndex: 0];
509 if ([array count] > 1)
510 {
511 charset = [NSString stringWithFormat: @"%@-%@",
512 registry, [array lastObject]];
513 }
514 else
515 {
516 charset = registry;
517 }
518
519 encoding = [GSMimeDocument encodingFromCharset: charset];
520 }
521 else
522 {
523 /* Look up the locale in our table of encodings */
524 NSBundle *gbundle;
525 NSString *table;
526
527#ifdef GNUSTEP1
528 gbundle = [NSBundle bundleForLibrary: @"gnustep-base"];
529#else
530 gbundle = [NSBundle bundleForClass: NSClassFromString(@"GSXMLNode")];
531#endif
532 table = [gbundle pathForResource: @"Locale"
533 ofType: @"encodings"
534 inDirectory: @"Languages"];
535 if (table != nil((id)((void*)0)))
536 {
537 unsigned count;
538 NSDictionary *dict;
539
540 dict = [NSDictionary dictionaryWithContentsOfFile: table];
541 encodstr = [dict objectForKey:
542 [NSString stringWithUTF8String: clocale]];
543 if (encodstr == nil((id)((void*)0)))
544 return GSUndefinedEncoding;
545
546 /* Find the matching encoding */
547 count = 0;
548 while (str_encoding_table[count].enc
549 && strcmp(str_encoding_table[count].ename, [encodstr lossyCString]))
550 {
551 count++;
552 }
553 if (str_encoding_table[count].enc)
554 {
555 encoding = str_encoding_table[count].enc;
556 }
557 if (encoding == GSUndefinedEncoding)
558 {
559 NSLog(@"No known GNUstep encoding for %s = %@",
560 clocale, encodstr);
561 }
562 }
563 }
564
565 return encoding;
566}
567
568/**
569 * Uses direct access into a two-level table to map cases.<br />
570 * The two-level table method is less space efficient (but still not bad) than
571 * a single table and a linear search, but it reduces the number of
572 * conditional statements to just one.
573 */
574unichar
575uni_tolower(unichar ch)
576{
577 unichar result = gs_tolower_map[ch / 256][ch % 256];
578
579 return result ? result : ch;
580}
581
582/**
583 * Uses direct access into a two-level table to map cases.<br />
584 * The two-level table method is less space efficient (but still not bad) than
585 * a single table and a linear search, but it reduces the number of
586 * conditional statements to just one.
587 */
588unichar
589uni_toupper(unichar ch)
590{
591 unichar result = gs_toupper_map[ch / 256][ch % 256];
592
593 return result ? result : ch;
594}
595
596unsigned char
597GSPrivateUniCop(unichar u)
598{
599 if (u < uni_cop_table[0].code)
600 {
601 return 0; // Special case for latin1
602 }
603 else
604 {
605 unichar code;
606 unichar count = 0;
607 unichar first = 0;
608 unichar last = uni_cop_table_size;
609
610 while (first <= last)
611 {
612 if (first != last)
613 {
614 count = (first + last) / 2;
615 code = uni_cop_table[count].code;
616 if (code < u)
617 {
618 first = count+1;
619 }
620 else if (code > u)
621 {
622 last = count-1;
623 }
624 else
625 {
626 return uni_cop_table[count].cop;
627 }
628 }
629 else /* first == last */
630 {
631 if (u == uni_cop_table[first].code)
632 {
633 return uni_cop_table[first].cop;
634 }
635 return 0;
636 }
637 }
638 return 0;
639 }
640}
641
642unsigned char
643uni_cop(unichar u)
644{
645 return GSPrivateUniCop(u);
646}
647
648BOOL
649uni_isnonsp(unichar u)
650{
651 /*
652 * Treating upper surrogates as non-spacing is a convenient solution
653 * to a number of issues with UTF-16
654 */
655 if ((u >= 0xdc00) && (u <= 0xdfff))
656 return YES((BOOL)1);
657
658// FIXME check is uni_cop good for this
659 if (GSPrivateUniCop(u))
660 return YES((BOOL)1);
661 else
662 return NO((BOOL)0);
663}
664
665unichar*
666uni_is_decomp(unichar u)
667{
668 if (u < uni_dec_table[0].code)
669 {
670 return 0; // Special case for latin1
671 }
672 else
673 {
674 unichar code;
675 unichar count = 0;
676 unichar first = 0;
677 unichar last = uni_dec_table_size;
678
679 while (first <= last)
680 {
681 if (first != last)
682 {
683 count = (first + last) / 2;
684 code = uni_dec_table[count].code;
685 if (code < u)
686 {
687 first = count+1;
688 }
689 else if (code > u)
690 {
691 last = count-1;
692 }
693 else
694 {
695 return uni_dec_table[count].decomp;
696 }
697 }
698 else /* first == last */
699 {
700 if (u == uni_dec_table[first].code)
701 {
702 return uni_dec_table[first].decomp;
703 }
704 return 0;
705 }
706 }
707 return 0;
708 }
709}
710
711/**
712 * Function to check a block of data for validity as a unicode string and
713 * say whether it contains solely ASCII or solely Latin1 data.<br />
714 * Any leading BOM must already have been removed and the data must already
715 * be in native byte order.<br />
716 * Returns the number of characters which were found valid.
717 */
718unsigned
719GSUnicode(const unichar *chars, unsigned length,
720 BOOL *isASCII, BOOL *isLatin1)
721{
722 unsigned i = 0;
723 unichar c;
724
725 if (isASCII) *isASCII = YES((BOOL)1);
726 if (isLatin1) *isLatin1 = YES((BOOL)1);
727 while (i < length)
728 {
729 if ((c = chars[i++]) > 127)
730 {
731 if (isASCII) *isASCII = NO((BOOL)0);
732 i--;
733 while (i < length)
734 {
735 if ((c = chars[i++]) > 255)
736 {
737 if (isLatin1) *isLatin1 = NO((BOOL)0);
738 i--;
739 while (i < length)
740 {
741 c = chars[i++];
742 if (c == 0xfffe || c == 0xffff
743 || (c >= 0xfdd0 && c <= 0xfdef))
744 {
745 return i - 1; // Non-characters.
746 }
747 if (c >= 0xdc00 && c <= 0xdfff)
748 {
749 return i - 1; // Second half of a surrogate pair.
750 }
751 if (c >= 0xd800 && c <= 0xdbff)
752 {
753 // First half of a surrogate pair.
754 if (i >= length)
755 {
756 return i - 1; // Second half missing
757 }
758 c = chars[i];
759 if (c < 0xdc00 || c > 0xdfff)
760 {
761 return i - 1; // Second half missing
762 }
763 i++; // Step past second half
764 }
765 }
766 }
767 }
768 }
769 }
770 return i;
771}
772
773#if GS_WITH_GC0
774
775#define GROW() \
776if (dst == 0) \
777 { \
778 /* \
779 * Data is just being discarded anyway, so we can \
780 * reset the offset into the local buffer on the \
781 * stack and pretend the buffer has grown. \
782 */ \
783 ptr = buf - dpos; \
784 bsize = dpos + BUFSIZ1024; \
785 if (extra != 0) \
786 { \
787 bsize--; \
788 } \
789 } \
790else if (zone == 0) \
791 { \
792 result = NO((BOOL)0); /* No buffer growth possible ... fail. */ \
793 goto done; \
794 } \
795else \
796 { \
797 unsigned grow = slen; \
798\
799 if (grow < bsize + BUFSIZ1024) \
800 { \
801 grow = bsize + BUFSIZ1024; \
802 } \
803 grow *= sizeof(unichar); \
804\
805 if (ptr == buf || ptr == *dst) \
806 { \
807 unichar *tmp; \
808\
809 tmp = NSAllocateCollectable(grow + extra, 0); \
810 if (tmp != 0) \
811 { \
812 memcpy(tmp, ptr, bsize * sizeof(unichar)); \
813 } \
814 ptr = tmp; \
815 } \
816 else \
817 { \
818 ptr = NSReallocateCollectable(ptr, grow + extra, 0); \
819 } \
820 if (ptr == 0) \
821 { \
822 return NO((BOOL)0); /* Not enough memory */ \
823 } \
824 bsize = grow / sizeof(unichar); \
825 }
826
827#else /* GS_WITH_GC */
828
829#define GROW() \
830if (dst == 0) \
831 { \
832 /* \
833 * Data is just being discarded anyway, so we can \
834 * reset the offset into the local buffer on the \
835 * stack and pretend the buffer has grown. \
836 */ \
837 ptr = buf - dpos; \
838 bsize = dpos + BUFSIZ1024; \
839 if (extra != 0) \
840 { \
841 bsize--; \
842 } \
843 } \
844else if (zone == 0) \
845 { \
846 result = NO((BOOL)0); /* No buffer growth possible ... fail. */ \
847 goto done; \
848 } \
849else \
850 { \
851 unsigned grow = slen; \
852\
853 if (grow < bsize + BUFSIZ1024) \
854 { \
855 grow = bsize + BUFSIZ1024; \
856 } \
857 grow *= sizeof(unichar); \
858\
859 if (ptr == buf || ptr == *dst) \
860 { \
861 unichar *tmp; \
862\
863 tmp = NSZoneMalloc(zone, grow + extra); \
864 if (tmp != 0) \
865 { \
866 memcpy(tmp, ptr, bsize * sizeof(unichar)); \
867 } \
868 ptr = tmp; \
869 } \
870 else \
871 { \
872 ptr = NSZoneRealloc(zone, ptr, grow + extra); \
873 } \
874 if (ptr == 0) \
875 { \
876 return NO((BOOL)0); /* Not enough memory */ \
877 } \
878 bsize = grow / sizeof(unichar); \
879 }
880
881#endif /* GS_WITH_GC */
882
883/**
884 * Function to convert from 8-bit data to 16-bit unicode characters.
885 * <p>The dst argument is a pointer to a pointer to a buffer in which the
886 * converted string is to be stored. If it is a null pointer, this function
887 * discards converted data, and is used only to determine the length of the
888 * converted string. If the zone argument is non-nul, the function is free
889 * to allocate a larger buffer if necessary, and store this new buffer in
890 * the dst argument. It will *NOT* deallocate the original buffer!
891 * </p>
892 * <p>The size argument is a pointer to the initial size of the destination
893 * buffer. If the function changes the buffer size, this value will be
894 * altered to the new size. This is measured in 16-bit unicode characters,
895 * not bytes.
896 * </p>
897 * <p>The src argument is a pointer to the byte sequence which is
898 * to be converted to 16-bit unicode.
899 * </p>
900 * <p>The slen argument is the length of the byte sequence
901 * which is to be converted to 16-bit unicode.
902 * This is measured in bytes.
903 * </p>
904 * <p>The enc argument specifies the encoding type of the 8-bit byte sequence
905 * which is to be converted to 16-bit unicode.
906 * </p>
907 * <p>The zone argument specifies a memory zone in which the function may
908 * allocate a buffer to return data in.
909 * If this is nul, the function will fail if the originally supplied buffer
910 * is not big enough (unless dst is a null pointer ... indicating that
911 * converted data is to be discarded).<br />
912 * If the library is built for garbage collecting, the zone argument is used
913 * only as a marker to say whether the function may allocate memory (zone
914 * is non-null) or not (zone is null).
915 * </p>
916 * The options argument controls some special behavior.
917 * <list>
918 * <item>If GSUniTerminate is set, the function is expected to null terminate
919 * the output string, and will assume that it is safe to place the nul
920 * just beyond the end of the stated buffer size.
921 * Also, if the function grows the buffer, it will allow for an extra
922 * termination character.</item>
923 * <item>If GSUniTemporary is set, the function will return the results in
924 * an autoreleased buffer rather than in a buffer that the caller must
925 * release.</item>
926 * <item>If GSUniBOM is set, the function will write the first unicode
927 * character as a byte order marker.</item>
928 * <item>If GSUniShortOk is set, the function will return a buffer containing
929 * any decoded characters even if the whole conversion fails.</item>
930 * </list>
931 * <p>On return, the function result is a flag indicating success (YES)
932 * or failure (NO), and on success, the value stored in size is the number
933 * of characters in the converted string. The converted string itself is
934 * stored in the location given by dst.<br />
935 * NB. If the value stored in dst has been changed, it is a pointer to
936 * allocated memory which the caller is responsible for freeing, and the
937 * caller is <em>still</em> responsible for freeing the original buffer.
938 * </p>
939 */
940BOOL
941GSToUnicode(unichar **dst, unsigned int *size, const unsigned char *src,
942 unsigned int slen, NSStringEncoding enc, NSZone *zone,
943 unsigned int options)
944{
945 unichar buf[BUFSIZ1024];
946 unichar *ptr;
947 unsigned bsize;
948 unsigned dpos = 0; // Offset into destination buffer.
949 unsigned spos = 0; // Offset into source buffer.
950 unsigned extra = (options & GSUniTerminate0x01) ? sizeof(unichar) : 0;
951 unichar base = 0;
952 unichar *table = 0;
953 BOOL result = YES((BOOL)1);
954
955 /*
956 * Ensure we have an initial buffer set up to decode data into.
957 */
958 if (dst == 0 || *size == 0)
959 {
960 ptr = buf;
961 bsize = (extra != 0) ? BUFSIZ1024 - 1 : BUFSIZ1024;
962 }
963 else
964 {
965 ptr = *dst;
966 bsize = *size;
967 }
968
969 if (options & GSUniBOM0x08)
970 {
971 while (dpos >= bsize)
972 {
973 GROW();
974 }
975 ptr[dpos++] = (unichar)0xFEFF; // Insert byte order marker.
976 }
977
978 switch (enc)
979 {
980 case NSUTF8StringEncoding:
981 {
982 while (spos < slen)
983 {
984 unsigned char c = src[spos];
985 unsigned long u = c;
986
987 if (c > 0x7f)
988 {
989 int i, sle = 0;
990
991 /* calculated the expected sequence length */
992 while (c & 0x80)
993 {
994 c = c << 1;
995 sle++;
996 }
997
998 /* legal ? */
999 if ((sle < 2) || (sle > 6))
1000 {
1001 result = NO((BOOL)0);
1002 goto done;
1003 }
1004
1005 /* do we have enough bytes ? */
1006 if ((spos + sle) > slen)
1007 {
1008 result = NO((BOOL)0);
1009 goto done;
1010 }
1011
1012 /* get the codepoint */
1013 for (i = 1; i < sle; i++)
1014 {
1015 if (src[spos + i] < 0x80 || src[spos + i] >= 0xc0)
1016 break;
1017 u = (u << 6) | (src[spos + i] & 0x3f);
1018 }
1019 if (i < sle)
1020 {
1021 result = NO((BOOL)0);
1022 goto done;
1023 }
1024 u = u & ~(0xffffffff << ((5 * sle) + 1));
1025 spos += sle;
1026
1027 /*
1028 * We discard invalid codepoints here.
1029 */
1030 if (u > 0x10ffff || u == 0xfffe || u == 0xffff
1031 || (u >= 0xfdd0 && u <= 0xfdef))
1032 {
1033 result = NO((BOOL)0); // Invalid character.
1034 goto done;
1035 }
1036
1037 if ((u >= 0xd800) && (u <= 0xdfff))
1038 {
1039 result = NO((BOOL)0); // Unmatched half of surrogate pair.
1040 goto done;
1041 }
1042 }
1043 else
1044 {
1045 spos++;
1046 }
1047
1048 /*
1049 * Add codepoint as either a single unichar for BMP
1050 * or as a pair of surrogates for codepoints over 16 bits.
1051 */
1052
1053 if (dpos >= bsize)
1054 {
1055 GROW();
1056 }
1057 if (u < 0x10000)
1058 {
1059 ptr[dpos++] = u;
1060 }
1061 else
1062 {
1063 unichar ul, uh;
1064
1065 u -= 0x10000;
1066 ul = u & 0x3ff;
1067 uh = (u >> 10) & 0x3ff;
1068
1069 ptr[dpos++] = uh + 0xd800;
1070 if (dpos >= bsize)
1071 {
1072 GROW();
1073 }
1074 ptr[dpos++] = ul + 0xdc00;
1075 }
1076 }
1077 }
1078 break;
1079
1080 case NSNonLossyASCIIStringEncoding:
1081 case NSASCIIStringEncoding:
1082 if (dst == 0)
1083 {
1084 /* Just counting bytes, and we know there is exactly one
1085 * unicode codepoint needed for each ascii character.
1086 */
1087 dpos += slen;
1088 }
1089 else
1090 {
1091 /* Because we know that each ascii chartacter is exactly
1092 * one unicode character, we can check the destination
1093 * buffer size and allocate more space in one go, before
1094 * entering the loop where we deal with each character.
1095 */
1096 if (dpos + slen + (extra ? 1 : 0) > bsize)
1097 {
1098 if (zone == 0)
1099 {
1100 result = NO((BOOL)0); /* No buffer growth possible ... fail. */
1101 goto done;
1102 }
1103 else
1104 {
1105 unsigned grow = (dpos + slen) * sizeof(unichar);
1106 unichar *tmp;
1107
1108#if GS_WITH_GC0
1109 tmp = NSAllocateCollectable(grow + extra, 0);
1110#else
1111 tmp = NSZoneMalloc(zone, grow + extra);
1112#endif
1113 if ((ptr == buf || ptr == *dst) && (tmp != 0))
1114 {
1115 memcpy(tmp, ptr, bsize * sizeof(unichar));
1116 }
1117#if !GS_WITH_GC0
1118 if (ptr != buf && ptr != *dst)
1119 {
1120 NSZoneFree(zone, ptr);
1121 }
1122#endif
1123 ptr = tmp;
1124 if (ptr == 0)
1125 {
1126 return NO((BOOL)0); /* Not enough memory */
1127 }
1128 bsize = grow / sizeof(unichar);
1129 }
1130 }
1131 while (spos < slen)
1132 {
1133 unichar c = (unichar)((unc)src[spos++]);
1134
1135 if (c > 127)
1136 {
1137 result = NO((BOOL)0); // Non-ascii data found in input.
1138 goto done;
1139 }
1140 ptr[dpos++] = c;
1141 }
1142 }
1143 break;
1144
1145 case NSISOLatin1StringEncoding:
1146 if (dst == 0)
1147 {
1148 /* Just counting bytes, and we know there is exactly one
1149 * unicode codepoint needed for each latin1 character.
1150 */
1151 dpos += slen;
1152 }
1153 else
1154 {
1155 /* Because we know that each latin1 chartacter is exactly
1156 * one unicode character, we can check the destination
1157 * buffer size and allocate more space in one go, before
1158 * entering the loop where we deal with each character.
1159 */
1160 if (dpos + slen + (extra ? 1 : 0) > bsize)
1161 {
1162 if (zone == 0)
1163 {
1164 result = NO((BOOL)0); /* No buffer growth possible ... fail. */
1165 goto done;
1166 }
1167 else
1168 {
1169 unsigned grow = (dpos + slen) * sizeof(unichar);
1170 unichar *tmp;
1171
1172#if GS_WITH_GC0
1173 tmp = NSAllocateCollectable(grow + extra, 0);
1174#else
1175 tmp = NSZoneMalloc(zone, grow + extra);
1176#endif
1177 if ((ptr == buf || ptr == *dst) && (tmp != 0))
1178 {
1179 memcpy(tmp, ptr, bsize * sizeof(unichar));
1180 }
1181#if !GS_WITH_GC0
1182 if (ptr != buf && ptr != *dst)
1183 {
1184 NSZoneFree(zone, ptr);
1185 }
1186#endif
1187 ptr = tmp;
1188 if (ptr == 0)
1189 {
1190 return NO((BOOL)0); /* Not enough memory */
1191 }
1192 bsize = grow / sizeof(unichar);
1193 }
1194 }
1195 while (spos < slen)
1196 {
1197 ptr[dpos++] = (unichar)((unc)src[spos++]);
1198 }
1199 }
1200 break;
1201
1202 case NSNEXTSTEPStringEncoding:
1203 base = Next_conv_base;
1204 table = Next_char_to_uni_table;
1205 goto tables;
1206
1207 case NSISOCyrillicStringEncoding:
1208 base = Cyrillic_conv_base;
1209 table = Cyrillic_char_to_uni_table;
1210 goto tables;
1211
1212 case NSISOLatin2StringEncoding:
1213 base = Latin2_conv_base;
1214 table = Latin2_char_to_uni_table;
1215 goto tables;
1216
1217 case NSISOLatin9StringEncoding:
1218 base = Latin9_conv_base;
1219 table = Latin9_char_to_uni_table;
1220 goto tables;
1221
1222 case NSISOThaiStringEncoding:
1223 base = Thai_conv_base;
1224 table = Thai_char_to_uni_table;
1225 goto tables;
1226
1227#if 0
1228 case NSSymbolStringEncoding:
1229 base = Symbol_conv_base;
1230 table = Symbol_char_to_uni_table;
1231 goto tables;
1232#endif
1233
1234tables:
1235 if (dst == 0)
1236 {
1237 /* Just counting bytes, and we know there is exactly one
1238 * unicode codepoint needed for each character.
1239 */
1240 dpos += slen;
1241 }
1242 else
1243 {
1244 /* Because we know that each character in the table is exactly
1245 * one unicode character, we can check the destination
1246 * buffer size and allocate more space in one go, before
1247 * entering the loop where we deal with each character.
1248 */
1249 if (dpos + slen + (extra ? 1 : 0) > bsize)
1250 {
1251 if (zone == 0)
1252 {
1253 result = NO((BOOL)0); /* No buffer growth possible ... fail. */
1254 goto done;
1255 }
1256 else
1257 {
1258 unsigned grow = (dpos + slen) * sizeof(unichar);
1259 unichar *tmp;
1260
1261#if GS_WITH_GC0
1262 tmp = NSAllocateCollectable(grow + extra, 0);
1263#else
1264 tmp = NSZoneMalloc(zone, grow + extra);
1265#endif
1266 if ((ptr == buf || ptr == *dst) && (tmp != 0))
1267 {
1268 memcpy(tmp, ptr, bsize * sizeof(unichar));
1269 }
1270#if !GS_WITH_GC0
1271 if (ptr != buf && ptr != *dst)
1272 {
1273 NSZoneFree(zone, ptr);
1274 }
1275#endif
1276 ptr = tmp;
1277 if (ptr == 0)
1278 {
1279 return NO((BOOL)0); /* Not enough memory */
1280 }
1281 bsize = grow / sizeof(unichar);
1282 }
1283 }
1284 while (spos < slen)
1285 {
1286 unc c = (unc)src[spos];
1287
1288 if (c < base)
1289 {
1290 ptr[dpos++] = c;
1291 }
1292 else
1293 {
1294 ptr[dpos++] = table[c - base];
1295 }
1296 spos++;
1297 }
1298 }
1299 break;
1300
1301 case NSGSM0338StringEncoding:
1302 while (spos < slen)
1303 {
1304 unc c = (unc)src[spos];
1305
1306 if (dpos >= bsize)
1307 {
1308 GROW();
1309 }
1310 ptr[dpos] = GSM0338_char_to_uni_table[c];
1311 if (c == 0x1b && spos < slen)
1312 {
1313 unsigned i = 0;
1314
1315 c = (unc)src[spos+1];
1316 while (i < sizeof(GSM0338_escapes)/sizeof(GSM0338_escapes[0]))
1317 {
1318 if (GSM0338_escapes[i].to == c)
1319 {
1320 ptr[dpos] = GSM0338_escapes[i].from;
1321 spos++;
1322 break;
1323 }
1324 i++;
1325 }
1326 }
1327 dpos++;
1328 spos++;
1329 }
1330 break;
1331
1332 default:
1333#ifdef HAVE_ICONV1
1334 {
1335 struct _strenc_ *encInfo;
1336 unsigned char *inbuf;
1337 unsigned char *outbuf;
1338 size_t inbytesleft;
1339 size_t outbytesleft;
1340 size_t rval;
1341 iconv_tlibiconv_t cd;
1342 const char *estr = 0;
1343 BOOL done = NO((BOOL)0);
1344
1345 if ((encInfo = EntrySupported(enc)) != 0)
1346 {
1347 estr = encInfo->iconvlibiconv;
1348 }
1349 /* explicitly check for empty encoding name since some systems
1350 * have buggy iconv_open() code which succeeds on an empty name.
1351 */
1352 if (estr == 0)
1353 {
1354 NSLog(@"GSToUnicode() No iconv for encoding x%02x", enc);
1355 result = NO((BOOL)0);
1356 goto done;
1357 }
1358 if (slen == 0)
1359 {
1360 break; // Nothing to do
1361 }
1362 cd = iconv_openlibiconv_open(UNICODE_ENC((unicode_enc) ? unicode_enc : internal_unicode_enc()), estr);
1363 if (cd == (iconv_tlibiconv_t)-1)
1364 {
1365 NSLog(@"GSToUnicode() No iconv for encoding %@ tried to use %s",
1366 GSPrivateEncodingName(enc), estr);
1367 result = NO((BOOL)0);
1368 goto done;
1369 }
1370
1371 inbuf = (unsigned char*)src;
1372 inbytesleft = slen;
1373 outbuf = (unsigned char*)ptr;
1374 outbytesleft = bsize * sizeof(unichar);
1375 do
1376 {
1377 if (inbytesleft == 0)
1378 {
1379 done = YES((BOOL)1); // Flush iconv
1380 rval = iconvlibiconv(cd, 0, 0, (void*)&outbuf, &outbytesleft);
1381 }
1382 else
1383 {
1384 rval = iconvlibiconv(cd,
1385 (void*)&inbuf, &inbytesleft, (void*)&outbuf, &outbytesleft);
1386 }
1387 dpos = (bsize * sizeof(unichar) - outbytesleft) / sizeof(unichar);
1388 if (rval == (size_t)-1)
1389 {
1390 if (errno(* __error()) == E2BIG7)
1391 {
1392 unsigned old = bsize;
1393
1394 GROW();
1395 outbuf = (unsigned char*)&ptr[dpos];
1396 outbytesleft += (bsize - old) * sizeof(unichar);
1397 }
1398 else
1399 {
1400 result = NO((BOOL)0);
1401 goto done;
1402 }
1403 }
1404 } while (!done || rval != 0);
1405 // close the converter
1406 iconv_closelibiconv_close(cd);
1407 }
1408#else
1409 result = NO((BOOL)0);
1410#endif
1411 }
1412
1413done:
1414
1415 /*
1416 * Post conversion ... terminate if needed, and set output values.
1417 */
1418 if (extra != 0 && dst != 0)
1419 {
1420 ptr[dpos] = (unichar)0;
1421 }
1422 *size = dpos;
1423 if (dst != 0 && (result == YES((BOOL)1) || (options & GSUniShortOk0x10)))
1424 {
1425 if (options & GSUniTemporary0x02)
1426 {
1427 unsigned bytes = dpos * sizeof(unichar) + extra;
1428 void *r;
1429
1430 /*
1431 * Temporary string was requested ... make one.
1432 */
1433#if GS_WITH_GC0
1434 r = NSAllocateCollectable(bytes, 0);
1435 memcpy(r, ptr, bytes);
1436#else
1437 r = GSAutoreleasedBuffer(bytes);
1438 memcpy(r, ptr, bytes);
1439 if (ptr != buf && ptr != *dst)
1440 {
1441 NSZoneFree(zone, ptr);
1442 }
1443#endif
1444 ptr = r;
1445 *dst = ptr;
1446 }
1447 else if (zone != 0 && (ptr == buf || bsize > dpos))
1448 {
1449 unsigned bytes = dpos * sizeof(unichar) + extra;
1450
1451 /*
1452 * Resizing is permitted, try ensure we return a buffer which
1453 * is just big enough to hold the converted string.
1454 */
1455 if (ptr == buf || ptr == *dst)
1456 {
1457 unichar *tmp;
1458
1459#if GS_WITH_GC0
1460 tmp = NSAllocateCollectable(bytes, 0);
1461#else
1462 tmp = NSZoneMalloc(zone, bytes);
1463#endif
1464 if (tmp != 0)
1465 {
1466 memcpy(tmp, ptr, bytes);
1467 }
1468 ptr = tmp;
1469 }
1470 else
1471 {
1472#if GS_WITH_GC0
1473 ptr = NSReallocateCollectable(ptr, bytes, 0);
1474#else
1475 ptr = NSZoneRealloc(zone, ptr, bytes);
1476#endif
1477 }
1478 *dst = ptr;
1479 }
1480 else if (ptr == buf)
1481 {
1482 ptr = NULL((void *)0);
1483 result = NO((BOOL)0);
1484 }
1485 else
1486 {
1487 *dst = ptr;
1488 }
1489 }
1490#if !GS_WITH_GC0
1491 else if (ptr != buf && dst != 0 && ptr != *dst)
1492 {
1493 NSZoneFree(zone, ptr);
1494 }
1495#endif
1496
1497 if (dst)
1498 NSCAssert(*dst != buf, @"attempted to pass out pointer to internal buffer")do { if (!((*dst != buf))) { [[NSAssertionHandler currentHandler
] handleFailureInFunction: [NSString stringWithUTF8String: __PRETTY_FUNCTION__
] file: [NSString stringWithUTF8String: "Unicode.m"] lineNumber
: 1498 description: ((@"attempted to pass out pointer to internal buffer"
))]; } } while(0)
;
1499
1500 return result;
1501}
1502
1503#undef GROW
1504
1505
1506#if GS_WITH_GC0
1507
1508#define GROW() \
1509if (dst == 0) \
1510 { \
1511 /* \
1512 * Data is just being discarded anyway, so we can \
1513 * reset the offset into the local buffer on the \
1514 * stack and pretend the buffer has grown. \
1515 */ \
1516 ptr = buf - dpos; \
1517 bsize = dpos + BUFSIZ1024; \
1518 if (extra != 0) \
1519 { \
1520 bsize--; \
1521 } \
1522 } \
1523else if (zone == 0) \
1524 { \
1525 result = NO((BOOL)0); /* No buffer growth possible ... fail. */ \
1526 goto done; \
1527 } \
1528else \
1529 { \
1530 unsigned grow = slen; \
1531\
1532 if (grow < bsize + BUFSIZ1024) \
1533 { \
1534 grow = bsize + BUFSIZ1024; \
1535 } \
1536\
1537 if (ptr == buf || ptr == *dst) \
1538 { \
1539 unsigned char *tmp; \
1540\
1541 tmp = NSAllocateCollectable(grow + extra, 0); \
1542 if (tmp != 0) \
1543 { \
1544 memcpy(tmp, ptr, bsize); \
1545 } \
1546 ptr = tmp; \
1547 } \
1548 else \
1549 { \
1550 ptr = NSReallocateCollectable(ptr, grow + extra, 0); \
1551 } \
1552 if (ptr == 0) \
1553 { \
1554 return NO((BOOL)0); /* Not enough memory */ \
1555 } \
1556 bsize = grow; \
1557 }
1558
1559#else /* GS_WITH_GC */
1560
1561#define GROW() \
1562if (dst == 0) \
1563 { \
1564 /* \
1565 * Data is just being discarded anyway, so we can \
1566 * reset the offset into the local buffer on the \
1567 * stack and pretend the buffer has grown. \
1568 */ \
1569 ptr = buf - dpos; \
1570 bsize = dpos + BUFSIZ1024; \
1571 if (extra != 0) \
1572 { \
1573 bsize--; \
1574 } \
1575 } \
1576else if (zone == 0) \
1577 { \
1578 result = NO((BOOL)0); /* No buffer growth possible ... fail. */ \
1579 goto done; \
1580 } \
1581else \
1582 { \
1583 unsigned grow = slen; \
1584\
1585 if (grow < bsize + BUFSIZ1024) \
1586 { \
1587 grow = bsize + BUFSIZ1024; \
1588 } \
1589\
1590 if (ptr == buf || ptr == *dst) \
1591 { \
1592 unsigned char *tmp; \
1593\
1594 tmp = NSZoneMalloc(zone, grow + extra); \
1595 if (tmp != 0) \
1596 { \
1597 memcpy(tmp, ptr, bsize); \
1598 } \
1599 ptr = tmp; \
1600 } \
1601 else \
1602 { \
1603 ptr = NSZoneRealloc(zone, ptr, grow + extra); \
1604 } \
1605 if (ptr == 0) \
1606 { \
1607 return NO((BOOL)0); /* Not enough memory */ \
1608 } \
1609 bsize = grow; \
1610 }
1611
1612#endif /* GS_WITH_GC */
1613
1614static inline int chop(unichar c, _ucc_ *table, int hi)
1615{
1616 int lo = 0;
1617
1618 while (hi > lo)
1619 {
1620 int i = (hi + lo) / 2;
1621 unichar from = table[i].from;
1622
1623 if (from < c)
1624 {
1625 lo = i + 1;
1626 }
1627 else if (from > c)
1628 {
1629 hi = i;
1630 }
1631 else
1632 {
1633 return i; // Found
1634 }
1635 }
1636 return -1; // Not found
1637}
1638
1639/**
1640 * Function to convert from 16-bit unicode to 8-bit data.
1641 * <p>The dst argument is a pointer to a pointer to a buffer in which the
1642 * converted data is to be stored. If it is a null pointer, this function
1643 * discards converted data, and is used only to determine the length of the
1644 * converted data. If the zone argument is non-nul, the function is free
1645 * to allocate a larger buffer if necessary, and store this new buffer in
1646 * the dst argument. It will *NOT* deallocate the original buffer!
1647 * </p>
1648 * <p>The size argument is a pointer to the initial size of the destination
1649 * buffer. If the function changes the buffer size, this value will be
1650 * altered to the new size. This is measured in bytes.
1651 * </p>
1652 * <p>The src argument is a pointer to the 16-bit unicode string which is
1653 * to be converted to 8-bit data.
1654 * </p>
1655 * <p>The slen argument is the length of the 16-bit unicode string
1656 * which is to be converted to 8-bit data.
1657 * This is measured in 16-bit characters, not bytes.
1658 * </p>
1659 * <p>The enc argument specifies the encoding type of the 8-bit byte sequence
1660 * which is to be produced from the 16-bit unicode.
1661 * </p>
1662 * <p>The zone argument specifies a memory zone in which the function may
1663 * allocate a buffer to return data in.
1664 * If this is nul, the function will fail if the originally supplied buffer
1665 * is not big enough (unless dst is a null pointer ... indicating that
1666 * converted data is to be discarded).<br />
1667 * If the library is built for garbage collecting, the zone argument is used
1668 * only as a marker to say whether the function may allocate memory (zone
1669 * is non-null) or not (zone is null).
1670 * </p>
1671 * The options argument controls some special behavior.
1672 * <list>
1673 * <item>If GSUniStrict is set, the function will fail if a character is
1674 * encountered in the source which can't be converted. Otherwise, some
1675 * approximation or marker will be placed in the destination.</item>
1676 * <item>If GSUniTerminate is set, the function is expected to nul terminate
1677 * the output data, and will assume that it is safe to place the nul
1678 * just beyond the end of the stated buffer size.
1679 * Also, if the function grows the buffer, it will allow for an extra
1680 * termination byte.</item>
1681 * <item>If GSUniTemporary is set, the function will return the results in
1682 * an autoreleased buffer rather than in a buffer that the caller must
1683 * release.</item>
1684 * <item>If GSUniBOM is set, the function will read the first unicode
1685 * character as a byte order marker.</item>
1686 * <item>If GSUniShortOk is set, the function will return a buffer containing
1687 * any decoded characters even if the whole conversion fails.</item>
1688 * </list>
1689 * <p>On return, the function result is a flag indicating success (YES)
1690 * or failure (NO), and on success, the value stored in size is the number
1691 * of bytes in the converted data. The converted data itself is
1692 * stored in the location given by dst.<br />
1693 * NB. If the value stored in dst has been changed, it is a pointer to
1694 * allocated memory which the caller is responsible for freeing, and the
1695 * caller is <em>still</em> responsible for freeing the original buffer.
1696 * </p>
1697 */
1698BOOL
1699GSFromUnicode(unsigned char **dst, unsigned int *size, const unichar *src,
1700 unsigned int slen, NSStringEncoding enc, NSZone *zone,
1701 unsigned int options)
1702{
1703 unsigned char buf[BUFSIZ1024];
1704 unsigned char *ptr;
1705 unsigned bsize;
1706 unsigned dpos = 0; // Offset into destination buffer.
1707 unsigned spos = 0; // Offset into source buffer.
1708 unsigned extra = (options & GSUniTerminate0x01) ? 1 : 0;
1709 BOOL strict = (options & GSUniStrict0x04) ? YES((BOOL)1) : NO((BOOL)0);
1710 unichar base = 0;
1711 _ucc_ *table = 0;
1712 unsigned tsize = 0;
1713 unsigned char escape = 0;
1714 _ucc_ *etable = 0;
1715 unsigned etsize = 0;
1716 _ucc_ *ltable = 0;
1717 unsigned ltsize = 0;
1718 BOOL swapped = NO((BOOL)0);
1719 BOOL result = YES((BOOL)1);
1720
1721 if (options & GSUniBOM0x08)
1722 {
1723 if (slen == 0)
1724 {
1725 *size = 0;
1726 result = NO((BOOL)0); // Missing byte order marker.
1727 }
1728 else
1729 {
1730 unichar c;
1731
1732 c = *src++;
1733 slen--;
1734 if (c != 0xFEFF)
1735 {
1736 if (c == 0xFFFE)
1737 {
1738 swapped = YES((BOOL)1);
1739 }
1740 else
1741 {
1742 *size = 0;
1743 result = NO((BOOL)0); // Illegal byte order marker.
1744 }
1745 }
1746 }
1747 }
1748
1749 /*
1750 * Ensure we have an initial buffer set up to decode data into.
1751 */
1752 if (dst == 0 || *size == 0)
1753 {
1754 ptr = buf;
1755 bsize = (extra != 0) ? BUFSIZ1024 - 1 : BUFSIZ1024;
1756 }
1757 else
1758 {
1759 ptr = *dst;
1760 bsize = *size;
1761 }
1762
1763 if (result == NO((BOOL)0))
1764 {
1765 goto done;
1766 }
1767
1768#ifdef HAVE_ICONV1
1769 if (strict == NO((BOOL)0)
1770 && enc != NSUTF8StringEncoding
1771 && enc != NSGSM0338StringEncoding)
1772 {
1773 goto iconv_start; // For lossy conversion
1774 }
1775#endif
1776
1777 switch (enc)
1778 {
1779 case NSUTF8StringEncoding:
1780 {
1781 if (swapped == YES((BOOL)1))
1782 {
1783 while (spos < slen)
1784 {
1785 unichar u1, u2;
1786 unsigned char reversed[8];
1787 unsigned long u;
1788 int sl;
1789 int i;
1790
1791 /* get first unichar */
1792 u1 = src[spos++];
1793 u1 = (((u1 & 0xff00) >> 8) + ((u1 & 0x00ff) << 8));
1794
1795 /* Fast track ... if this is actually an ascii character
1796 * it just converts straight to utf-8
1797 */
1798 if (u1 <= 0x7f)
1799 {
1800 if (dpos >= bsize)
1801 {
1802 GROW();
1803 }
1804 ptr[dpos++] = (unsigned char)u1;
1805 continue;
1806 }
1807
1808 // 0xfeff is a zero-width-no-break-space inside text
1809 if (u1 == 0xfffe // unexpected BOM
1810 || u1 == 0xffff // not a character
1811 || (u1 >= 0xfdd0 && u1 <= 0xfdef) // invalid character
1812 || (u1 >= 0xdc00 && u1 <= 0xdfff)) // bad pairing
1813 {
1814 if (strict)
1815 {
1816 result = NO((BOOL)0);
1817 goto done;
1818 }
1819 continue; // Skip invalid character.
1820 }
1821
1822 /* possibly get second character and calculate 'u' */
1823 if ((u1 >= 0xd800) && (u1 < 0xdc00))
1824 {
1825 if (spos >= slen)
1826 {
1827 if (strict)
1828 {
1829 result = NO((BOOL)0);
1830 goto done;
1831 }
1832 continue; // At end.
1833 }
1834
1835 /* get second unichar */
1836 u2 = src[spos++];
1837 u2 = (((u2 & 0xff00) >> 8) + ((u2 & 0x00ff) << 8));
1838
1839 if ((u2 < 0xdc00) && (u2 > 0xdfff))
1840 {
1841 spos--;
1842 if (strict)
1843 {
1844 result = NO((BOOL)0);
1845 goto done;
1846 }
1847 continue; // Skip bad half of surrogate pair.
1848 }
1849
1850 /* make the full value */
1851 u = ((unsigned long)(u1 - 0xd800) * 0x400)
1852 + (u2 - 0xdc00) + 0x10000;
1853 }
1854 else
1855 {
1856 u = u1;
1857 }
1858
1859 /* calculate the sequence length
1860 * a length of 1 was dealt with earlier
1861 */
1862 if (u <= 0x7ff)
1863 {
1864 sl = 2;
1865 }
1866 else if (u <= 0xffff)
1867 {
1868 sl = 3;
1869 }
1870 else if (u <= 0x1fffff)
1871 {
1872 sl = 4;
1873 }
1874 else if (u <= 0x3ffffff)
1875 {
1876 sl = 5;
1877 }
1878 else
1879 {
1880 sl = 6;
1881 }
1882
1883 /* make sure we have enough space for it */
1884 while (dpos + sl >= bsize)
1885 {
1886 GROW();
1887 }
1888
1889 /* split value into reversed array */
1890 for (i = 0; i < sl; i++)
1891 {
1892 reversed[i] = (u & 0x3f);
1893 u = u >> 6;
1894 }
1895
1896 ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
1897 /* add bytes into the output sequence */
1898 for (i = sl - 2; i >= 0; i--)
1899 {
1900 ptr[dpos++] = reversed[i] | 0x80;
1901 }
1902 }
1903 }
1904 else
1905 {
1906 while (spos < slen)
1907 {
1908 unichar u1, u2;
1909 unsigned char reversed[8];
1910 unsigned long u;
1911 int sl;
1912 int i;
1913
1914 /* get first unichar */
1915 u1 = src[spos++];
1916
1917 /* Fast track ... if this is actually an ascii character
1918 * it just converts straight to utf-8
1919 */
1920 if (u1 <= 0x7f)
1921 {
1922 if (dpos >= bsize)
1923 {
1924 GROW();
1925 }
1926 ptr[dpos++] = (unsigned char)u1;
1927 continue;
1928 }
1929
1930 // 0xfeff is a zero-width-no-break-space inside text
1931 if (u1 == 0xfffe // unexpected BOM
1932 || u1 == 0xffff // not a character
1933 || (u1 >= 0xfdd0 && u1 <= 0xfdef) // invalid character
1934 || (u1 >= 0xdc00 && u1 <= 0xdfff)) // bad pairing
1935 {
1936 if (strict)
1937 {
1938 result = NO((BOOL)0);
1939 goto done;
1940 }
1941 continue; // Skip invalid character.
1942 }
1943
1944 /* possibly get second character and calculate 'u' */
1945 if ((u1 >= 0xd800) && (u1 < 0xdc00))
1946 {
1947 if (spos >= slen)
1948 {
1949 if (strict)
1950 {
1951 result = NO((BOOL)0);
1952 goto done;
1953 }
1954 continue; // At end.
1955 }
1956
1957 /* get second unichar */
1958 u2 = src[spos++];
1959
1960 if ((u2 < 0xdc00) && (u2 > 0xdfff))
1961 {
1962 spos--;
1963 if (strict)
1964 {
1965 result = NO((BOOL)0);
1966 goto done;
1967 }
1968 continue; // Skip bad half of surrogate pair.
1969 }
1970
1971 /* make the full value */
1972 u = ((unsigned long)(u1 - 0xd800) * 0x400)
1973 + (u2 - 0xdc00) + 0x10000;
1974 }
1975 else
1976 {
1977 u = u1;
1978 }
1979
1980 /* calculate the sequence length
1981 * a length of 1 was dealt with earlier
1982 */
1983 if (u <= 0x7ff)
1984 {
1985 sl = 2;
1986 }
1987 else if (u <= 0xffff)
1988 {
1989 sl = 3;
1990 }
1991 else if (u <= 0x1fffff)
1992 {
1993 sl = 4;
1994 }
1995 else if (u <= 0x3ffffff)
1996 {
1997 sl = 5;
1998 }
1999 else
2000 {
2001 sl = 6;
2002 }
2003
2004 /* make sure we have enough space for it */
2005 while (dpos + sl >= bsize)
2006 {
2007 GROW();
2008 }
2009
2010 /* split value into reversed array */
2011 for (i = 0; i < sl; i++)
2012 {
2013 reversed[i] = (u & 0x3f);
2014 u = u >> 6;
2015 }
2016
2017 ptr[dpos++] = reversed[sl-1] | ((0xff << (8-sl)) & 0xff);
2018 /* add bytes into the output sequence */
2019 for (i = sl - 2; i >= 0; i--)
2020 {
2021 ptr[dpos++] = reversed[i] | 0x80;
2022 }
2023 }
2024 }
2025 }
2026 break;
2027
2028 case NSNonLossyASCIIStringEncoding:
2029 case NSASCIIStringEncoding:
2030 base = 128;
2031 goto bases;
2032
2033 case NSISOLatin1StringEncoding:
2034 case NSUnicodeStringEncoding:
2035 base = 256;
2036 goto bases;
2037
2038bases:
2039 if (dst == 0)
2040 {
2041 /* Just counting bytes, and we know there is exactly one
2042 * unicode codepoint needed for each character.
2043 */
2044 dpos = slen;
2045 }
2046 else
2047 {
2048 /* Because we know that each ascii chartacter is exactly
2049 * one unicode character, we can check the destination
2050 * buffer size and allocate more space in one go, before
2051 * entering the loop where we deal with each character.
2052 */
2053 if (slen > bsize)
2054 {
2055 if (zone == 0)
2056 {
2057 result = NO((BOOL)0); /* No buffer growth possible ... fail. */
2058 goto done;
2059 }
2060 else
2061 {
2062 uint8_t *tmp;
2063
2064#if GS_WITH_GC0
2065 tmp = NSAllocateCollectable(slen, 0);
2066#else
2067 tmp = NSZoneMalloc(zone, slen);
2068 if (ptr != buf && ptr != *dst)
2069 {
2070 NSZoneFree(zone, ptr);
2071 }
2072#endif
2073 ptr = tmp;
2074 if (ptr == 0)
2075 {
2076 return NO((BOOL)0); /* Not enough memory */
2077 }
2078 bsize = slen;
2079 }
2080 }
2081 }
2082 if (strict == NO((BOOL)0))
2083 {
2084 if (swapped == YES((BOOL)1))
2085 {
2086 while (spos < slen)
2087 {
2088 unichar u = src[spos++];
2089
2090 u = (((u & 0xff00) >> 8) + ((u & 0x00ff) << 8));
2091 if (u < base)
2092 {
2093 ptr[dpos++] = (unsigned char)u;
2094 }
2095 else
2096 {
2097 ptr[dpos++] = '?';
2098 }
2099 }
2100 }
2101 else
2102 {
2103 while (spos < slen)
2104 {
2105 unichar u = src[spos++];
2106
2107 if (u < base)
2108 {
2109 ptr[dpos++] = (unsigned char)u;
2110 }
2111 else
2112 {
2113 ptr[dpos++] = '?';
2114 }
2115 }
2116 }
2117 }
2118 else
2119 {
2120 if (swapped == YES((BOOL)1))
2121 {
2122 while (spos < slen)
2123 {
2124 unichar u = src[spos++];
2125
2126 u = (((u & 0xff00) >> 8) + ((u & 0x00ff) << 8));
2127 if (u < base)
2128 {
2129 ptr[dpos++] = (unsigned char)u;
2130 }
2131 else
2132 {
2133 result = NO((BOOL)0);
2134 goto done;
2135 }
2136 }
2137 }
2138 else
2139 {
2140 while (spos < slen)
2141 {
2142 unichar u = src[spos++];
2143
2144 if (u < base)
2145 {
2146 ptr[dpos++] = (unsigned char)u;
2147 }
2148 else
2149 {
2150 result = NO((BOOL)0);
2151 goto done;
2152 }
2153 }
2154 }
2155 }
2156 break;
2157
2158 case NSNEXTSTEPStringEncoding:
2159 base = Next_conv_base;
2160 table = Next_uni_to_char_table;
2161 tsize = Next_uni_to_char_table_size;
2162 goto tables;
2163
2164 case NSISOCyrillicStringEncoding:
2165 base = Cyrillic_conv_base;
2166 table = Cyrillic_uni_to_char_table;
2167 tsize = Cyrillic_uni_to_char_table_size;
2168 goto tables;
2169
2170 case NSISOLatin2StringEncoding:
2171 base = Latin2_conv_base;
2172 table = Latin2_uni_to_char_table;
2173 tsize = Latin2_uni_to_char_table_size;
2174 goto tables;
2175
2176 case NSISOLatin9StringEncoding:
2177 base = Latin9_conv_base;
2178 table = Latin9_uni_to_char_table;
2179 tsize = Latin9_uni_to_char_table_size;
2180 goto tables;
2181
2182 case NSISOThaiStringEncoding:
2183 base = Thai_conv_base;
2184 table = Thai_uni_to_char_table;
2185 tsize = Thai_uni_to_char_table_size;
2186 goto tables;
2187
2188#if 0
2189 case NSSymbolStringEncoding:
2190 base = Symbol_conv_base;
2191 table = Symbol_uni_to_char_table;
2192 tsize = Symbol_uni_to_char_table_size;
2193 goto tables;
2194#endif
2195
2196 case NSGSM0338StringEncoding:
2197 base = 0;
2198 table = GSM0338_uni_to_char_table;
2199 tsize = GSM0338_tsize(sizeof(GSM0338_uni_to_char_table)/sizeof(_ucc_));
2200 escape = 0x1b;
2201 etable = GSM0338_escapes;
2202 etsize = GSM0338_esize(sizeof(GSM0338_escapes)/sizeof(_ucc_));
2203 if (strict == NO((BOOL)0))
2204 {
2205 ltable = GSM0338_lossy;
2206 ltsize = GSM0338_lsize(sizeof(GSM0338_lossy)/sizeof(_ucc_));
2207 }
2208 goto tables;
2209
2210tables:
2211 while (spos < slen)
2212 {
2213 unichar u = src[spos++];
2214 int i;
2215
2216 /* Swap byte order if necessary */
2217 if (swapped == YES((BOOL)1))
2218 {
2219 u = (((u & 0xff00) >> 8) + ((u & 0x00ff) << 8));
2220 }
2221
2222 /* Grow output buffer to make room if necessary */
2223 if (dpos >= bsize)
2224 {
2225 GROW();
2226 }
2227
2228 if (u < base)
2229 {
2230 /*
2231 * The character set has a lower section whose contents
2232 * are identical to unicode, so no mapping is needed.
2233 */
2234 ptr[dpos++] = (unsigned char)u;
2235 }
2236 else if (table != 0 && (i = chop(u, table, tsize)) >= 0)
2237 {
2238 /*
2239 * The character mapping is found in a basic table.
2240 */
2241 ptr[dpos++] = table[i].to;
2242 }
2243 else if (etable != 0 && (i = chop(u, etable, etsize)) >= 0)
2244 {
2245 /*
2246 * The character mapping is found in a table of simple
2247 * escape sequences consisting of an escape byte followed
2248 * by another single byte.
2249 */
2250 ptr[dpos++] = escape;
2251 if (dpos >= bsize)
2252 {
2253 GROW();
2254 }
2255 ptr[dpos++] = etable[i].to;
2256 }
2257 else if (ltable != 0 && (i = chop(u, ltable, ltsize)) >= 0)
2258 {
2259 /*
2260 * The character is found in a lossy mapping table.
2261 */
2262 ptr[dpos++] = ltable[i].to;
2263 }
2264 else if (strict == NO((BOOL)0))
2265 {
2266 /*
2267 * The default lossy mapping generates a question mark.
2268 */
2269 ptr[dpos++] = '?';
2270 }
2271 else
2272 {
2273 /*
2274 * No mapping has been found.
2275 */
2276 result = NO((BOOL)0);
2277 goto done;
2278 }
2279 }
2280 break;
2281
2282 default:
2283#ifdef HAVE_ICONV1
2284iconv_start:
2285 {
2286 struct _strenc_ *encInfo;
2287 iconv_tlibiconv_t cd;
2288 unsigned char *inbuf;
2289 unsigned char *outbuf;
2290 size_t inbytesleft;
2291 size_t outbytesleft;
2292 size_t rval;
2293 const char *estr = 0;
2294 BOOL done = NO((BOOL)0);
2295
2296 if ((encInfo = EntrySupported(enc)) != 0)
2297 {
2298 if (strict == NO((BOOL)0))
2299 {
2300 /*
2301 * Try to transliterate where no direct conversion
2302 * is available.
2303 */
2304 estr = encInfo->lossy;
2305 }
2306 if (estr == 0)
2307 {
2308 estr = encInfo->iconvlibiconv;
2309 }
2310 }
2311
2312 /* explicitly check for empty encoding name since some systems
2313 * have buggy iconv_open() code which succeeds on an empty name.
2314 */
2315 if (estr == 0)
2316 {
2317 NSLog(@"GSFromUnicode() No iconv for encoding x%02x", enc);
2318 result = NO((BOOL)0);
2319 goto done;
2320 }
2321 if (slen == 0)
2322 {
2323 break; // Nothing to convert.
2324 }
2325 cd = iconv_openlibiconv_open(estr, UNICODE_ENC((unicode_enc) ? unicode_enc : internal_unicode_enc()));
2326 if (cd == (iconv_tlibiconv_t)-1)
2327 {
2328 NSLog(@"GSFromUnicode() No iconv for encoding %@ tried to use %s",
2329 GSPrivateEncodingName(enc), estr);
2330 result = NO((BOOL)0);
2331 goto done;
2332 }
2333
2334 inbuf = (unsigned char*)src;
2335 inbytesleft = slen * sizeof(unichar);
2336 outbuf = (unsigned char*)ptr;
2337 outbytesleft = bsize;
2338 do
2339 {
2340 if (inbytesleft == 0)
2341 {
2342 done = YES((BOOL)1); // Flush buffer
2343 rval = iconvlibiconv(cd, 0, 0, (void*)&outbuf, &outbytesleft);
2344 }
2345 else
2346 {
2347 rval = iconvlibiconv(cd,
2348 (void*)&inbuf, &inbytesleft, (void*)&outbuf, &outbytesleft);
2349 }
2350 dpos = bsize - outbytesleft;
2351 if (rval != 0)
2352 {
2353 if (rval == (size_t)-1)
2354 {
2355 if (errno(* __error()) == E2BIG7)
2356 {
2357 unsigned old = bsize;
2358
2359 GROW();
2360 outbuf = (unsigned char*)&ptr[dpos];
2361 outbytesleft += (bsize - old);
2362 }
2363 else if (errno(* __error()) == EILSEQ86)
2364 {
2365 if (strict == YES((BOOL)1))
2366 {
2367 result = NO((BOOL)0);
2368 goto done;
2369 }
2370 /*
2371 * If we are allowing lossy conversion, we replace any
2372 * unconvertable character with a question mark.
2373 */
2374 if (outbytesleft > 0)
2375 {
2376 *outbuf++ = '?';
2377 outbytesleft--;
2378 inbuf += sizeof(unichar);
2379 inbytesleft -= sizeof(unichar);
2380 }
2381 }
2382 else
2383 {
2384 result = NO((BOOL)0);
2385 goto done;
2386 }
2387 }
2388 else if (strict == YES((BOOL)1))
2389 {
2390 /*
2391 * A positive return from iconv indicates some
2392 * irreversible (ie lossy) conversions took place,
2393 * so if we are doing strict conversions we must fail.
2394 */
2395 result = NO((BOOL)0);
2396 goto done;
2397 }
2398 }
2399 } while (!done || rval != 0);
2400 // close the converter
2401 iconv_closelibiconv_close(cd);
2402 }
2403#else
2404 result = NO((BOOL)0);
2405 goto done;
2406#endif
2407 }
2408
2409 done:
2410
2411 /*
2412 * Post conversion ... set output values.
2413 */
2414 if (extra != 0)
2415 {
2416 ptr[dpos] = (unsigned char)0;
2417 }
2418 *size = dpos;
2419 if (dst != 0 && (result == YES((BOOL)1) || (options & GSUniShortOk0x10)))
2420 {
2421 if (options & GSUniTemporary0x02)
2422 {
2423 unsigned bytes = dpos + extra;
2424 void *r;
2425
2426 /*
2427 * Temporary string was requested ... make one.
2428 */
2429#if GS_WITH_GC0
2430 r = NSAllocateCollectable(bytes, 0);
2431 memcpy(r, ptr, bytes);
2432#else
2433 r = GSAutoreleasedBuffer(bytes);
2434 memcpy(r, ptr, bytes);
2435 if (ptr != buf && ptr != *dst)
2436 {
2437 NSZoneFree(zone, ptr);
2438 }
2439#endif
2440 ptr = r;
2441 *dst = ptr;
2442 }
2443 else if (zone != 0 && (ptr == buf || bsize > dpos))
2444 {
2445 unsigned bytes = dpos + extra;
2446
2447 /*
2448 * Resizing is permitted - try ensure we return a buffer
2449 * which is just big enough to hold the converted string.
2450 */
2451 if (ptr == buf || ptr == *dst)
2452 {
2453 unsigned char *tmp;
2454
2455#if GS_WITH_GC0
2456 tmp = NSAllocateCollectable(bytes, 0);
2457#else
2458 tmp = NSZoneMalloc(zone, bytes);
2459#endif
2460 if (tmp != 0)
2461 {
2462 memcpy(tmp, ptr, bytes);
2463 }
2464 ptr = tmp;
2465 }
2466 else
2467 {
2468#if GS_WITH_GC0
2469 ptr = NSReallocateCollectable(ptr, bytes, 0);
2470#else
2471 ptr = NSZoneRealloc(zone, ptr, bytes);
2472#endif
2473 }
2474 *dst = ptr;
2475 }
2476 else if (ptr == buf)
2477 {
2478 ptr = NULL((void *)0);
2479 result = NO((BOOL)0);
2480 }
2481 else
2482 {
2483 *dst = ptr;
2484 }
2485 }
2486#if !GS_WITH_GC0
2487 else if (ptr != buf && dst != 0 && ptr != *dst)
2488 {
2489 NSZoneFree(zone, ptr);
2490 }
2491#endif
2492
2493 if (dst)
2494 NSCAssert(*dst != buf, @"attempted to pass out pointer to internal buffer")do { if (!((*dst != buf))) { [[NSAssertionHandler currentHandler
] handleFailureInFunction: [NSString stringWithUTF8String: __PRETTY_FUNCTION__
] file: [NSString stringWithUTF8String: "Unicode.m"] lineNumber
: 2494 description: ((@"attempted to pass out pointer to internal buffer"
))]; } } while(0)
;
2495
2496 return result;
2497}
2498
2499#undef GROW
2500
2501
2502
2503NSStringEncoding*
2504GSPrivateAvailableEncodings()
2505{
2506 if (_availableEncodings == 0)
2507 {
2508 GSSetupEncodingTable();
2509 [GS_INITIALIZED_LOCK(local_lock, GSLazyLock)(local_lock != ((id)((void*)0)) ? (id)local_lock : (id)[GSLazyLock
newLockAt: &local_lock])
lock];
2510 if (_availableEncodings == 0)
2511 {
2512 NSStringEncoding *encodings;
2513 unsigned pos;
2514 unsigned i;
2515
2516 /*
2517 * Now build up a list of supported encodings ... in the
2518 * format needed to support [NSString+availableStringEncodings]
2519 * Check to see what iconv support we have as we go along.
2520 * This is also the place where we determine the name we use
2521 * for iconv to support unicode.
2522 */
2523 encodings = malloc(sizeof(NSStringEncoding) * (encTableSize+1));
2524 pos = 0;
2525 for (i = 0; i < encTableSize+1; i++)
2526 {
2527 if (GSPrivateIsEncodingSupported(i) == YES((BOOL)1))
2528 {
2529 encodings[pos++] = i;
2530 }
2531 }
2532 encodings[pos] = 0;
2533 _availableEncodings = encodings;
2534 }
2535 [local_lock unlock];
2536 }
2537 return _availableEncodings;
2538}
2539
2540NSStringEncoding
2541GSPrivateDefaultCStringEncoding()
2542{
2543 if (defEnc == GSUndefinedEncoding)
1
Assuming 'defEnc' is equal to GSUndefinedEncoding
2
Taking true branch
2544 {
2545 char *encoding;
2546#if HAVE_LANGINFO_CODESET1
2547 char encbuf[BUFSIZ1024];
2548#endif
2549 unsigned int count;
2550
2551 GSSetupEncodingTable();
2552
2553 [GS_INITIALIZED_LOCK(local_lock, GSLazyLock)(local_lock != ((id)((void*)0)) ? (id)local_lock : (id)[GSLazyLock
newLockAt: &local_lock])
lock];
3
Within the expansion of the macro 'GS_INITIALIZED_LOCK':
a
Assuming 'local_lock' is not equal to nil
b
Method returns an Objective-C object with a +1 retain count
2554 if (defEnc != GSUndefinedEncoding)
4
Object leaked: allocated object is not referenced later in this execution path and has a retain count of +1
2555 {
2556 [local_lock unlock];
2557 return defEnc;
2558 }
2559
2560 if (natEnc == GSUndefinedEncoding)
2561 {
2562
2563 /* Encoding not set */
2564#if HAVE_LANGINFO_CODESET1
2565 /* Take it from the system locale information. */
2566 [gnustep_global_lock lock];
2567 strncpy(encbuf, nl_langinfo(CODESET0), sizeof(encbuf)-1);
2568 encbuf[sizeof(encbuf)-1] = '\0';
2569 [gnustep_global_lock unlock];
2570 encoding = encbuf;
2571
2572 /*
2573 * First handle the fallback response from nl_langinfo() ...
2574 * if we are getting the default value we can't assume that
2575 * the user has set anything up at all, so we must use the
2576 * OpenStep/GNUstep default encopding ... latin1, even though
2577 * the nl_langinfo() stuff would say default is ascii.
2578 */
2579 if (strcmp(encoding, "ANSI_X3.4-1968") == 0 /* glibc */
2580 || strcmp(encoding, "ISO_646.IRV:1983") == 0 /* glibc */
2581 || strcmp(encoding, "646") == 0 /* Solaris NetBSD */)
2582 natEnc = NSISOLatin1StringEncoding;
2583 else if (strcmp(encoding, "EUC-JP") == 0 /* glibc */
2584 /* HP-UX IRIX OSF/1 Solaris NetBSD */
2585 || strcmp(encoding, "eucJP") == 0
2586 || strcmp(encoding, "IBM-eucJP") == 0 /* AIX */)
2587 natEnc = NSJapaneseEUCStringEncoding;
2588 else if (strcmp(encoding, "UTF-8") == 0 /* glibc AIX OSF/1 Solaris */
2589 || strcmp(encoding, "utf8") == 0 /* HP-UX */)
2590 natEnc = NSUTF8StringEncoding;
2591 else if (strcmp(encoding, "ISO-8859-1") == 0 /* glibc */
2592 /* AIX IRIX OSF/1 Solaris NetBSD */
2593 || strcmp(encoding, "ISO8859-1") == 0
2594 || strcmp(encoding, "iso88591") == 0 /* HP-UX */)
2595 natEnc = NSISOLatin1StringEncoding;
2596 else if (strcmp(encoding, "IBM-932") == 0 /* AIX */
2597 || strcmp(encoding, "SJIS") == 0 /* HP-UX OSF/1 NetBSD */
2598 || strcmp(encoding, "PCK") == 0 /* Solaris */)
2599 natEnc = NSShiftJISStringEncoding;
2600 else if (strcmp(encoding, "ISO-8859-2") == 0 /* glibc */
2601 /* AIX IRIX OSF/1 Solaris NetBSD */
2602 || strcmp(encoding, "ISO8859-2") == 0
2603 || strcmp(encoding, "iso88592") == 0 /* HP-UX */)
2604 natEnc = NSISOLatin2StringEncoding;
2605 else if (strcmp(encoding, "CP1251") == 0 /* glibc */
2606 || strcmp(encoding, "ansi-1251") == 0 /* Solaris */)
2607 natEnc = NSWindowsCP1251StringEncoding;
2608 else if (strcmp(encoding, "CP1252") == 0 /* */
2609 || strcmp(encoding, "IBM-1252") == 0 /* AIX */)
2610 natEnc = NSWindowsCP1252StringEncoding;
2611 else if (strcmp(encoding, "ISO-8859-5") == 0 /* glibc */
2612 /* AIX IRIX OSF/1 Solaris NetBSD */
2613 || strcmp(encoding, "ISO8859-5") == 0
2614 || strcmp(encoding, "iso88595") == 0 /* HP-UX */)
2615 natEnc = NSISOCyrillicStringEncoding;
2616 else if (strcmp(encoding, "KOI8-R") == 0 /* glibc */
2617 || strcmp(encoding, "koi8-r") == 0 /* Solaris */)
2618 natEnc = NSKOI8RStringEncoding;
2619 else if (strcmp(encoding, "ISO-8859-3") == 0 /* glibc */
2620 || strcmp(encoding, "ISO8859-3") == 0 /* Solaris */)
2621 natEnc = NSISOLatin3StringEncoding;
2622 else if (strcmp(encoding, "ISO-8859-4") == 0 /* */
2623 || strcmp(encoding, "ISO8859-4") == 0 /* OSF/1 Solaris NetBSD */)
2624 natEnc = NSISOLatin4StringEncoding;
2625 else if (strcmp(encoding, "ISO-8859-6") == 0 /* glibc */
2626 || strcmp(encoding, "ISO8859-6") == 0 /* AIX Solaris */
2627 || strcmp(encoding, "iso88596") == 0 /* HP-UX */)
2628 natEnc = NSISOArabicStringEncoding;
2629 else if (strcmp(encoding, "ISO-8859-7") == 0 /* glibc */
2630 || strcmp(encoding, "ISO8859-7") == 0 /* AIX IRIX OSF/1 Solaris */
2631 || strcmp(encoding, "iso88597") == 0 /* HP-UX */)
2632 natEnc = NSISOGreekStringEncoding;
2633 else if (strcmp(encoding, "ISO-8859-8") == 0 /* glibc */
2634 || strcmp(encoding, "ISO8859-8") == 0 /* AIX OSF/1 Solaris */
2635 || strcmp(encoding, "iso88598") == 0 /* HP-UX */)
2636 natEnc = NSISOHebrewStringEncoding;
2637 else if (strcmp(encoding, "ISO-8859-9") == 0 /* glibc */
2638 || strcmp(encoding, "ISO8859-9") == 0 /* AIX IRIX OSF/1 Solaris */
2639 || strcmp(encoding, "iso88599") == 0 /* HP-UX */)
2640 natEnc = NSISOLatin5StringEncoding;
2641 else if (strcmp(encoding, "ISO-8859-10") == 0 /* */
2642 || strcmp(encoding, "ISO8859-10") == 0 /* */)
2643 natEnc = NSISOLatin6StringEncoding;
2644 else if (strcmp(encoding, "TIS-620") == 0 /* glibc AIX */
2645 || strcmp(encoding, "tis620") == 0 /* HP-UX */
2646 || strcmp(encoding, "TIS620.2533") == 0 /* Solaris */
2647 || strcmp(encoding, "TACTIS") == 0 /* OSF/1 */)
2648 natEnc = NSISOThaiStringEncoding;
2649 else if (strcmp(encoding, "ISO-8859-13") == 0 /* glibc */
2650 || strcmp(encoding, "ISO8859-13") == 0 /* */
2651 || strcmp(encoding, "IBM-921") == 0 /* AIX */)
2652 natEnc = NSISOLatin7StringEncoding;
2653 else if (strcmp(encoding, "ISO-8859-14") == 0 /* glibc */
2654 || strcmp(encoding, "ISO8859-14") == 0 /* */)
2655 natEnc = NSISOLatin8StringEncoding;
2656 else if (strcmp(encoding, "ISO-8859-15") == 0 /* glibc */
2657 /* AIX OSF/1 Solaris NetBSD */
2658 || strcmp(encoding, "ISO8859-15") == 0
2659 || strcmp(encoding, "iso885915") == 0 /* HP-UX */)
2660 natEnc = NSISOLatin9StringEncoding;
2661 else if (strcmp(encoding, "GB2312") == 0 /* glibc */
2662 || strcmp(encoding, "gb2312") == 0 /* Solaris */
2663 || strcmp(encoding, "eucCN") == 0 /* IRIX NetBSD */
2664 || strcmp(encoding, "IBM-eucCN") == 0 /* AIX */
2665 || strcmp(encoding, "hp15CN") == 0 /* HP-UX */)
2666 natEnc = NSGB2312StringEncoding;
2667 else if (strcmp(encoding, "BIG5") == 0 /* glibc Solaris NetBSD */
2668 || strcmp(encoding, "big5") == 0 /* AIX HP-UX OSF/1 */)
2669 natEnc = NSBIG5StringEncoding;
2670 else if (strcmp(encoding, "EUC-KR") == 0 /* glibc */
2671 || strcmp(encoding, "eucKR") == 0 /* HP-UX IRIX OSF/1 NetBSD */
2672 || strcmp(encoding, "IBM-eucKR") == 0 /* AIX */
2673 || strcmp(encoding, "5601") == 0 /* Solaris */)
2674 natEnc = NSKoreanEUCStringEncoding;
2675#endif
2676 }
2677
2678 encoding = getenv("GNUSTEP_STRING_ENCODING");
2679 if (encoding != 0)
2680 {
2681 count = 0;
2682 while (str_encoding_table[count].enc
2683 && strcasecmp(str_encoding_table[count].ename, encoding)
2684 && strcasecmp(str_encoding_table[count].iconvlibiconv, encoding))
2685 {
2686 count++;
2687 }
2688 if (str_encoding_table[count].enc)
2689 {
2690 defEnc = str_encoding_table[count].enc;
2691 }
2692 else
2693 {
2694 fprintf(stderr__stderrp,
2695 "WARNING: %s - encoding not supported.\n", encoding);
2696 fprintf(stderr__stderrp,
2697 " NSISOLatin1StringEncoding set as default.\n");
2698 defEnc = NSISOLatin1StringEncoding;
2699 }
2700 }
2701 if (defEnc == GSUndefinedEncoding)
2702 {
2703 defEnc = natEnc;
2704 }
2705 if (defEnc == GSUndefinedEncoding)
2706 {
2707 defEnc = NSISOLatin1StringEncoding;
2708 }
2709 else if (GSPrivateIsEncodingSupported(defEnc) == NO((BOOL)0))
2710 {
2711 fprintf(stderr__stderrp, "WARNING: %s - encoding not implemented as "
2712 "default c string encoding.\n", encoding);
2713 fprintf(stderr__stderrp,
2714 " NSISOLatin1StringEncoding set as default.\n");
2715 defEnc = NSISOLatin1StringEncoding;
2716 }
2717
2718 if (natEnc == GSUndefinedEncoding)
2719 {
2720 natEnc = defEnc;
2721 }
2722
2723 [local_lock unlock];
2724 }
2725 return defEnc;
2726}
2727
2728NSString*
2729GSPrivateEncodingName(NSStringEncoding encoding)
2730{
2731 struct _strenc_ *encInfo;
2732
2733 if ((encInfo = EntrySupported(encoding)) == NO((BOOL)0))
2734 {
2735 return @"Unknown encoding";
2736 }
2737 return [NSString stringWithUTF8String: encInfo->ename];
2738}
2739
2740BOOL
2741GSPrivateIsByteEncoding(NSStringEncoding encoding)
2742{
2743 struct _strenc_ *encInfo;
2744
2745 if ((encInfo = EntrySupported(encoding)) == NO((BOOL)0))
2746 {
2747 return NO((BOOL)0);
2748 }
2749 return encInfo->eightBit;
2750}
2751
2752NSStringEncoding
2753GSPrivateNativeCStringEncoding()
2754{
2755 if (natEnc == GSUndefinedEncoding)
2756 {
2757 /* GSPrivateDefaultCStringEncoding() will actually set the encoding.
2758 */
2759 GSPrivateDefaultCStringEncoding();
2760 }
2761 return natEnc;
2762}
2763