POSIX.1 National Language Support API for MinGW
Revisión | b7a9785f64e65c62671dcba922e77872d0324bae (tree) |
---|---|
Tiempo | 2007-06-19 07:36:08 |
Autor | Keith Marshall <keithmarshall@user...> |
Committer | Keith Marshall |
Handle unicode input streams.
@@ -1,3 +1,37 @@ | ||
1 | +2007-06-18 Keith Marshall <keithmarshall@users.sourceforge.net> | |
2 | + | |
3 | + Handle unicode input streams. | |
4 | + | |
5 | + * mcutfsig.c, include/mcutfsig.h: New files; they define the | |
6 | + interface and implementation for... | |
7 | + (mc_utf_signature): ...this new function. | |
8 | + | |
9 | + * mcsource.c (mc_discard): New static function. | |
10 | + (mc_source): Use it to clean up heap on abnormal termination. | |
11 | + Include `mcutfsig.h'; use `mc_utf_signature' to identify unicode | |
12 | + streams; map codeset using new local variables `input_encoding' | |
13 | + and `input_code_size'; use them with... | |
14 | + (UTF_TYPE): ...this new macro, to parse input accordingly. | |
15 | + | |
16 | + * include/gcmsgs.h (MSG_HAD_CODESET): New wording. | |
17 | + (MSG_UTF_CODESET, MSG_UTF_UNKNOWN): New message definitions. | |
18 | + (MSG_UTF_SIZE_ERROR, MSG_UTF_FRAME_ERROR): Likewise. | |
19 | + | |
20 | + * Makefile.in (canonical_prefix): New macro. | |
21 | + (HEADER_TRANSFORM_SCRIPT): Use it. | |
22 | + (GENCAT_HEADERS): Add `mcutfsig.h'. | |
23 | + (GENCAT_SOURCES): Add `mcutfsig.c'. | |
24 | + (target): Macro renamed as... | |
25 | + (tool_prefix): ...this. | |
26 | + (install-progs): Updated to use it. | |
27 | + (clean): Remove all locally generated headers. | |
28 | + | |
29 | + * aclocal.m4 (MINGW_AC_HOST_CANONICAL_PREFIX): New macro. | |
30 | + (MSYS_AC_CANONICAL_PATH): New macro; copied from `man' sources. | |
31 | + * configure.ac: Use them. | |
32 | + (GENCAT_AC_OBJECTS_ADD): Add `mcutfsig'. | |
33 | + * configure: Regenerated. | |
34 | + | |
1 | 35 | 2007-06-01 Keith Marshall <keithmarshall@users.sourceforge.net> |
2 | 36 | |
3 | 37 | * repl/include/langinfo.h (LC_CTYPE): Include locale.h, to define. |
@@ -32,6 +32,8 @@ VPATH = ${srcdir}:${srcdir}/include:${srcdir}/repl | ||
32 | 32 | prefix = @prefix@ |
33 | 33 | exec_prefix = @exec_prefix@ |
34 | 34 | |
35 | +canonical_prefix = @canonical_prefix@ | |
36 | + | |
35 | 37 | bindir = @bindir@ |
36 | 38 | libdir = @libdir@ |
37 | 39 | includedir = @includedir@ |
@@ -85,8 +87,8 @@ CATGETS_HEADERS = mctab.h msgcat.h | ||
85 | 87 | CATGETS_SOURCES = catopen.c catgets.c catclose.c mctab.c mcref.c mcfree.c |
86 | 88 | CATGETS_OBJECTS = $(subst .c,.$(OBJEXT),$(CATGETS_SOURCES)) |
87 | 89 | |
88 | -GENCAT_HEADERS = gencat.h gcmsgs.h | |
89 | -GENCAT_SOURCES = gencat.c mcload.c mcsource.c mciconv.c mcmerge.c | |
90 | +GENCAT_HEADERS = gencat.h gcmsgs.h mcutfsig.h | |
91 | +GENCAT_SOURCES = gencat.c mcload.c mcsource.c mciconv.c mcmerge.c mcutfsig.c | |
90 | 92 | GENCAT_OBJECTS = @GENCAT_OBJECTS@ |
91 | 93 | |
92 | 94 | OTHER_SOURCES = COPYING ChangeLog mkinstalldirs install-sh $(MISC_SOURCES) |
@@ -110,7 +112,7 @@ INSTALL_PROGS = gendefs.awk | ||
110 | 112 | $(LN_S) $< $@ |
111 | 113 | |
112 | 114 | HEADER_TRANSFORM_SCRIPT = \ |
113 | - -e 's,@HOST_PREFIX@,${prefix},g' \ | |
115 | + -e 's,@HOST_PREFIX@,${canonical_prefix},g' \ | |
114 | 116 | -e 's,@PATH_SEPARATOR_INTERNAL@,$(PATH_SEPARATOR_INTERNAL),g' \ |
115 | 117 | -e 's,$(subst .,\.,$@)\.in$$,$@ -- $(AUTOGENERATED),' |
116 | 118 |
@@ -146,7 +148,7 @@ $(GENCAT_OBJECTS): $(GENCAT_HEADERS) $(CATGETS_OBJECTS) | ||
146 | 148 | ## ============== ## |
147 | 149 | |
148 | 150 | mkinstalldirs = ${srcdir}/mkinstalldirs |
149 | -target = @target_alias@`test -n "@target_alias@" && echo "-"` | |
151 | +tool_prefix = @target_alias@`test -n "@target_alias@" && echo "-"` | |
150 | 152 | |
151 | 153 | INSTALL = @INSTALL@ |
152 | 154 | INSTALL_PROGRAM = @INSTALL_PROGRAM@ |
@@ -159,10 +161,10 @@ install: install-headers install-progs install-libs | ||
159 | 161 | install-progs: all |
160 | 162 | $(mkinstalldirs) --require ${exec_prefix} ${bindir} ${libdir} |
161 | 163 | for prog in $(BUILD_PROGS); do \ |
162 | - $(INSTALL_PROGRAM) $$prog ${bindir}/$(target)$$prog; \ | |
164 | + $(INSTALL_PROGRAM) $$prog ${bindir}/$(tool_prefix)$$prog; \ | |
163 | 165 | done |
164 | 166 | for prog in $(INSTALL_PROGS); do \ |
165 | - $(INSTALL_PROGRAM) ${srcdir}/$$prog ${bindir}/$(target)$$prog; \ | |
167 | + $(INSTALL_PROGRAM) ${srcdir}/$$prog ${bindir}/$(tool_prefix)$$prog; \ | |
166 | 168 | done |
167 | 169 | |
168 | 170 | install-libs: all-libs $(all-dll) |
@@ -230,9 +232,10 @@ bindist: all | ||
230 | 232 | ## ========== ## |
231 | 233 | |
232 | 234 | clean: |
235 | + rm -f nlspath.h $(LOCAL_HEADERS) $(REPLACEMENT_HEADERS) | |
233 | 236 | rm -f *.$(OBJEXT) *~ gencat$(EXEEXT) |
234 | 237 | |
235 | 238 | distclean: clean |
236 | 239 | rm -rf Makefile config.h config.[ls]* autom4te.cache |
237 | 240 | |
238 | -# $RCSfile$Revision$: end of file | |
241 | +# $RCSfile$Revision: 1.1.1.1 $: end of file |
@@ -15,6 +15,42 @@ AC_DEFUN([MINGW_AC_WIN32_NATIVE_HOST], | ||
15 | 15 | #endif]], [mingw_cv_win32_host=no], [mingw_cv_win32_host=yes]))dnl |
16 | 16 | ])# MINGW_AC_WIN32_NATIVE_HOST |
17 | 17 | |
18 | +# MINGW_AC_HOST_CANONICAL_PREFIX | |
19 | +# ------------------------------ | |
20 | +# Set the AC_SUBST variable `canonical_prefix' to the canonical form | |
21 | +# of `prefix', as applicable for a mingw32 host. | |
22 | +# | |
23 | +AC_DEFUN([MINGW_AC_HOST_CANONICAL_PREFIX], | |
24 | +[AC_SUBST([canonical_prefix])dnl | |
25 | + ac_val=$prefix; test "x$ac_val" = xNONE && ac_val=$ac_default_prefix | |
26 | + MSYS_AC_CANONICAL_PATH([canonical_prefix],[$ac_val])dnl | |
27 | +])# MINGW_AC_HOST_CANONICAL_PREFIX | |
28 | + | |
29 | +# MSYS_AC_CANONICAL_PATH( VAR, PATHNAME ) | |
30 | +# --------------------------------------- | |
31 | +# Set VAR to the canonically resolved absolute equivalent of PATHNAME, | |
32 | +# (which may be a relative path, and need not refer to any existing entity). | |
33 | +# | |
34 | +# On Win32-MSYS build hosts, the returned path is resolved to its true | |
35 | +# native Win32 path name, (but with slashes, not backslashes). | |
36 | +# | |
37 | +# On any other system, it is simply the result which would be obtained | |
38 | +# if PATHNAME represented an existing directory, and the pwd command was | |
39 | +# executed in that directory. | |
40 | +# | |
41 | +AC_DEFUN([MSYS_AC_CANONICAL_PATH], | |
42 | +[ac_dir="$2" | |
43 | + pwd -W >/dev/null 2>&1 && ac_pwd_w="pwd -W" || ac_pwd_w=pwd | |
44 | + until ac_val=`exec 2>/dev/null; cd "$ac_dir" && $ac_pwd_w` | |
45 | + do | |
46 | + ac_dir=`AS_DIRNAME(["$ac_dir"])` | |
47 | + done | |
48 | + ac_dir=`echo "$ac_dir" | sed 's?^[[./]]*??'` | |
49 | + ac_val=`echo "$ac_val" | sed 's?/*$[]??'` | |
50 | + $1=`echo "$2" | sed "s?^[[./]]*$ac_dir/*?$ac_val/?"' | |
51 | + s?/*$[]??'`dnl | |
52 | +])# MSYS_AC_CANONICAL_PATH | |
53 | + | |
18 | 54 | # MINGW_AC_CHECK_HEADER( LISTVAR, HEADER ) |
19 | 55 | # ---------------------------------------- |
20 | 56 | # Invoke AC_CHECK_HEADER, to check availability of HEADER; |
@@ -152,3 +188,5 @@ AC_DEFUN([CATGETS_AC_CONFIG_VERSION_DEFINE], | ||
152 | 188 | [AC_DEFINE_UNQUOTED([$1],[`IFS=.;set x $PACKAGE_VERSION;echo ${$3}`], |
153 | 189 | [Define numerically to the catgets $2 version number])dnl |
154 | 190 | ])# CATGETS_AC_CONFIG_VERSION_DEFINE |
191 | + | |
192 | +# $RCSfile$Revision$: end of file |
@@ -310,7 +310,7 @@ ac_includes_default="\ | ||
310 | 310 | # include <unistd.h> |
311 | 311 | #endif" |
312 | 312 | |
313 | -ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT LN_S AR ac_ct_AR INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA MAKE_DLL DLLVERSION HOST_PATH_SEPARATOR REPLACEMENT_HEADERS CPP EGREP LOCAL_HEADERS GENCAT_OBJECTS INCICONV LIBICONV LIBOBJS LTLIBOBJS' | |
313 | +ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT LN_S AR ac_ct_AR INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA MAKE_DLL DLLVERSION canonical_prefix HOST_PATH_SEPARATOR REPLACEMENT_HEADERS CPP EGREP LOCAL_HEADERS GENCAT_OBJECTS INCICONV LIBICONV LIBOBJS LTLIBOBJS' | |
314 | 314 | ac_subst_files='' |
315 | 315 | |
316 | 316 | # Initialize some variables set by options. |
@@ -2508,9 +2508,31 @@ echo $ECHO_N "checking release version for mingw32 DLLs... $ECHO_C" >&6 | ||
2508 | 2508 | echo "$as_me:$LINENO: result: ${DLLVERSION}" >&5 |
2509 | 2509 | echo "${ECHO_T}${DLLVERSION}" >&6 |
2510 | 2510 | |
2511 | -# We need to identify the appropriate PATH separator character, | |
2512 | -# to be used in the default NLSPATH definition. | |
2511 | +# We need to identify the canonical path prefix, and the appropriate | |
2512 | +# PATH separator character, to be used in the default NLSPATH definition. | |
2513 | 2513 | # |
2514 | + ac_val=$prefix; test "x$ac_val" = xNONE && ac_val=$ac_default_prefix | |
2515 | + ac_dir="$ac_val" | |
2516 | + pwd -W >/dev/null 2>&1 && ac_pwd_w="pwd -W" || ac_pwd_w=pwd | |
2517 | + until ac_val=`exec 2>/dev/null; cd "$ac_dir" && $ac_pwd_w` | |
2518 | + do | |
2519 | + ac_dir=`(dirname "$ac_dir") 2>/dev/null || | |
2520 | +$as_expr X"$ac_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ | |
2521 | + X"$ac_dir" : 'X\(//\)[^/]' \| \ | |
2522 | + X"$ac_dir" : 'X\(//\)$' \| \ | |
2523 | + X"$ac_dir" : 'X\(/\)' \| \ | |
2524 | + . : '\(.\)' 2>/dev/null || | |
2525 | +echo X"$ac_dir" | | |
2526 | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; } | |
2527 | + /^X\(\/\/\)[^/].*/{ s//\1/; q; } | |
2528 | + /^X\(\/\/\)$/{ s//\1/; q; } | |
2529 | + /^X\(\/\).*/{ s//\1/; q; } | |
2530 | + s/.*/./; q'` | |
2531 | + done | |
2532 | + ac_dir=`echo "$ac_dir" | sed 's?^[./]*??'` | |
2533 | + ac_val=`echo "$ac_val" | sed 's?/*$??'` | |
2534 | + canonical_prefix=`echo "$ac_val" | sed "s?^[./]*$ac_dir/*?$ac_val/?"' | |
2535 | + s?/*$??'` | |
2514 | 2536 | echo "$as_me:$LINENO: checking NLSPATH separator character used at runtime" >&5 |
2515 | 2537 | echo $ECHO_N "checking NLSPATH separator character used at runtime... $ECHO_C" >&6 |
2516 | 2538 | if test "${mingw_cv_host_path_separator+set}" = set; then |
@@ -4332,6 +4354,7 @@ done | ||
4332 | 4354 | GENCAT_OBJECTS=${GENCAT_OBJECTS}' mciconv.$(OBJEXT)' |
4333 | 4355 | GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcsource.$(OBJEXT)' |
4334 | 4356 | GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcmerge.$(OBJEXT)' |
4357 | + GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcutfsig.$(OBJEXT)' | |
4335 | 4358 | |
4336 | 4359 | # We also require `iconv', always expecting it to be pre-installed; we |
4337 | 4360 | # need this to check if we are using a standard `libc' implementation, |
@@ -5468,6 +5491,7 @@ s,@INSTALL_SCRIPT@,$INSTALL_SCRIPT,;t t | ||
5468 | 5491 | s,@INSTALL_DATA@,$INSTALL_DATA,;t t |
5469 | 5492 | s,@MAKE_DLL@,$MAKE_DLL,;t t |
5470 | 5493 | s,@DLLVERSION@,$DLLVERSION,;t t |
5494 | +s,@canonical_prefix@,$canonical_prefix,;t t | |
5471 | 5495 | s,@HOST_PATH_SEPARATOR@,$HOST_PATH_SEPARATOR,;t t |
5472 | 5496 | s,@REPLACEMENT_HEADERS@,$REPLACEMENT_HEADERS,;t t |
5473 | 5497 | s,@CPP@,$CPP,;t t |
@@ -5970,4 +5994,4 @@ if test "$no_create" != yes; then | ||
5970 | 5994 | fi |
5971 | 5995 | |
5972 | 5996 | # |
5973 | -# $RCSfile$Revision$: end of file | |
5997 | +# $RCSfile$Revision: 1.1.1.1 $: end of file |
@@ -65,9 +65,10 @@ | ||
65 | 65 | # |
66 | 66 | MINGW_AC_HOST_CONFIG_DLL([__MINGW_AC_PACKAGE_DLL_VERSION__]) |
67 | 67 | |
68 | -# We need to identify the appropriate PATH separator character, | |
69 | -# to be used in the default NLSPATH definition. | |
68 | +# We need to identify the canonical path prefix, and the appropriate | |
69 | +# PATH separator character, to be used in the default NLSPATH definition. | |
70 | 70 | # |
71 | + MINGW_AC_HOST_CANONICAL_PREFIX | |
71 | 72 | MINGW_AC_HOST_PATH_SEPARATOR([NLSPATH]) |
72 | 73 | |
73 | 74 | # Schedule an automatic header update, if we find a `unistd.h', |
@@ -108,6 +109,7 @@ | ||
108 | 109 | GENCAT_AC_OBJECTS_ADD([mciconv]) |
109 | 110 | GENCAT_AC_OBJECTS_ADD([mcsource]) |
110 | 111 | GENCAT_AC_OBJECTS_ADD([mcmerge]) |
112 | + GENCAT_AC_OBJECTS_ADD([mcutfsig]) | |
111 | 113 | |
112 | 114 | # We also require `iconv', always expecting it to be pre-installed; we |
113 | 115 | # need this to check if we are using a standard `libc' implementation, |
@@ -118,4 +120,4 @@ | ||
118 | 120 | AC_CONFIG_FILES([Makefile]) |
119 | 121 | AC_OUTPUT |
120 | 122 | # |
121 | -# $RCSfile$Revision$: end of file | |
123 | +# $RCSfile$Revision: 1.1.1.1 $: end of file |
@@ -44,7 +44,7 @@ | ||
44 | 44 | #define MSG_BAD_CATALOGUE 2, 1, "%s: %s: file is not a valid message catalogue\n" |
45 | 45 | #define MSG_UNKNOWN_CODESET 2, 2, "%s: %s: unknown codeset descriptor\n" |
46 | 46 | #define MSG_CODESET_CLASH 2, 3, "%s:%u: codeset `%s' conflicts with prior declaration\n" |
47 | -#define MSG_HAD_CODESET 2, 4, "%s:%u: codeset `%s' was previously declared here\n" | |
47 | +#define MSG_HAD_CODESET 2, 4, "%s:%u: codeset `%s' previously declared here\n" | |
48 | 48 | #define MSG_SETNUM_NOT_INCR 2, 5, "invalid set number: expecting > %d; got %d\n" |
49 | 49 | #define MSG_MSGNUM_NOT_INCR 2, 6, "invalid message number: expecting > %d; got %d\n" |
50 | 50 | #define MSG_REDEFINED 2, 7, "%s: %s:%u: redefinition of message %u in set %u\n" |
@@ -53,7 +53,11 @@ | ||
53 | 53 | #define MSG_TEXT_DISCARDED 3, 3, "%s:%u: incomplete message marked for deletion\n" |
54 | 54 | #define MSG_MISSING_NEWLINE 3, 4, "%s:%u: missing newline at end of file\n" |
55 | 55 | #define MSG_BAD_INDEX 3, 5, "invalid reference in message index" |
56 | +#define MSG_UTF_UNKNOWN 4, 1, "%s:unrecognisable encoding format\n" | |
57 | +#define MSG_UTF_SIZE_ERROR 4, 2, "%s:invalid byte count per code point; value was %d\n" | |
58 | +#define MSG_UTF_FRAME_ERROR 4, 3, "%s:%u:UTF-%u%cE input framing error\n" | |
59 | +#define MSG_UTF_CODESET 4, 4, "%s:input codeset identified as %s; conflicts with ...\n" | |
56 | 60 | /* ! |
57 | 61 | * !$ end of file |
58 | 62 | */ |
59 | -#endif /* !defined( GCMSGS_H ): $RCSfile$Revision: 1.2 $: end of file */ | |
63 | +#endif /* !defined( GCMSGS_H ): $RCSfile$Revision: 1.3 $: end of file */ |
@@ -0,0 +1,65 @@ | ||
1 | +#ifndef _MCUTFSIG_H_ | |
2 | +/* | |
3 | + * mcutfsig.h | |
4 | + * | |
5 | + * $Id$ | |
6 | + * | |
7 | + * Copyright (C) 2007, Keith Marshall | |
8 | + * | |
9 | + * Header file defining the `mc_utf_signature' function API, and a set | |
10 | + * of supporting macros, used for obtaining and manipulating an encoding | |
11 | + * `signature' for UTF-8, UTF-16 and UTF-32 encoded input files. | |
12 | + * | |
13 | + * Written by Keith Marshall <keithmarshall@users.sourceforge.net> | |
14 | + * Last Revision: 30-May-2007 | |
15 | + * | |
16 | + * | |
17 | + * This is free software. It is provided AS IS, in the hope that it may | |
18 | + * be useful, but WITHOUT WARRANTY OF ANY KIND, not even an IMPLIED WARRANTY | |
19 | + * of MERCHANTABILITY, nor of FITNESS FOR ANY PARTICULAR PURPOSE. | |
20 | + * | |
21 | + * Permission is granted to redistribute this software, either "as is" or | |
22 | + * in modified form, under the terms of the GNU General Public License, as | |
23 | + * published by the Free Software Foundation; either version 2, or (at your | |
24 | + * option) any later version. | |
25 | + * | |
26 | + * You should have received a copy of the GNU General Public License | |
27 | + * along with this software; see the file COPYING. If not, write to the | |
28 | + * Free Software Foundation, 51 Franklin St - Fifth Floor, Boston, | |
29 | + * MA 02110-1301, USA. | |
30 | + * | |
31 | + */ | |
32 | +#define _MCUTFSIG_H_ | |
33 | + | |
34 | +/* | |
35 | + * Flags used to designate the `endianness' of the input encoding, | |
36 | + * and to record presence or absence of a byte order mark. | |
37 | + * | |
38 | + */ | |
39 | +#define UTF_BIG_ENDIAN 0100 | |
40 | +#define UTF_LITTLE_ENDIAN 0200 | |
41 | +#define UTF_WITH_BYTE_ORDER_MARK 0400 | |
42 | + | |
43 | +/* | |
44 | + * Mask used to isolate a bit-field representing the number of bytes | |
45 | + * per encoding unit in the input stream. | |
46 | + * | |
47 | + */ | |
48 | +#define UTF_CODE_UNIT_SIZE_MASK 0007 | |
49 | + | |
50 | +/* | |
51 | + * Macros used to disambiguate the codeset name, wrt byte order. | |
52 | + * | |
53 | + */ | |
54 | +#define UTF_IS_MB(FLAGS) (UTF_CODE_SIZE(FLAGS) > 1) | |
55 | +#define UTF_CODE_SIZE(FLAGS) ((FLAGS) & UTF_CODE_UNIT_SIZE_MASK) | |
56 | +#define UTF_BYTE_ORDER(FLAGS) (UTF_IS_MB(FLAGS) ? UTF_SUFFIX(FLAGS) : '\0') | |
57 | +#define UTF_SUFFIX(FLAGS) (((FLAGS) & UTF_LITTLE_ENDIAN) ? 'L' : 'B') | |
58 | + | |
59 | +/* | |
60 | + * Function prototypes. | |
61 | + * | |
62 | + */ | |
63 | +unsigned short mc_utf_signature( unsigned char *stream ); | |
64 | + | |
65 | +#endif /* !defined(_MCUTFSIG_H_): $RCSfile$Revision$: end of file */ |
@@ -57,6 +57,7 @@ | ||
57 | 57 | #include <debug.h> |
58 | 58 | |
59 | 59 | #include <platform.h> |
60 | +#include <mcutfsig.h> | |
60 | 61 | |
61 | 62 | #ifdef DEBUG_BUFSIZ |
62 | 63 | # undef BUFSIZ |
@@ -205,9 +206,35 @@ char *mc_update_workspace( char *buf, char *cache, unsigned int count ) | ||
205 | 206 | return buf; |
206 | 207 | } |
207 | 208 | |
209 | +static inline | |
210 | +struct msgdict *mc_discard( struct msgdict *index, char *messages ) | |
211 | +{ | |
212 | + /* A helper function, to reclaim all memory allocated to a local | |
213 | + * message dictionary, prior to aborting compilation of the current | |
214 | + * message catalogue source file. | |
215 | + */ | |
216 | + while( index ) | |
217 | + { | |
218 | + /* Walk the linked list of dictionary index entries, (if any), | |
219 | + * releasing the memory block alloted to each individually. | |
220 | + */ | |
221 | + struct msgdict *next = index->link; | |
222 | + free( index ); | |
223 | + index = next; | |
224 | + } | |
225 | + if( messages ) | |
226 | + /* | |
227 | + * All of the indexed messages are collected into a single block, | |
228 | + * which is allocated, and so must be released, separately. | |
229 | + */ | |
230 | + free( messages ); | |
231 | + return index; | |
232 | +} | |
233 | + | |
208 | 234 | struct msgdict *mc_source( const char *input ) |
209 | 235 | { |
210 | 236 | # define CODESET_DECLARED codeset_decl_src, codeset_decl_lineno |
237 | +# define UTF_TYPE(ORDER) 8 * input_code_size, (ORDER) | |
211 | 238 | |
212 | 239 | dinvoke( int dtrace = 0; ) |
213 | 240 |
@@ -231,6 +258,9 @@ struct msgdict *mc_source( const char *input ) | ||
231 | 258 | static char *codeset = NULL; |
232 | 259 | static const char *codeset_decl_src = NULL; |
233 | 260 | static unsigned int codeset_decl_lineno = 0; |
261 | + | |
262 | + unsigned short input_encoding = 0, input_code_size; | |
263 | + | |
234 | 264 | static iconv_t iconv_map[2] = {(iconv_t)(-1), (iconv_t)(-1)}; |
235 | 265 | char *messages; off_t msgloc, headroom; |
236 | 266 | /* |
@@ -297,6 +327,101 @@ struct msgdict *mc_source( const char *input ) | ||
297 | 327 | char *p = buf; |
298 | 328 | int high_water_mark = count - ( count >> 2 ); |
299 | 329 | dfprintf(( stderr, "\n%s:%u:read %u byte%s", input, linenum, count, count == 1 ? "" : "s" )); |
330 | + | |
331 | + if( input_encoding == 0 ) | |
332 | + { | |
333 | + input_encoding = mc_utf_signature( buf ); | |
334 | + switch( input_code_size = input_encoding & UTF_CODE_UNIT_SIZE_MASK ) | |
335 | + { | |
336 | + case 1: | |
337 | + if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 ) | |
338 | + { | |
339 | + /* | |
340 | + * This is UTF-8 input encoding, affirmed by the presence of | |
341 | + * the byte order mark, (three bytes), which we must skip. | |
342 | + */ | |
343 | + p += 3; | |
344 | + count -= 3; | |
345 | + } | |
346 | + break; | |
347 | + | |
348 | + case 2: | |
349 | + case 4: | |
350 | + if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 ) | |
351 | + { | |
352 | + /* This is either UTF-16, or UTF-32, also affirmed by the BOM, | |
353 | + * which occupies the first code unit, so skip it. | |
354 | + */ | |
355 | + p += input_code_size; | |
356 | + count -= input_code_size; | |
357 | + } | |
358 | + break; | |
359 | + | |
360 | + default: | |
361 | + /* | |
362 | + * This isn't valid, for any recognisable codeset in the required | |
363 | + * POSIX Portable Character Set input context; diagnose, clean up, | |
364 | + * and bail out. | |
365 | + */ | |
366 | + dfputc(( '\n', stderr )); | |
367 | + fprintf( errmsg( MSG_UTF_UNKNOWN ), input ); | |
368 | + fprintf( errmsg( MSG_UTF_SIZE_ERROR ), input, input_code_size ); | |
369 | + free( messages ); | |
370 | + close( input_fd ); | |
371 | + return NULL; | |
372 | + } | |
373 | + | |
374 | + if( input_encoding > 1 ) | |
375 | + { | |
376 | + /* We've detected a UTF input encoding, which implicitly specifies | |
377 | + * the codeset of the messages defined within this source file. | |
378 | + */ | |
379 | + char utf_byte_order = UTF_BYTE_ORDER( input_encoding ); | |
380 | + sprintf( keyword, "UTF-%d%cE", 8 * input_code_size, utf_byte_order ); | |
381 | + | |
382 | + dfprintf(( stderr, "\n%s:", input )); | |
383 | + dinvoke( if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 ) ) | |
384 | + dfprintf(( stderr, "unicode byte order mark detected; " )); | |
385 | + dfprintf(( stderr, "encoding identified as %s", keyword )); | |
386 | + | |
387 | + if( codeset != NULL ) | |
388 | + { | |
389 | + /* We could coalesce these two conditions into a single test, | |
390 | + * but we choose to nest them thus, to facilitate a possible | |
391 | + * future change, to support codeset alternation. | |
392 | + */ | |
393 | + if( strcmp( keyword, codeset ) != 0 ) | |
394 | + { | |
395 | + /* The detected UTF input encoding is not compatible with the | |
396 | + * previously declared codeset of the messages in the catalogue; | |
397 | + * diagnose, and skip this source file. | |
398 | + */ | |
399 | + dfputc(( '\n', stderr )); | |
400 | + fprintf( errmsg( MSG_UTF_CODESET ), input, keyword ); | |
401 | + fprintf( errmsg( MSG_HAD_CODESET ), CODESET_DECLARED, codeset ); | |
402 | + free( messages ); | |
403 | + close( input_fd ); | |
404 | + return NULL; | |
405 | + } | |
406 | + } | |
407 | + | |
408 | + else | |
409 | + { | |
410 | + /* We don't yet have a codeset declaration; establish one implicitly, | |
411 | + * based on the identified input encoding. | |
412 | + */ | |
413 | + id = strdup( keyword ); | |
414 | + if( (codeset = map_codeset( iconv_map, id, "wchar_t" )) == NULL ) | |
415 | + { | |
416 | + free( id ); | |
417 | + } | |
418 | + | |
419 | + else | |
420 | + codeset_decl_src = input; | |
421 | + } | |
422 | + } | |
423 | + } | |
424 | + | |
300 | 425 | while( count > 0 ) |
301 | 426 | { |
302 | 427 | /* ... scanning character by character, |
@@ -325,14 +450,64 @@ struct msgdict *mc_source( const char *input ) | ||
325 | 450 | * transforming to the wide character domain, for local processing. |
326 | 451 | */ |
327 | 452 | p += ((skip = iconv_mbtowc( &c, p, count )) > 0) ? skip : 0; |
453 | + | |
454 | + /* For UTF-16 or UTF-32 input encodings, the `skip' count *must* | |
455 | + * match the codeset size, ... | |
456 | + */ | |
457 | + if( (input_code_size > 1) && (skip != input_code_size) ) | |
458 | + { | |
459 | + /* ... or we have a framing error; diagnose, | |
460 | + * and discard this input stream. | |
461 | + */ | |
462 | + dfputc(( '\n', stderr )); | |
463 | + fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, UTF_TYPE( UTF_BYTE_ORDER( input_encoding ))); | |
464 | + return mc_discard( head, messages ); | |
465 | + } | |
328 | 466 | } |
329 | 467 | |
330 | 468 | else |
331 | 469 | { |
332 | 470 | /* We are parsing context which is defined in the POSIX, |
333 | - * or "C" locale, so read single byte character sequences. | |
471 | + * or "C" locale, so read single byte character sequences, | |
472 | + * but stripping out any padding NULs required to fill the | |
473 | + * input stream to a UTF-16 or UTF-32 framing boundary. | |
334 | 474 | */ |
475 | + int utf_skip = input_code_size - 1; | |
476 | + if( (utf_skip > 0) && ((input_encoding & UTF_BIG_ENDIAN) != 0) ) | |
477 | + { | |
478 | + /* Big-Endian Unicode should have padding NULs before the | |
479 | + * POSIX `C' locale byte required. | |
480 | + */ | |
481 | + while( (*p == '\0') && utf_skip-- && count-- ) | |
482 | + ++p; | |
483 | + if( (utf_skip > 0) || (*p == '\0') ) | |
484 | + { | |
485 | + /* Diagnose and bail out, if the number of padding NULs | |
486 | + * wasn't what we expected. | |
487 | + */ | |
488 | + dfputc(( '\n', stderr )); | |
489 | + fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, UTF_TYPE( 'B' )); | |
490 | + return mc_discard( head, messages ); | |
491 | + } | |
492 | + } | |
335 | 493 | c = (wchar_t)(*p++); |
494 | + if( (utf_skip > 0) && ((input_encoding & UTF_LITTLE_ENDIAN) != 0) ) | |
495 | + { | |
496 | + /* Little-Endian Unicode should have the padding NULs after | |
497 | + * this significant byte. | |
498 | + */ | |
499 | + while( (*p == '\0') && utf_skip-- && count-- ) | |
500 | + ++p; | |
501 | + if( (utf_skip > 0) || (*p == '\0') ) | |
502 | + { | |
503 | + /* Diagnose and bail out, if the number of padding NULs | |
504 | + * wasn't what we expected. | |
505 | + */ | |
506 | + dfputc(( '\n', stderr )); | |
507 | + fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, UTF_TYPE( 'L' )); | |
508 | + return mc_discard( head, messages ); | |
509 | + } | |
510 | + } | |
336 | 511 | } |
337 | 512 | |
338 | 513 | if( skip > 0 ) |
@@ -460,12 +635,13 @@ struct msgdict *mc_source( const char *input ) | ||
460 | 635 | { |
461 | 636 | if( strcmp( codeset, id ) != 0 ) |
462 | 637 | { |
638 | + dfputc(( '\n', stderr )); | |
463 | 639 | fprintf( errmsg( MSG_CODESET_CLASH ), input, linenum, id ); |
464 | 640 | fprintf( errmsg( MSG_HAD_CODESET ), CODESET_DECLARED, codeset ); |
465 | 641 | } |
466 | 642 | free( id ); |
467 | 643 | } |
468 | - dfprintf(( stderr, "; declare %s", keyword )); | |
644 | + dfprintf(( stderr, "\n%s:%u:declare %s", input, linenum, keyword )); | |
469 | 645 | } |
470 | 646 | } |
471 | 647 |
@@ -1087,4 +1263,4 @@ struct msgdict *mc_source( const char *input ) | ||
1087 | 1263 | return head; |
1088 | 1264 | } |
1089 | 1265 | |
1090 | -/* $RCSfile$Revision: 1.9 $: end of file */ | |
1266 | +/* $RCSfile$Revision: 1.10 $: end of file */ |
@@ -0,0 +1,118 @@ | ||
1 | +/* | |
2 | + * mcutfsig.c | |
3 | + * | |
4 | + * $Id$ | |
5 | + * | |
6 | + * Copyright (C) 2007, Keith Marshall | |
7 | + * | |
8 | + * This file implements the `mc_utf_signature' function, which is used | |
9 | + * by `gencat', to identify message definition source files which appear | |
10 | + * to exhibit any recognisable standard of Unicode encoding. | |
11 | + * | |
12 | + * Written by Keith Marshall <keithmarshall@users.sourceforge.net> | |
13 | + * Last Revision: 22-May-2007 | |
14 | + * | |
15 | + * | |
16 | + * This is free software. It is provided AS IS, in the hope that it may | |
17 | + * be useful, but WITHOUT WARRANTY OF ANY KIND, not even an IMPLIED WARRANTY | |
18 | + * of MERCHANTABILITY, nor of FITNESS FOR ANY PARTICULAR PURPOSE. | |
19 | + * | |
20 | + * Permission is granted to redistribute this software, either "as is" or | |
21 | + * in modified form, under the terms of the GNU General Public License, as | |
22 | + * published by the Free Software Foundation; either version 2, or (at your | |
23 | + * option) any later version. | |
24 | + * | |
25 | + * You should have received a copy of the GNU General Public License | |
26 | + * along with this software; see the file COPYING. If not, write to the | |
27 | + * Free Software Foundation, 51 Franklin St - Fifth Floor, Boston, | |
28 | + * MA 02110-1301, USA. | |
29 | + * | |
30 | + */ | |
31 | +#include <mcutfsig.h> | |
32 | + | |
33 | +unsigned short mc_utf_signature( unsigned char *stream ) | |
34 | +{ | |
35 | + /* Inspect the first few bytes of the specified data stream; | |
36 | + * attempt to identify a potential Unicode encoding signature, | |
37 | + * defaulting to non-specific single byte encoding units. | |
38 | + */ | |
39 | + unsigned short signature = 1; | |
40 | + /* | |
41 | + * The first character in the input stream must not be NUL, | |
42 | + * and must be a member of the POSIX Portable Character Set; | |
43 | + * if it isn't, then it may indicate a Unicode stream. | |
44 | + */ | |
45 | + if( *stream == 0 ) | |
46 | + { | |
47 | + /* An initial NUL byte anticipates a big-endian Unicode stream; | |
48 | + * one such byte implies UTF-16, without a Byte Order Mark, while | |
49 | + * two such followed by the big-endian form of the BOM, or three | |
50 | + * without a BOM, indicates UTF-32. | |
51 | + */ | |
52 | + int count = 4; | |
53 | + while( count-- && (*stream++ == '\0') ) | |
54 | + ++signature; | |
55 | + signature += UTF_BIG_ENDIAN; | |
56 | + } | |
57 | + if( (*stream == 0xef) || ((*stream & 0xfe) == 0xfe) ) | |
58 | + { | |
59 | + /* This looks like it might be a Unicode Byte Order Mark; | |
60 | + * identify the UTF encoding standard, if any, which it represents. | |
61 | + */ | |
62 | + unsigned bom = *stream++ << 8; bom |= *stream++; | |
63 | + switch( bom ) | |
64 | + { | |
65 | + case 0xfffe: | |
66 | + /* | |
67 | + * This is the BOM signature for a little-endian Unicode stream; | |
68 | + * the first byte has already been included in the initial size | |
69 | + * assigned for the encoding unit; adjust this to accommodate the | |
70 | + * second byte, and incorporate the little-endian flag. | |
71 | + */ | |
72 | + signature += UTF_WITH_BYTE_ORDER_MARK + UTF_LITTLE_ENDIAN + 1; | |
73 | + if( *stream == '\0' ) | |
74 | + { | |
75 | + int count = 4; | |
76 | + while( count-- && (*stream++ == '\0') ) | |
77 | + ++signature; | |
78 | + } | |
79 | + break; | |
80 | + | |
81 | + case 0xfeff: | |
82 | + /* | |
83 | + * This is the BOM signature for a big-endian Unicode stream; | |
84 | + * if preceded by two NULs, (already counted), then it is UTF-32, | |
85 | + * else it is UTF-16. In either case, adding an additional one | |
86 | + * to the accumulated size of the encoding unit yields the | |
87 | + * desired result, since the first byte of the BOM, and | |
88 | + * any leading NULs, have already been counted. | |
89 | + */ | |
90 | + signature += UTF_WITH_BYTE_ORDER_MARK + UTF_BIG_ENDIAN + 1; | |
91 | + break; | |
92 | + | |
93 | + case 0xefbb: | |
94 | + /* | |
95 | + * Provided it's followed by one further `0xbf' byte, this is the | |
96 | + * BOM used as a signature for a UTF-8 encoded stream; it becomes | |
97 | + * invalid, if there were any leading NUL bytes. | |
98 | + */ | |
99 | + if( (signature == 1) && (*stream++ == (unsigned char)('\xbf')) ) | |
100 | + signature |= UTF_WITH_BYTE_ORDER_MARK; | |
101 | + } | |
102 | + } | |
103 | + else if( (signature == 1) && (*++stream == 0) ) | |
104 | + { | |
105 | + /* NUL as the second byte in the input stream indicates a probable | |
106 | + * little-endian Unicode input stream, although this is not indicated | |
107 | + * by a Byte Order Mark; count the trailing NULs, to determine if we | |
108 | + * should interpret it as UTF-16LE, or as UTF-32LE. | |
109 | + */ | |
110 | + int count = 4; | |
111 | + while( count-- && (*stream++ == '\0') ) | |
112 | + ++signature; | |
113 | + signature += UTF_LITTLE_ENDIAN; | |
114 | + } | |
115 | + return signature; | |
116 | +} | |
117 | + | |
118 | +/* $RCSfile$Revision$: end of file */ |