• R/O
  • HTTP
  • SSH
  • HTTPS

Commit

Tags
No Tags

Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

POSIX.1 National Language Support API for MinGW


Commit MetaInfo

Revisión: b7a9785f64e65c62671dcba922e77872d0324bae (tree)
Tiempo: 2007-06-19 07:36:08
Autor: Keith Marshall <keithmarshall@user...>
Committer: Keith Marshall

Log Message

Handle unicode input streams.

Cambiar Resumen

Diferencia incremental

--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,37 @@
1+2007-06-18 Keith Marshall <keithmarshall@users.sourceforge.net>
2+
3+ Handle unicode input streams.
4+
5+ * mcutfsig.c, include/mcutfsig.h: New files; they define the
6+ interface and implementation for...
7+ (mc_utf_signature): ...this new function.
8+
9+ * mcsource.c (mc_discard): New static function.
10+ (mc_source): Use it to clean up heap on abnormal termination.
11+ Include `mcutfsig.h'; use `mc_utf_signature' to identify unicode
12+ streams; map codeset using new local variables `input_encoding'
13+ and `input_code_size'; use them with...
14+ (UTF_TYPE): ...this new macro, to parse input accordingly.
15+
16+ * include/gcmsgs.h (MSG_HAD_CODESET): New wording.
17+ (MSG_UTF_CODESET, MSG_UTF_UNKNOWN): New message definitions.
18+ (MSG_UTF_SIZE_ERROR, MSG_UTF_FRAME_ERROR): Likewise.
19+
20+ * Makefile.in (canonical_prefix): New macro.
21+ (HEADER_TRANSFORM_SCRIPT): Use it.
22+ (GENCAT_HEADERS): Add `mcutfsig.h'.
23+ (GENCAT_SOURCES): Add `mcutfsig.c'.
24+ (target): Macro renamed as...
25+ (tool_prefix): ...this.
26+ (install-progs): Updated to use it.
27+ (clean): Remove all locally generated headers.
28+
29+ * aclocal.m4 (MINGW_AC_HOST_CANONICAL_PREFIX): New macro.
30+ (MSYS_AC_CANONICAL_PATH): New macro; copied from `man' sources.
31+ * configure.ac: Use them.
32+ (GENCAT_AC_OBJECTS_ADD): Add `mcutfsig'.
33+ * configure: Regenerated.
34+
135 2007-06-01 Keith Marshall <keithmarshall@users.sourceforge.net>
236
337 * repl/include/langinfo.h (LC_CTYPE): Include locale.h, to define.
--- a/Makefile.in
+++ b/Makefile.in
@@ -32,6 +32,8 @@ VPATH = ${srcdir}:${srcdir}/include:${srcdir}/repl
3232 prefix = @prefix@
3333 exec_prefix = @exec_prefix@
3434
35+canonical_prefix = @canonical_prefix@
36+
3537 bindir = @bindir@
3638 libdir = @libdir@
3739 includedir = @includedir@
@@ -85,8 +87,8 @@ CATGETS_HEADERS = mctab.h msgcat.h
8587 CATGETS_SOURCES = catopen.c catgets.c catclose.c mctab.c mcref.c mcfree.c
8688 CATGETS_OBJECTS = $(subst .c,.$(OBJEXT),$(CATGETS_SOURCES))
8789
88-GENCAT_HEADERS = gencat.h gcmsgs.h
89-GENCAT_SOURCES = gencat.c mcload.c mcsource.c mciconv.c mcmerge.c
90+GENCAT_HEADERS = gencat.h gcmsgs.h mcutfsig.h
91+GENCAT_SOURCES = gencat.c mcload.c mcsource.c mciconv.c mcmerge.c mcutfsig.c
9092 GENCAT_OBJECTS = @GENCAT_OBJECTS@
9193
9294 OTHER_SOURCES = COPYING ChangeLog mkinstalldirs install-sh $(MISC_SOURCES)
@@ -110,7 +112,7 @@ INSTALL_PROGS = gendefs.awk
110112 $(LN_S) $< $@
111113
112114 HEADER_TRANSFORM_SCRIPT = \
113- -e 's,@HOST_PREFIX@,${prefix},g' \
115+ -e 's,@HOST_PREFIX@,${canonical_prefix},g' \
114116 -e 's,@PATH_SEPARATOR_INTERNAL@,$(PATH_SEPARATOR_INTERNAL),g' \
115117 -e 's,$(subst .,\.,$@)\.in$$,$@ -- $(AUTOGENERATED),'
116118
@@ -146,7 +148,7 @@ $(GENCAT_OBJECTS): $(GENCAT_HEADERS) $(CATGETS_OBJECTS)
146148 ## ============== ##
147149
148150 mkinstalldirs = ${srcdir}/mkinstalldirs
149-target = @target_alias@`test -n "@target_alias@" && echo "-"`
151+tool_prefix = @target_alias@`test -n "@target_alias@" && echo "-"`
150152
151153 INSTALL = @INSTALL@
152154 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -159,10 +161,10 @@ install: install-headers install-progs install-libs
159161 install-progs: all
160162 $(mkinstalldirs) --require ${exec_prefix} ${bindir} ${libdir}
161163 for prog in $(BUILD_PROGS); do \
162- $(INSTALL_PROGRAM) $$prog ${bindir}/$(target)$$prog; \
164+ $(INSTALL_PROGRAM) $$prog ${bindir}/$(tool_prefix)$$prog; \
163165 done
164166 for prog in $(INSTALL_PROGS); do \
165- $(INSTALL_PROGRAM) ${srcdir}/$$prog ${bindir}/$(target)$$prog; \
167+ $(INSTALL_PROGRAM) ${srcdir}/$$prog ${bindir}/$(tool_prefix)$$prog; \
166168 done
167169
168170 install-libs: all-libs $(all-dll)
@@ -230,9 +232,10 @@ bindist: all
230232 ## ========== ##
231233
232234 clean:
235+ rm -f nlspath.h $(LOCAL_HEADERS) $(REPLACEMENT_HEADERS)
233236 rm -f *.$(OBJEXT) *~ gencat$(EXEEXT)
234237
235238 distclean: clean
236239 rm -rf Makefile config.h config.[ls]* autom4te.cache
237240
238-# $RCSfile$Revision$: end of file
241+# $RCSfile$Revision: 1.1.1.1 $: end of file
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -15,6 +15,42 @@ AC_DEFUN([MINGW_AC_WIN32_NATIVE_HOST],
1515 #endif]], [mingw_cv_win32_host=no], [mingw_cv_win32_host=yes]))dnl
1616 ])# MINGW_AC_WIN32_NATIVE_HOST
1717
18+# MINGW_AC_HOST_CANONICAL_PREFIX
19+# ------------------------------
20+# Set the AC_SUBST variable `canonical_prefix' to the canonical form
21+# of `prefix', as applicable for a mingw32 host; `prefix' is staged in
22+# `mingw_prefix', because MSYS_AC_CANONICAL_PATH overwrites `ac_val'.
23+AC_DEFUN([MINGW_AC_HOST_CANONICAL_PREFIX],
24+[AC_SUBST([canonical_prefix])dnl
25+ mingw_prefix=$prefix; test "x$mingw_prefix" = xNONE && mingw_prefix=$ac_default_prefix
26+ MSYS_AC_CANONICAL_PATH([canonical_prefix],[$mingw_prefix])dnl
27+])# MINGW_AC_HOST_CANONICAL_PREFIX
28+
29+# MSYS_AC_CANONICAL_PATH( VAR, PATHNAME )
30+# ---------------------------------------
31+# Set VAR to the canonically resolved absolute equivalent of PATHNAME,
32+# (which may be a relative path, and need not refer to any existing entity).
33+#
34+# On Win32-MSYS build hosts, the returned path is resolved to its true
35+# native Win32 path name, (but with slashes, not backslashes).
36+#
37+# On any other system, it is simply the result which would be obtained
38+# if PATHNAME represented an existing directory, and the pwd command was
39+# executed in that directory.
40+# CAUTION: overwrites shell variables `ac_dir', `ac_val' and `ac_pwd_w'.
41+AC_DEFUN([MSYS_AC_CANONICAL_PATH],
42+[ac_dir="$2"
43+ pwd -W >/dev/null 2>&1 && ac_pwd_w="pwd -W" || ac_pwd_w=pwd
44+ until ac_val=`exec 2>/dev/null; cd "$ac_dir" && $ac_pwd_w`
45+ do
46+ ac_dir=`AS_DIRNAME(["$ac_dir"])`
47+ done
48+ ac_dir=`echo "$ac_dir" | sed 's?^[[./]]*??'`
49+ ac_val=`echo "$ac_val" | sed 's?/*$[]??'`
50+ $1=`echo "$2" | sed "s?^[[./]]*$ac_dir/*?$ac_val/?"'
51+ s?/*$[]??'`dnl
52+])# MSYS_AC_CANONICAL_PATH
53+
1854 # MINGW_AC_CHECK_HEADER( LISTVAR, HEADER )
1955 # ----------------------------------------
2056 # Invoke AC_CHECK_HEADER, to check availability of HEADER;
@@ -152,3 +188,5 @@ AC_DEFUN([CATGETS_AC_CONFIG_VERSION_DEFINE],
152188 [AC_DEFINE_UNQUOTED([$1],[`IFS=.;set x $PACKAGE_VERSION;echo ${$3}`],
153189 [Define numerically to the catgets $2 version number])dnl
154190 ])# CATGETS_AC_CONFIG_VERSION_DEFINE
191+
192+# $RCSfile$Revision$: end of file
--- a/configure
+++ b/configure
@@ -310,7 +310,7 @@ ac_includes_default="\
310310 # include <unistd.h>
311311 #endif"
312312
313-ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT LN_S AR ac_ct_AR INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA MAKE_DLL DLLVERSION HOST_PATH_SEPARATOR REPLACEMENT_HEADERS CPP EGREP LOCAL_HEADERS GENCAT_OBJECTS INCICONV LIBICONV LIBOBJS LTLIBOBJS'
313+ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT LN_S AR ac_ct_AR INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA MAKE_DLL DLLVERSION canonical_prefix HOST_PATH_SEPARATOR REPLACEMENT_HEADERS CPP EGREP LOCAL_HEADERS GENCAT_OBJECTS INCICONV LIBICONV LIBOBJS LTLIBOBJS'
314314 ac_subst_files=''
315315
316316 # Initialize some variables set by options.
@@ -2508,9 +2508,31 @@ echo $ECHO_N "checking release version for mingw32 DLLs... $ECHO_C" >&6
25082508 echo "$as_me:$LINENO: result: ${DLLVERSION}" >&5
25092509 echo "${ECHO_T}${DLLVERSION}" >&6
25102510
2511-# We need to identify the appropriate PATH separator character,
2512-# to be used in the default NLSPATH definition.
2511+# We need to identify the canonical path prefix, and the appropriate
2512+# PATH separator character, to be used in the default NLSPATH definition.
25132513 #
2514+ ac_val=$prefix; test "x$ac_val" = xNONE && ac_val=$ac_default_prefix
2515+ ac_dir="$ac_val"
2516+ pwd -W >/dev/null 2>&1 && ac_pwd_w="pwd -W" || ac_pwd_w=pwd
2517+ until ac_val=`exec 2>/dev/null; cd "$ac_dir" && $ac_pwd_w`
2518+ do
2519+ ac_dir=`(dirname "$ac_dir") 2>/dev/null ||
2520+$as_expr X"$ac_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
2521+ X"$ac_dir" : 'X\(//\)[^/]' \| \
2522+ X"$ac_dir" : 'X\(//\)$' \| \
2523+ X"$ac_dir" : 'X\(/\)' \| \
2524+ . : '\(.\)' 2>/dev/null ||
2525+echo X"$ac_dir" |
2526+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ s//\1/; q; }
2527+ /^X\(\/\/\)[^/].*/{ s//\1/; q; }
2528+ /^X\(\/\/\)$/{ s//\1/; q; }
2529+ /^X\(\/\).*/{ s//\1/; q; }
2530+ s/.*/./; q'`
2531+ done
2532+ ac_dir=`echo "$ac_dir" | sed 's?^[./]*??'`
2533+ ac_val=`echo "$ac_val" | sed 's?/*$??'`
2534+ canonical_prefix=`echo "$ac_val" | sed "s?^[./]*$ac_dir/*?$ac_val/?"'
2535+ s?/*$??'`
25142536 echo "$as_me:$LINENO: checking NLSPATH separator character used at runtime" >&5
25152537 echo $ECHO_N "checking NLSPATH separator character used at runtime... $ECHO_C" >&6
25162538 if test "${mingw_cv_host_path_separator+set}" = set; then
@@ -4332,6 +4354,7 @@ done
43324354 GENCAT_OBJECTS=${GENCAT_OBJECTS}' mciconv.$(OBJEXT)'
43334355 GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcsource.$(OBJEXT)'
43344356 GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcmerge.$(OBJEXT)'
4357+ GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcutfsig.$(OBJEXT)'
43354358
43364359 # We also require `iconv', always expecting it to be pre-installed; we
43374360 # need this to check if we are using a standard `libc' implementation,
@@ -5468,6 +5491,7 @@ s,@INSTALL_SCRIPT@,$INSTALL_SCRIPT,;t t
54685491 s,@INSTALL_DATA@,$INSTALL_DATA,;t t
54695492 s,@MAKE_DLL@,$MAKE_DLL,;t t
54705493 s,@DLLVERSION@,$DLLVERSION,;t t
5494+s,@canonical_prefix@,$canonical_prefix,;t t
54715495 s,@HOST_PATH_SEPARATOR@,$HOST_PATH_SEPARATOR,;t t
54725496 s,@REPLACEMENT_HEADERS@,$REPLACEMENT_HEADERS,;t t
54735497 s,@CPP@,$CPP,;t t
@@ -5970,4 +5994,4 @@ if test "$no_create" != yes; then
59705994 fi
59715995
59725996 #
5973-# $RCSfile$Revision$: end of file
5997+# $RCSfile$Revision: 1.1.1.1 $: end of file
--- a/configure.ac
+++ b/configure.ac
@@ -65,9 +65,10 @@
6565 #
6666 MINGW_AC_HOST_CONFIG_DLL([__MINGW_AC_PACKAGE_DLL_VERSION__])
6767
68-# We need to identify the appropriate PATH separator character,
69-# to be used in the default NLSPATH definition.
68+# We need to identify the canonical path prefix, and the appropriate
69+# PATH separator character, to be used in the default NLSPATH definition.
7070 #
71+ MINGW_AC_HOST_CANONICAL_PREFIX
7172 MINGW_AC_HOST_PATH_SEPARATOR([NLSPATH])
7273
7374 # Schedule an automatic header update, if we find a `unistd.h',
@@ -108,6 +109,7 @@
108109 GENCAT_AC_OBJECTS_ADD([mciconv])
109110 GENCAT_AC_OBJECTS_ADD([mcsource])
110111 GENCAT_AC_OBJECTS_ADD([mcmerge])
112+ GENCAT_AC_OBJECTS_ADD([mcutfsig])
111113
112114 # We also require `iconv', always expecting it to be pre-installed; we
113115 # need this to check if we are using a standard `libc' implementation,
@@ -118,4 +120,4 @@
118120 AC_CONFIG_FILES([Makefile])
119121 AC_OUTPUT
120122 #
121-# $RCSfile$Revision$: end of file
123+# $RCSfile$Revision: 1.1.1.1 $: end of file
--- a/include/gcmsgs.h
+++ b/include/gcmsgs.h
@@ -44,7 +44,7 @@
4444 #define MSG_BAD_CATALOGUE 2, 1, "%s: %s: file is not a valid message catalogue\n"
4545 #define MSG_UNKNOWN_CODESET 2, 2, "%s: %s: unknown codeset descriptor\n"
4646 #define MSG_CODESET_CLASH 2, 3, "%s:%u: codeset `%s' conflicts with prior declaration\n"
47-#define MSG_HAD_CODESET 2, 4, "%s:%u: codeset `%s' was previously declared here\n"
47+#define MSG_HAD_CODESET 2, 4, "%s:%u: codeset `%s' previously declared here\n"
4848 #define MSG_SETNUM_NOT_INCR 2, 5, "invalid set number: expecting > %d; got %d\n"
4949 #define MSG_MSGNUM_NOT_INCR 2, 6, "invalid message number: expecting > %d; got %d\n"
5050 #define MSG_REDEFINED 2, 7, "%s: %s:%u: redefinition of message %u in set %u\n"
@@ -53,7 +53,11 @@
5353 #define MSG_TEXT_DISCARDED 3, 3, "%s:%u: incomplete message marked for deletion\n"
5454 #define MSG_MISSING_NEWLINE 3, 4, "%s:%u: missing newline at end of file\n"
5555 #define MSG_BAD_INDEX 3, 5, "invalid reference in message index"
56+#define MSG_UTF_UNKNOWN 4, 1, "%s:unrecognisable encoding format\n"
57+#define MSG_UTF_SIZE_ERROR 4, 2, "%s:invalid byte count per code point; value was %d\n"
58+#define MSG_UTF_FRAME_ERROR 4, 3, "%s:%u:UTF-%u%cE input framing error\n"
59+#define MSG_UTF_CODESET 4, 4, "%s:input codeset identified as %s; conflicts with ...\n"
5660 /* !
5761 * !$ end of file
5862 */
59-#endif /* !defined( GCMSGS_H ): $RCSfile$Revision: 1.2 $: end of file */
63+#endif /* !defined( GCMSGS_H ): $RCSfile$Revision: 1.3 $: end of file */
--- /dev/null
+++ b/include/mcutfsig.h
@@ -0,0 +1,65 @@
1+#ifndef _MCUTFSIG_H_
2+/*
3+ * mcutfsig.h
4+ *
5+ * $Id$
6+ *
7+ * Copyright (C) 2007, Keith Marshall
8+ *
9+ * Header file defining the `mc_utf_signature' function API, and a set
10+ * of supporting macros, used for obtaining and manipulating an encoding
11+ * `signature' for UTF-8, UTF-16 and UTF-32 encoded input files.
12+ *
13+ * Written by Keith Marshall <keithmarshall@users.sourceforge.net>
14+ * Last Revision: 30-May-2007
15+ *
16+ *
17+ * This is free software. It is provided AS IS, in the hope that it may
18+ * be useful, but WITHOUT WARRANTY OF ANY KIND, not even an IMPLIED WARRANTY
19+ * of MERCHANTABILITY, nor of FITNESS FOR ANY PARTICULAR PURPOSE.
20+ *
21+ * Permission is granted to redistribute this software, either "as is" or
22+ * in modified form, under the terms of the GNU General Public License, as
23+ * published by the Free Software Foundation; either version 2, or (at your
24+ * option) any later version.
25+ *
26+ * You should have received a copy of the GNU General Public License
27+ * along with this software; see the file COPYING. If not, write to the
28+ * Free Software Foundation, 51 Franklin St - Fifth Floor, Boston,
29+ * MA 02110-1301, USA.
30+ *
31+ */
32+#define _MCUTFSIG_H_
33+
34+/*
35+ * Flags used to designate the `endianness' of the input encoding,
36+ * and to record presence or absence of a byte order mark.
37+ *
38+ */
39+#define UTF_BIG_ENDIAN 0100
40+#define UTF_LITTLE_ENDIAN 0200
41+#define UTF_WITH_BYTE_ORDER_MARK 0400
42+
43+/*
44+ * Mask used to isolate a bit-field representing the number of bytes
45+ * per encoding unit in the input stream.
46+ *
47+ */
48+#define UTF_CODE_UNIT_SIZE_MASK 0007
49+
50+/*
51+ * Macros used to disambiguate the codeset name, wrt byte order.
52+ * UTF_BYTE_ORDER yields 'L' or 'B' for multibyte code units, else NUL.
53+ */
54+#define UTF_IS_MB(FLAGS) (UTF_CODE_SIZE(FLAGS) > 1)
55+#define UTF_CODE_SIZE(FLAGS) ((FLAGS) & UTF_CODE_UNIT_SIZE_MASK)
56+#define UTF_BYTE_ORDER(FLAGS) (UTF_IS_MB(FLAGS) ? UTF_SUFFIX(FLAGS) : '\0')
57+#define UTF_SUFFIX(FLAGS) (((FLAGS) & UTF_LITTLE_ENDIAN) ? 'L' : 'B')
58+
59+/*
60+ * Function prototypes; `mc_utf_signature' is implemented in mcutfsig.c,
61+ * and is called by `mc_source' to classify its input stream encoding.
62+ */
63+unsigned short mc_utf_signature( unsigned char *stream );
64+
65+#endif /* !defined(_MCUTFSIG_H_): $RCSfile$Revision$: end of file */
--- a/mcsource.c
+++ b/mcsource.c
@@ -57,6 +57,7 @@
5757 #include <debug.h>
5858
5959 #include <platform.h>
60+#include <mcutfsig.h>
6061
6162 #ifdef DEBUG_BUFSIZ
6263 # undef BUFSIZ
@@ -205,9 +206,35 @@ char *mc_update_workspace( char *buf, char *cache, unsigned int count )
205206 return buf;
206207 }
207208
209+static inline
210+struct msgdict *mc_discard( struct msgdict *index, char *messages )
211+{
212+ /* A helper function, to reclaim all memory allocated to a local
213+ * message dictionary, prior to aborting compilation of the current
214+ * message catalogue source file; the value returned is always NULL.
215+ */
216+ while( index )
217+ {
218+ /* Walk the linked list of dictionary index entries, (if any),
219+ * releasing the memory block allotted to each individually.
220+ */
221+ struct msgdict *next = index->link;
222+ free( index );
223+ index = next;
224+ }
225+ if( messages )
226+ /*
227+ * All of the indexed messages are collected into a single block,
228+ * which is allocated, and so must be released, separately.
229+ */
230+ free( messages );
231+ return index;
232+}
233+
208234 struct msgdict *mc_source( const char *input )
209235 {
210236 # define CODESET_DECLARED codeset_decl_src, codeset_decl_lineno
237+# define UTF_TYPE(ORDER) 8 * input_code_size, (ORDER)
211238
212239 dinvoke( int dtrace = 0; )
213240
@@ -231,6 +258,9 @@ struct msgdict *mc_source( const char *input )
231258 static char *codeset = NULL;
232259 static const char *codeset_decl_src = NULL;
233260 static unsigned int codeset_decl_lineno = 0;
261+
262+ unsigned short input_encoding = 0, input_code_size;
263+
234264 static iconv_t iconv_map[2] = {(iconv_t)(-1), (iconv_t)(-1)};
235265 char *messages; off_t msgloc, headroom;
236266 /*
@@ -297,6 +327,101 @@ struct msgdict *mc_source( const char *input )
297327 char *p = buf;
298328 int high_water_mark = count - ( count >> 2 );
299329 dfprintf(( stderr, "\n%s:%u:read %u byte%s", input, linenum, count, count == 1 ? "" : "s" ));
330+
331+ if( input_encoding == 0 )
332+ {
333+ input_encoding = mc_utf_signature( buf );
334+ switch( input_code_size = input_encoding & UTF_CODE_UNIT_SIZE_MASK )
335+ {
336+ case 1:
337+ if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 )
338+ {
339+ /*
340+ * This is UTF-8 input encoding, affirmed by the presence of
341+ * the byte order mark, (three bytes), which we must skip.
342+ */
343+ p += 3;
344+ count -= 3;
345+ }
346+ break;
347+
348+ case 2:
349+ case 4:
350+ if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 )
351+ {
352+ /* This is either UTF-16, or UTF-32, also affirmed by the BOM,
353+ * which occupies the first code unit, so skip it.
354+ */
355+ p += input_code_size;
356+ count -= input_code_size;
357+ }
358+ break;
359+
360+ default:
361+ /*
362+ * This isn't valid, for any recognisable codeset in the required
363+ * POSIX Portable Character Set input context; diagnose, clean up,
364+ * and bail out.
365+ */
366+ dfputc(( '\n', stderr ));
367+ fprintf( errmsg( MSG_UTF_UNKNOWN ), input );
368+ fprintf( errmsg( MSG_UTF_SIZE_ERROR ), input, input_code_size );
369+ free( messages );
370+ close( input_fd );
371+ return NULL;
372+ }
373+
374+ if( input_encoding > 1 )
375+ {
376+ /* We've detected a UTF input encoding, which implicitly specifies
377+ * the codeset of the messages defined within this source file.
378+ */
379+ char utf_byte_order = UTF_BYTE_ORDER( input_encoding );
380+ sprintf( keyword, "UTF-%d%cE", 8 * input_code_size, utf_byte_order );
381+
382+ dfprintf(( stderr, "\n%s:", input ));
383+ dinvoke( if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 ) )
384+ dfprintf(( stderr, "unicode byte order mark detected; " ));
385+ dfprintf(( stderr, "encoding identified as %s", keyword ));
386+
387+ if( codeset != NULL )
388+ {
389+ /* We could coalesce these two conditions into a single test,
390+ * but we choose to nest them thus, to facilitate a possible
391+ * future change, to support codeset alternation.
392+ */
393+ if( strcmp( keyword, codeset ) != 0 )
394+ {
395+ /* The detected UTF input encoding is not compatible with the
396+ * previously declared codeset of the messages in the catalogue;
397+ * diagnose, and skip this source file.
398+ */
399+ dfputc(( '\n', stderr ));
400+ fprintf( errmsg( MSG_UTF_CODESET ), input, keyword );
401+ fprintf( errmsg( MSG_HAD_CODESET ), CODESET_DECLARED, codeset );
402+ free( messages );
403+ close( input_fd );
404+ return NULL;
405+ }
406+ }
407+
408+ else
409+ {
410+ /* We don't yet have a codeset declaration; establish one implicitly,
411+ * based on the identified input encoding.
412+ */
413+ id = strdup( keyword );
414+ if( (codeset = map_codeset( iconv_map, id, "wchar_t" )) == NULL )
415+ {
416+ free( id );
417+ }
418+
419+ else
420+ codeset_decl_src = input;
421+ }
422+ }
423+ }
424+
300425 while( count > 0 )
301426 {
302427 /* ... scanning character by character,
@@ -325,14 +450,64 @@ struct msgdict *mc_source( const char *input )
325450 * transforming to the wide character domain, for local processing.
326451 */
327452 p += ((skip = iconv_mbtowc( &c, p, count )) > 0) ? skip : 0;
453+
454+ /* For UTF-16 or UTF-32 input encodings, the `skip' count *must*
455+ * match the codeset size, ...
456+ */
457+ if( (input_code_size > 1) && (skip != input_code_size) )
458+ {
459+ /* ... or we have a framing error; diagnose,
460+ * and discard this input stream.
461+ */
462+ dfputc(( '\n', stderr ));
463+ fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, codeset );
464+ return mc_discard( head, messages );
465+ }
328466 }
329467
330468 else
331469 {
332470 /* We are parsing context which is defined in the POSIX,
333- * or "C" locale, so read single byte character sequences.
471+ * or "C" locale, so read single byte character sequences,
472+ * but stripping out any padding NULs required to fill the
473+ * input stream to a UTF-16 or UTF-32 framing boundary.
334474 */
475+ int utf_skip = input_code_size - 1;
476+ if( (utf_skip > 0) && ((input_encoding & UTF_BIG_ENDIAN) != 0) )
477+ {
478+ /* Big-Endian Unicode should have padding NULs before the
479+ * POSIX `C' locale byte required.
480+ */
481+ while( (*p == '\0') && utf_skip-- && count-- )
482+ ++p;
483+ if( (utf_skip > 0) || (*p == '\0') )
484+ {
485+ /* Diagnose and bail out, if the number of padding NULs
486+ * wasn't what we expected.
487+ */
488+ dfputc(( '\n', stderr ));
489+ fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, UTF_TYPE( 'B' ));
490+ return mc_discard( head, messages );
491+ }
492+ }
335493 c = (wchar_t)(*p++);
494+ if( (utf_skip > 0) && ((input_encoding & UTF_LITTLE_ENDIAN) != 0) )
495+ {
496+ /* Little-Endian Unicode should have the padding NULs after
497+ * this significant byte.
498+ */
499+ while( (*p == '\0') && utf_skip-- && count-- )
500+ ++p;
501+ if( (utf_skip > 0) || (*p == '\0') )
502+ {
503+ /* Diagnose and bail out, if the number of padding NULs
504+ * wasn't what we expected.
505+ */
506+ dfputc(( '\n', stderr ));
507+ fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, UTF_TYPE( 'L' ));
508+ return mc_discard( head, messages );
509+ }
510+ }
336511 }
337512
338513 if( skip > 0 )
@@ -460,12 +635,13 @@ struct msgdict *mc_source( const char *input )
460635 {
461636 if( strcmp( codeset, id ) != 0 )
462637 {
638+ dfputc(( '\n', stderr ));
463639 fprintf( errmsg( MSG_CODESET_CLASH ), input, linenum, id );
464640 fprintf( errmsg( MSG_HAD_CODESET ), CODESET_DECLARED, codeset );
465641 }
466642 free( id );
467643 }
468- dfprintf(( stderr, "; declare %s", keyword ));
644+ dfprintf(( stderr, "\n%s:%u:declare %s", input, linenum, keyword ));
469645 }
470646 }
471647
@@ -1087,4 +1263,4 @@ struct msgdict *mc_source( const char *input )
10871263 return head;
10881264 }
10891265
1090-/* $RCSfile$Revision: 1.9 $: end of file */
1266+/* $RCSfile$Revision: 1.10 $: end of file */
--- /dev/null
+++ b/mcutfsig.c
@@ -0,0 +1,118 @@
1+/*
2+ * mcutfsig.c
3+ *
4+ * $Id$
5+ *
6+ * Copyright (C) 2007, Keith Marshall
7+ *
8+ * This file implements the `mc_utf_signature' function, which is used
9+ * by `gencat', to identify message definition source files which appear
10+ * to exhibit any recognisable standard of Unicode encoding.
11+ *
12+ * Written by Keith Marshall <keithmarshall@users.sourceforge.net>
13+ * Last Revision: 22-May-2007
14+ *
15+ *
16+ * This is free software. It is provided AS IS, in the hope that it may
17+ * be useful, but WITHOUT WARRANTY OF ANY KIND, not even an IMPLIED WARRANTY
18+ * of MERCHANTABILITY, nor of FITNESS FOR ANY PARTICULAR PURPOSE.
19+ *
20+ * Permission is granted to redistribute this software, either "as is" or
21+ * in modified form, under the terms of the GNU General Public License, as
22+ * published by the Free Software Foundation; either version 2, or (at your
23+ * option) any later version.
24+ *
25+ * You should have received a copy of the GNU General Public License
26+ * along with this software; see the file COPYING. If not, write to the
27+ * Free Software Foundation, 51 Franklin St - Fifth Floor, Boston,
28+ * MA 02110-1301, USA.
29+ *
30+ */
31+#include <mcutfsig.h>
32+
33+unsigned short mc_utf_signature( unsigned char *stream )
34+{
35+  /* Inspect the first few bytes of the specified data stream;
36+   * attempt to identify a potential Unicode encoding signature,
37+   * defaulting to non-specific single byte encoding units.  The
38+   * result combines the code unit size, in bytes, with any of the
39+   * UTF_* endianness and byte order mark flags which apply.
40+   */
41+  unsigned short signature = 1;
42+  int count;
43+  /*
44+   * The first character in the input stream must not be NUL,
45+   * and must be a member of the POSIX Portable Character Set;
46+   * if it isn't, then it may indicate a Unicode stream.
47+   */
48+  if( *stream == 0 )
49+  {
50+    /* An initial NUL byte anticipates a big-endian Unicode stream;
51+     * one such byte implies UTF-16, without a Byte Order Mark, while
52+     * two such followed by the big-endian form of the BOM, or three
53+     * without a BOM, indicates UTF-32.  The scan stops *at* the first
54+     * non-NUL byte, so that a BOM found there is examined below.
55+     */
56+    for( count = 4; count-- && (*stream == '\0'); ++stream )
57+      ++signature;
58+    signature |= UTF_BIG_ENDIAN;
59+  }
60+  if( ((*stream & 0xfe) == 0xfe) || ((*stream == 0xef) && (stream[1] == 0xbb)) )
61+  {
62+    /* This looks like it might be a Unicode Byte Order Mark;
63+     * identify the UTF encoding standard, if any, which it represents.
64+     */
65+    unsigned bom = *stream++ << 8; bom |= *stream++;
66+    switch( bom )
67+    {
68+      case 0xfffe:
69+        /*
70+         * This is the BOM signature for a little-endian Unicode stream;
71+         * the first byte has already been included in the initial size
72+         * assigned for the encoding unit; adjust this to accommodate the
73+         * second byte, and incorporate the little-endian flag; each NUL
74+         * which follows extends the BOM towards a UTF-32 code unit.
75+         */
76+        signature = (signature + 1) | UTF_WITH_BYTE_ORDER_MARK | UTF_LITTLE_ENDIAN;
77+        for( count = 4; count-- && (*stream++ == '\0'); )
78+          ++signature;
79+        break;
80+
81+      case 0xfeff:
82+        /*
83+         * This is the BOM signature for a big-endian Unicode stream;
84+         * if preceded by two NULs, (already counted), then it is UTF-32,
85+         * else it is UTF-16.  In either case, adding an additional one
86+         * to the accumulated size of the encoding unit yields the
87+         * desired result.  Flags are OR'ed in, not added: the big-endian
88+         * flag is already set if NULs led, and adding it again would
89+         * carry into the little-endian flag bit.
90+         */
91+        signature = (signature + 1) | UTF_WITH_BYTE_ORDER_MARK | UTF_BIG_ENDIAN;
92+        break;
93+
94+      case 0xefbb:
95+        /*
96+         * Provided it's followed by one further `0xbf' byte, this is the
97+         * BOM used as a signature for a UTF-8 encoded stream; it becomes
98+         * invalid, if there were any leading NUL bytes.
99+         */
100+        if( (signature == 1) && (*stream == 0xbf) )
101+          signature |= UTF_WITH_BYTE_ORDER_MARK;
102+    }
103+  }
104+  else if( (signature == 1) && (*++stream == 0) )
105+  {
106+    /* NUL as the second byte in the input stream indicates a probable
107+     * little-endian Unicode input stream, although this is not indicated
108+     * by a Byte Order Mark; count the trailing NULs, to determine if we
109+     * should interpret it as UTF-16LE, or as UTF-32LE.
110+     */
111+    for( count = 4; count-- && (*stream++ == '\0'); )
112+      ++signature;
113+    signature |= UTF_LITTLE_ENDIAN;
114+  }
115+  return signature;
116+}
117+
118+/* $RCSfile$Revision$: end of file */