OSDN > Desarrollador >

keith > Chamber > mingw-catgets > Commit

keith

mingw-catgets
Fork

(Original repository, No fork origin)

R/O
HTTP
SSH
HTTPS

Commit

Commit MetaInfo

Revisión	b7a9785f64e65c62671dcba922e77872d0324bae (tree)
Tiempo	2007-06-19 07:36:08
Autor	Keith Marshall <keithmarshall@user...>
Committer	Keith Marshall

Log Message

Handle unicode input streams.

Cambiar Resumen

modified: ChangeLog (diff)
modified: Makefile.in (diff)
modified: aclocal.m4 (diff)
modified: configure (diff)
modified: configure.ac (diff)
modified: include/gcmsgs.h (diff)
add: include/mcutfsig.h (diff)
modified: mcsource.c (diff)
add: mcutfsig.c (diff)

Diferencia incremental

--- a/ChangeLog

+++ b/ChangeLog

		@@ -1,3 +1,37 @@
	1	+2007-06-18 Keith Marshall <keithmarshall@users.sourceforge.net>
	2	+
	3	+ Handle unicode input streams.
	4	+
	5	+ * mcutfsig.c, include/mcutfsig.h: New files; they define the
	6	+ interface and implementation for...
	7	+ (mc_utf_signature): ...this new function.
	8	+
	9	+ * mcsource.c (mc_discard): New static function.
	10	+ (mc_source): Use it to clean up heap on abnormal termination.
	11	+ Include `mcutfsig.h'; use `mc_utf_signature' to identify unicode
	12	+ streams; map codeset using new local variables `input_encoding'
	13	+ and `input_code_size'; use them with...
	14	+ (UTF_TYPE): ...this new macro, to parse input accordingly.
	15	+
	16	+ * include/gcmsgs.h (MSG_HAD_CODESET): New wording.
	17	+ (MSG_UTF_CODESET, MSG_UTF_UNKNOWN): New message definitions.
	18	+ (MSG_UTF_SIZE_ERROR, MSG_UTF_FRAME_ERROR): Likewise.
	19	+
	20	+ * Makefile.in (canonical_prefix): New macro.
	21	+ (HEADER_TRANSFORM_SCRIPT): Use it.
	22	+ (GENCAT_HEADERS): Add `mcutfsig.h'.
	23	+ (GENCAT_SOURCES): Add `mcutfsig.c'.
	24	+ (target): Macro renamed as...
	25	+ (tool_prefix): ...this.
	26	+ (install-progs): Updated to use it.
	27	+ (clean): Remove all locally generated headers.
	28	+
	29	+ * aclocal.m4 (MINGW_AC_HOST_CANONICAL_PREFIX): New macro.
	30	+ (MSYS_AC_CANONICAL_PATH): New macro; copied from `man' sources.
	31	+ * configure.ac: Use them.
	32	+ (GENCAT_AC_OBJECTS_ADD): Add `mcutfsig'.
	33	+ * configure: Regenerated.
	34	+
1	35	2007-06-01 Keith Marshall <keithmarshall@users.sourceforge.net>
2	36
3	37	* repl/include/langinfo.h (LC_CTYPE): Include locale.h, to define.

--- a/Makefile.in

+++ b/Makefile.in

		@@ -32,6 +32,8 @@ VPATH = ${srcdir}:${srcdir}/include:${srcdir}/repl
32	32	prefix = @prefix@
33	33	exec_prefix = @exec_prefix@
34	34
	35	+canonical_prefix = @canonical_prefix@
	36	+
35	37	bindir = @bindir@
36	38	libdir = @libdir@
37	39	includedir = @includedir@

		@@ -85,8 +87,8 @@ CATGETS_HEADERS = mctab.h msgcat.h
85	87	CATGETS_SOURCES = catopen.c catgets.c catclose.c mctab.c mcref.c mcfree.c
86	88	CATGETS_OBJECTS = $(subst .c,.$(OBJEXT),$(CATGETS_SOURCES))
87	89
88		-GENCAT_HEADERS = gencat.h gcmsgs.h
89		-GENCAT_SOURCES = gencat.c mcload.c mcsource.c mciconv.c mcmerge.c
	90	+GENCAT_HEADERS = gencat.h gcmsgs.h mcutfsig.h
	91	+GENCAT_SOURCES = gencat.c mcload.c mcsource.c mciconv.c mcmerge.c mcutfsig.c
90	92	GENCAT_OBJECTS = @GENCAT_OBJECTS@
91	93
92	94	OTHER_SOURCES = COPYING ChangeLog mkinstalldirs install-sh $(MISC_SOURCES)

		@@ -110,7 +112,7 @@ INSTALL_PROGS = gendefs.awk
110	112	$(LN_S) $< $@
111	113
112	114	HEADER_TRANSFORM_SCRIPT = \
113		- -e 's,@HOST_PREFIX@,${prefix},g' \
	115	+ -e 's,@HOST_PREFIX@,${canonical_prefix},g' \
114	116	-e 's,@PATH_SEPARATOR_INTERNAL@,$(PATH_SEPARATOR_INTERNAL),g' \
115	117	-e 's,$(subst .,\.,$@)\.in$$,$@ -- $(AUTOGENERATED),'
116	118

		@@ -146,7 +148,7 @@ $(GENCAT_OBJECTS): $(GENCAT_HEADERS) $(CATGETS_OBJECTS)
146	148	## ============== ##
147	149
148	150	mkinstalldirs = ${srcdir}/mkinstalldirs
149		-target = @target_alias@`test -n "@target_alias@" && echo "-"`
	151	+tool_prefix = @target_alias@`test -n "@target_alias@" && echo "-"`
150	152
151	153	INSTALL = @INSTALL@
152	154	INSTALL_PROGRAM = @INSTALL_PROGRAM@

		@@ -159,10 +161,10 @@ install: install-headers install-progs install-libs
159	161	install-progs: all
160	162	$(mkinstalldirs) --require ${exec_prefix} ${bindir} ${libdir}
161	163	for prog in $(BUILD_PROGS); do \
162		- $(INSTALL_PROGRAM) $$prog ${bindir}/$(target)$$prog; \
	164	+ $(INSTALL_PROGRAM) $$prog ${bindir}/$(tool_prefix)$$prog; \
163	165	done
164	166	for prog in $(INSTALL_PROGS); do \
165		- $(INSTALL_PROGRAM) ${srcdir}/$$prog ${bindir}/$(target)$$prog; \
	167	+ $(INSTALL_PROGRAM) ${srcdir}/$$prog ${bindir}/$(tool_prefix)$$prog; \
166	168	done
167	169
168	170	install-libs: all-libs $(all-dll)

		@@ -230,9 +232,10 @@ bindist: all
230	232	## ========== ##
231	233
232	234	clean:
	235	+ rm -f nlspath.h $(LOCAL_HEADERS) $(REPLACEMENT_HEADERS)
233	236	rm -f *.$(OBJEXT) *~ gencat$(EXEEXT)
234	237
235	238	distclean: clean
236	239	rm -rf Makefile config.h config.[ls]* autom4te.cache
237	240
238		-# $RCSfile$Revision$: end of file
	241	+# $RCSfile$Revision: 1.1.1.1 $: end of file

--- a/aclocal.m4

+++ b/aclocal.m4

		@@ -15,6 +15,42 @@ AC_DEFUN([MINGW_AC_WIN32_NATIVE_HOST],
15	15	#endif]], [mingw_cv_win32_host=no], [mingw_cv_win32_host=yes]))dnl
16	16	])# MINGW_AC_WIN32_NATIVE_HOST
17	17
	18	+# MINGW_AC_HOST_CANONICAL_PREFIX
	19	+# ------------------------------
	20	+# Set the AC_SUBST variable `canonical_prefix' to the canonical form
	21	+# of `prefix', as applicable for a mingw32 host.
	22	+#
	23	+AC_DEFUN([MINGW_AC_HOST_CANONICAL_PREFIX],
	24	+[AC_SUBST([canonical_prefix])dnl
	25	+ ac_val=$prefix; test "x$ac_val" = xNONE && ac_val=$ac_default_prefix
	26	+ MSYS_AC_CANONICAL_PATH([canonical_prefix],[$ac_val])dnl
	27	+])# MINGW_AC_HOST_CANONICAL_PREFIX
	28	+
	29	+# MSYS_AC_CANONICAL_PATH( VAR, PATHNAME )
	30	+# ---------------------------------------
	31	+# Set VAR to the canonically resolved absolute equivalent of PATHNAME,
	32	+# (which may be a relative path, and need not refer to any existing entity).
	33	+#
	34	+# On Win32-MSYS build hosts, the returned path is resolved to its true
	35	+# native Win32 path name, (but with slashes, not backslashes).
	36	+#
	37	+# On any other system, it is simply the result which would be obtained
	38	+# if PATHNAME represented an existing directory, and the pwd command was
	39	+# executed in that directory.
	40	+#
	41	+AC_DEFUN([MSYS_AC_CANONICAL_PATH],
	42	+[ac_dir="$2"
	43	+ pwd -W >/dev/null 2>&1 && ac_pwd_w="pwd -W" || ac_pwd_w=pwd
	44	+ until ac_val=`exec 2>/dev/null; cd "$ac_dir" && $ac_pwd_w`
	45	+ do
	46	+ ac_dir=`AS_DIRNAME(["$ac_dir"])`
	47	+ done
	48	+ ac_dir=`echo "$ac_dir" | sed 's?^[[./]]*??'`
	49	+ ac_val=`echo "$ac_val" | sed 's?/*$[]??'`
	50	+ $1=`echo "$2" | sed "s?^[[./]]$ac_dir/?$ac_val/?"'
	51	+ s?/*$[]??'`dnl
	52	+])# MSYS_AC_CANONICAL_PATH
	53	+
18	54	# MINGW_AC_CHECK_HEADER( LISTVAR, HEADER )
19	55	# ----------------------------------------
20	56	# Invoke AC_CHECK_HEADER, to check availability of HEADER;

		@@ -152,3 +188,5 @@ AC_DEFUN([CATGETS_AC_CONFIG_VERSION_DEFINE],
152	188	[AC_DEFINE_UNQUOTED([$1],[`IFS=.;set x $PACKAGE_VERSION;echo ${$3}`],
153	189	[Define numerically to the catgets $2 version number])dnl
154	190	])# CATGETS_AC_CONFIG_VERSION_DEFINE
	191	+
	192	+# $RCSfile$Revision$: end of file

--- a/configure

+++ b/configure

		@@ -310,7 +310,7 @@ ac_includes_default="\
310	310	# include <unistd.h>
311	311	#endif"
312	312
313		-ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT LN_S AR ac_ct_AR INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA MAKE_DLL DLLVERSION HOST_PATH_SEPARATOR REPLACEMENT_HEADERS CPP EGREP LOCAL_HEADERS GENCAT_OBJECTS INCICONV LIBICONV LIBOBJS LTLIBOBJS'
	313	+ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT LN_S AR ac_ct_AR INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA MAKE_DLL DLLVERSION canonical_prefix HOST_PATH_SEPARATOR REPLACEMENT_HEADERS CPP EGREP LOCAL_HEADERS GENCAT_OBJECTS INCICONV LIBICONV LIBOBJS LTLIBOBJS'
314	314	ac_subst_files=''
315	315
316	316	# Initialize some variables set by options.

		@@ -2508,9 +2508,31 @@ echo $ECHO_N "checking release version for mingw32 DLLs... $ECHO_C" >&6
2508	2508	echo "$as_me:$LINENO: result: ${DLLVERSION}" >&5
2509	2509	echo "${ECHO_T}${DLLVERSION}" >&6
2510	2510
2511		-# We need to identify the appropriate PATH separator character,
2512		-# to be used in the default NLSPATH definition.
	2511	+# We need to identify the canonical path prefix, and the appropriate
	2512	+# PATH separator character, to be used in the default NLSPATH definition.
2513	2513	#
	2514	+ ac_val=$prefix; test "x$ac_val" = xNONE && ac_val=$ac_default_prefix
	2515	+ ac_dir="$ac_val"
	2516	+ pwd -W >/dev/null 2>&1 && ac_pwd_w="pwd -W" || ac_pwd_w=pwd
	2517	+ until ac_val=`exec 2>/dev/null; cd "$ac_dir" && $ac_pwd_w`
	2518	+ do
	2519	+ ac_dir=`(dirname "$ac_dir") 2>/dev/null \|\|
	2520	+$as_expr X"$ac_dir" : 'X$.[^/]$//[^/][^/]/$' \\| \
	2521	+ X"$ac_dir" : 'X$//$[^/]' \\| \
	2522	+ X"$ac_dir" : 'X$//$$' \\| \
	2523	+ X"$ac_dir" : 'X$/$' \\| \
	2524	+ . : '$.$' 2>/dev/null \|\|
	2525	+echo X"$ac_dir" \|
	2526	+ sed '/^X$.[^/]$\/\/[^/][^/]\/$/{ s//\1/; q; }
	2527	+ /^X$\/\/$[^/].*/{ s//\1/; q; }
	2528	+ /^X$\/\/$$/{ s//\1/; q; }
	2529	+ /^X$\/$.*/{ s//\1/; q; }
	2530	+ s/.*/./; q'`
	2531	+ done
	2532	+ ac_dir=`echo "$ac_dir" | sed 's?^[./]*??'`
	2533	+ ac_val=`echo "$ac_val" | sed 's?/*$??'`
	2534	+ canonical_prefix=`echo "$ac_val" | sed "s?^[./]$ac_dir/?$ac_val/?"'
	2535	+ s?/*$??'`
2514	2536	echo "$as_me:$LINENO: checking NLSPATH separator character used at runtime" >&5
2515	2537	echo $ECHO_N "checking NLSPATH separator character used at runtime... $ECHO_C" >&6
2516	2538	if test "${mingw_cv_host_path_separator+set}" = set; then

		@@ -4332,6 +4354,7 @@ done
4332	4354	GENCAT_OBJECTS=${GENCAT_OBJECTS}' mciconv.$(OBJEXT)'
4333	4355	GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcsource.$(OBJEXT)'
4334	4356	GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcmerge.$(OBJEXT)'
	4357	+ GENCAT_OBJECTS=${GENCAT_OBJECTS}' mcutfsig.$(OBJEXT)'
4335	4358
4336	4359	# We also require `iconv', always expecting it to be pre-installed; we
4337	4360	# need this to check if we are using a standard `libc' implementation,

		@@ -5468,6 +5491,7 @@ s,@INSTALL_SCRIPT@,$INSTALL_SCRIPT,;t t
5468	5491	s,@INSTALL_DATA@,$INSTALL_DATA,;t t
5469	5492	s,@MAKE_DLL@,$MAKE_DLL,;t t
5470	5493	s,@DLLVERSION@,$DLLVERSION,;t t
	5494	+s,@canonical_prefix@,$canonical_prefix,;t t
5471	5495	s,@HOST_PATH_SEPARATOR@,$HOST_PATH_SEPARATOR,;t t
5472	5496	s,@REPLACEMENT_HEADERS@,$REPLACEMENT_HEADERS,;t t
5473	5497	s,@CPP@,$CPP,;t t

		@@ -5970,4 +5994,4 @@ if test "$no_create" != yes; then
5970	5994	fi
5971	5995
5972	5996	#
5973		-# $RCSfile$Revision$: end of file
	5997	+# $RCSfile$Revision: 1.1.1.1 $: end of file

--- a/configure.ac

+++ b/configure.ac

		@@ -65,9 +65,10 @@
65	65	#
66	66	MINGW_AC_HOST_CONFIG_DLL([__MINGW_AC_PACKAGE_DLL_VERSION__])
67	67
68		-# We need to identify the appropriate PATH separator character,
69		-# to be used in the default NLSPATH definition.
	68	+# We need to identify the canonical path prefix, and the appropriate
	69	+# PATH separator character, to be used in the default NLSPATH definition.
70	70	#
	71	+ MINGW_AC_HOST_CANONICAL_PREFIX
71	72	MINGW_AC_HOST_PATH_SEPARATOR([NLSPATH])
72	73
73	74	# Schedule an automatic header update, if we find a `unistd.h',

		@@ -108,6 +109,7 @@
108	109	GENCAT_AC_OBJECTS_ADD([mciconv])
109	110	GENCAT_AC_OBJECTS_ADD([mcsource])
110	111	GENCAT_AC_OBJECTS_ADD([mcmerge])
	112	+ GENCAT_AC_OBJECTS_ADD([mcutfsig])
111	113
112	114	# We also require `iconv', always expecting it to be pre-installed; we
113	115	# need this to check if we are using a standard `libc' implementation,

		@@ -118,4 +120,4 @@
118	120	AC_CONFIG_FILES([Makefile])
119	121	AC_OUTPUT
120	122	#
121		-# $RCSfile$Revision$: end of file
	123	+# $RCSfile$Revision: 1.1.1.1 $: end of file

--- a/include/gcmsgs.h

+++ b/include/gcmsgs.h

		@@ -44,7 +44,7 @@
44	44	#define MSG_BAD_CATALOGUE 2, 1, "%s: %s: file is not a valid message catalogue\n"
45	45	#define MSG_UNKNOWN_CODESET 2, 2, "%s: %s: unknown codeset descriptor\n"
46	46	#define MSG_CODESET_CLASH 2, 3, "%s:%u: codeset `%s' conflicts with prior declaration\n"
47		-#define MSG_HAD_CODESET 2, 4, "%s:%u: codeset `%s' was previously declared here\n"
	47	+#define MSG_HAD_CODESET 2, 4, "%s:%u: codeset `%s' previously declared here\n"
48	48	#define MSG_SETNUM_NOT_INCR 2, 5, "invalid set number: expecting > %d; got %d\n"
49	49	#define MSG_MSGNUM_NOT_INCR 2, 6, "invalid message number: expecting > %d; got %d\n"
50	50	#define MSG_REDEFINED 2, 7, "%s: %s:%u: redefinition of message %u in set %u\n"

		@@ -53,7 +53,11 @@
53	53	#define MSG_TEXT_DISCARDED 3, 3, "%s:%u: incomplete message marked for deletion\n"
54	54	#define MSG_MISSING_NEWLINE 3, 4, "%s:%u: missing newline at end of file\n"
55	55	#define MSG_BAD_INDEX 3, 5, "invalid reference in message index"
	56	+#define MSG_UTF_UNKNOWN 4, 1, "%s:unrecognisable encoding format\n"
	57	+#define MSG_UTF_SIZE_ERROR 4, 2, "%s:invalid byte count per code point; value was %d\n"
	58	+#define MSG_UTF_FRAME_ERROR 4, 3, "%s:%u:UTF-%u%cE input framing error\n"
	59	+#define MSG_UTF_CODESET 4, 4, "%s:input codeset identified as %s; conflicts with ...\n"
56	60	/* !
57	61	* !$ end of file
58	62	*/
59		-#endif /* !defined( GCMSGS_H ): $RCSfile$Revision: 1.2 $: end of file */
	63	+#endif /* !defined( GCMSGS_H ): $RCSfile$Revision: 1.3 $: end of file */

--- /dev/null

+++ b/include/mcutfsig.h

		@@ -0,0 +1,65 @@
	1	+#ifndef _MCUTFSIG_H_
	2	+/*
	3	+ * mcutfsig.h
	4	+ *
	5	+ * $Id$
	6	+ *
	7	+ * Copyright (C) 2007, Keith Marshall
	8	+ *
	9	+ * Header file defining the `mc_utf_signature' function API, and a set
	10	+ * of supporting macros, used for obtaining and manipulating an encoding
	11	+ * `signature' for UTF-8, UTF-16 and UTF-32 encoded input files.
	12	+ *
	13	+ * Written by Keith Marshall <keithmarshall@users.sourceforge.net>
	14	+ * Last Revision: 30-May-2007
	15	+ *
	16	+ *
	17	+ * This is free software. It is provided AS IS, in the hope that it may
	18	+ * be useful, but WITHOUT WARRANTY OF ANY KIND, not even an IMPLIED WARRANTY
	19	+ * of MERCHANTABILITY, nor of FITNESS FOR ANY PARTICULAR PURPOSE.
	20	+ *
	21	+ * Permission is granted to redistribute this software, either "as is" or
	22	+ * in modified form, under the terms of the GNU General Public License, as
	23	+ * published by the Free Software Foundation; either version 2, or (at your
	24	+ * option) any later version.
	25	+ *
	26	+ * You should have received a copy of the GNU General Public License
	27	+ * along with this software; see the file COPYING. If not, write to the
	28	+ * Free Software Foundation, 51 Franklin St - Fifth Floor, Boston,
	29	+ * MA 02110-1301, USA.
	30	+ *
	31	+ */
	32	+#define _MCUTFSIG_H_
	33	+
	34	+/*
	35	+ * Flags used to designate the `endianness' of the input encoding,
	36	+ * and to record presence or absence of a byte order mark.
	37	+ *
	38	+ */
	39	+#define UTF_BIG_ENDIAN 0100
	40	+#define UTF_LITTLE_ENDIAN 0200
	41	+#define UTF_WITH_BYTE_ORDER_MARK 0400
	42	+
	43	+/*
	44	+ * Mask used to isolate a bit-field representing the number of bytes
	45	+ * per encoding unit in the input stream.
	46	+ *
	47	+ */
	48	+#define UTF_CODE_UNIT_SIZE_MASK 0007
	49	+
	50	+/*
	51	+ * Macros used to disambiguate the codeset name, wrt byte order.
	52	+ *
	53	+ */
	54	+#define UTF_IS_MB(FLAGS) (UTF_CODE_SIZE(FLAGS) > 1)
	55	+#define UTF_CODE_SIZE(FLAGS) ((FLAGS) & UTF_CODE_UNIT_SIZE_MASK)
	56	+#define UTF_BYTE_ORDER(FLAGS) (UTF_IS_MB(FLAGS) ? UTF_SUFFIX(FLAGS) : '\0')
	57	+#define UTF_SUFFIX(FLAGS) (((FLAGS) & UTF_LITTLE_ENDIAN) ? 'L' : 'B')
	58	+
	59	+/*
	60	+ * Function prototypes.
	61	+ *
	62	+ */
	63	+unsigned short mc_ucs_signature( unsigned char *stream );
	64	+
	65	+#endif /* !defined(_MCUTFSIG_H_): $RCSfile$Revision$: end of file */

--- a/mcsource.c

+++ b/mcsource.c

		@@ -57,6 +57,7 @@
57	57	#include <debug.h>
58	58
59	59	#include <platform.h>
	60	+#include <mcutfsig.h>
60	61
61	62	#ifdef DEBUG_BUFSIZ
62	63	# undef BUFSIZ

		@@ -205,9 +206,35 @@ char mc_update_workspace( char buf, char *cache, unsigned int count )
205	206	return buf;
206	207	}
207	208
	209	+static inline
	210	+struct msgdict *mc_discard( struct msgdict *index, char *messages )
	211	+{
	212	+ /* A helper function, to reclaim all memory allocated to a local
	213	+ * message dictionary, prior to aborting compilation of the current
	214	+ * message catalogue source file.
	215	+ */
	216	+ while( index )
	217	+ {
	218	+ /* Walk the linked list of dictionary index entries, (if any),
	219	+ * releasing the memory block allotted to each individually.
	220	+ */
	221	+ struct msgdict *next = index->link;
	222	+ free( index );
	223	+ index = next;
	224	+ }
	225	+ if( messages )
	226	+ /*
	227	+ * All of the indexed messages are collected into a single block,
	228	+ * which is allocated, and so must be released, separately.
	229	+ */
	230	+ free( messages );
	231	+ return index;
	232	+}
	233	+
208	234	struct msgdict *mc_source( const char *input )
209	235	{
210	236	# define CODESET_DECLARED codeset_decl_src, codeset_decl_lineno
	237	+# define UTF_TYPE(ORDER) 8 * input_code_size, (ORDER)
211	238
212	239	dinvoke( int dtrace = 0; )
213	240

		@@ -231,6 +258,9 @@ struct msgdict *mc_source( const char *input )
231	258	static char *codeset = NULL;
232	259	static const char *codeset_decl_src = NULL;
233	260	static unsigned int codeset_decl_lineno = 0;
	261	+
	262	+ unsigned short input_encoding = 0, input_code_size;
	263	+
234	264	static iconv_t iconv_map[2] = {(iconv_t)(-1), (iconv_t)(-1)};
235	265	char *messages; off_t msgloc, headroom;
236	266	/*

		@@ -297,6 +327,101 @@ struct msgdict *mc_source( const char *input )
297	327	char *p = buf;
298	328	int high_water_mark = count - ( count >> 2 );
299	329	dfprintf(( stderr, "\n%s:%u:read %u byte%s", input, linenum, count, count == 1 ? "" : "s" ));
	330	+
	331	+ if( input_encoding == 0 )
	332	+ {
	333	+ input_encoding = mc_utf_signature( buf );
	334	+ switch( input_code_size = input_encoding & UTF_CODE_UNIT_SIZE_MASK )
	335	+ {
	336	+ case 1:
	337	+ if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 )
	338	+ {
	339	+ /*
	340	+ * This is UTF-8 input encoding, affirmed by the presence of
	341	+ * the byte order mark, (three bytes), which we must skip.
	342	+ */
	343	+ p += 3;
	344	+ count -= 3;
	345	+ }
	346	+ break;
	347	+
	348	+ case 2:
	349	+ case 4:
	350	+ if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 )
	351	+ {
	352	+ /* This is either UTF-16, or UTF-32, also affirmed by the BOM,
	353	+ * which occupies the first code unit, so skip it.
	354	+ */
	355	+ p += input_code_size;
	356	+ count -= input_code_size;
	357	+ }
	358	+ break;
	359	+
	360	+ default:
	361	+ /*
	362	+ * This isn't valid, for any recognisable codeset in the required
	363	+ * POSIX Portable Character Set input context; diagnose, clean up,
	364	+ * and bail out.
	365	+ */
	366	+ dfputc(( '\n', stderr ));
	367	+ fprintf( errmsg( MSG_UTF_UNKNOWN ), input );
	368	+ fprintf( errmsg( MSG_UTF_SIZE_ERROR ), input, input_code_size );
	369	+ free( messages );
	370	+ close( input_fd );
	371	+ return NULL;
	372	+ }
	373	+
	374	+ if( input_encoding > 1 )
	375	+ {
	376	+ /* We've detected a UTF input encoding, which implicitly specifies
	377	+ * the codeset of the messages defined within this source file.
	378	+ */
	379	+ char utf_byte_order = UTF_BYTE_ORDER( input_encoding );
	380	+ sprintf( keyword, "UTF-%d%cE", 8 * input_code_size, utf_byte_order );
	381	+
	382	+ dfprintf(( stderr, "\n%s:", input ));
	383	+ dinvoke( if( (input_encoding & UTF_WITH_BYTE_ORDER_MARK) != 0 ) )
	384	+ dfprintf(( stderr, "unicode byte order mark detected; " ));
	385	+ dfprintf(( stderr, "encoding identified as %s", keyword ));
	386	+
	387	+ if( codeset != NULL )
	388	+ {
	389	+ /* We could coalesce these two conditions into a single test,
	390	+ * but we choose to nest them thus, to facilitate a possible
	391	+ * future change, to support codeset alternation.
	392	+ */
	393	+ if( strcmp( keyword, codeset ) != 0 )
	394	+ {
	395	+ /* The detected UTF input encoding is not compatible with the
	396	+ * previously declared codeset of the messages in the catalogue;
	397	+ * diagnose, and skip this source file.
	398	+ */
	399	+ dfputc(( '\n', stderr ));
	400	+ fprintf( errmsg( MSG_UTF_CODESET ), input, keyword );
	401	+ fprintf( errmsg( MSG_HAD_CODESET ), CODESET_DECLARED, codeset );
	402	+ free( messages );
	403	+ close( input_fd );
	404	+ return NULL;
	405	+ }
	406	+ }
	407	+
	408	+ else
	409	+ {
	410	+ /* We don't yet have a codeset declaration; establish one implicitly,
	411	+ * based on the identified input encoding.
	412	+ */
	413	+ id = strdup( keyword );
	414	+ if( (codeset = map_codeset( iconv_map, id, "wchar_t" )) == NULL )
	415	+ {
	416	+ free( id );
	417	+ }
	418	+
	419	+ else
	420	+ codeset_decl_src = input;
	421	+ }
	422	+ }
	423	+ }
	424	+
300	425	while( count > 0 )
301	426	{
302	427	/* ... scanning character by character,

		@@ -325,14 +450,64 @@ struct msgdict *mc_source( const char *input )
325	450	* transforming to the wide character domain, for local processing.
326	451	*/
327	452	p += ((skip = iconv_mbtowc( &c, p, count )) > 0) ? skip : 0;
	453	+
	454	+ /* For UTF-16 or UTF-32 input encodings, the `skip' count must
	455	+ * match the codeset size, ...
	456	+ */
	457	+ if( (input_code_size > 1) && (skip != input_code_size) )
	458	+ {
	459	+ /* ... or we have a framing error; diagnose,
	460	+ * and discard this input stream.
	461	+ */
	462	+ dfputc(( '\n', stderr ));
	463	+ fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, codeset );
	464	+ return mc_discard( head, messages );
	465	+ }
328	466	}
329	467
330	468	else
331	469	{
332	470	/* We are parsing context which is defined in the POSIX,
333		- * or "C" locale, so read single byte character sequences.
	471	+ * or "C" locale, so read single byte character sequences,
	472	+ * but stripping out any padding NULs required to fill the
	473	+ * input stream to a UTF-16 or UTF-32 framing boundary.
334	474	*/
	475	+ int utf_skip = input_code_size - 1;
	476	+ if( (utf_skip > 0) && ((input_encoding & UTF_BIG_ENDIAN) != 0) )
	477	+ {
	478	+ /* Big-Endian Unicode should have padding NULs before the
	479	+ * POSIX `C' locale byte required.
	480	+ */
	481	+ while( (*p == '\0') && utf_skip-- && count-- )
	482	+ ++p;
	483	+ if( (utf_skip > 0) || (*p == '\0') )
	484	+ {
	485	+ /* Diagnose and bail out, if the number of padding NULs
	486	+ * wasn't what we expected.
	487	+ */
	488	+ dfputc(( '\n', stderr ));
	489	+ fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, UTF_TYPE( 'B' ));
	490	+ return mc_discard( head, messages );
	491	+ }
	492	+ }
335	493	c = (wchar_t)(*p++);
	494	+ if( (utf_skip > 0) && ((input_encoding & UTF_LITTLE_ENDIAN) != 0) )
	495	+ {
	496	+ /* Little-Endian Unicode should have the padding NULs after
	497	+ * this significant byte.
	498	+ */
	499	+ while( (*p == '\0') && utf_skip-- && count-- )
	500	+ ++p;
	501	+ if( (utf_skip > 0) || (*p == '\0') )
	502	+ {
	503	+ /* Diagnose and bail out, if the number of padding NULs
	504	+ * wasn't what we expected.
	505	+ */
	506	+ dfputc(( '\n', stderr ));
	507	+ fprintf( errmsg( MSG_UTF_FRAME_ERROR ), input, linenum, UTF_TYPE( 'L' ));
	508	+ return mc_discard( head, messages );
	509	+ }
	510	+ }
336	511	}
337	512
338	513	if( skip > 0 )

		@@ -460,12 +635,13 @@ struct msgdict *mc_source( const char *input )
460	635	{
461	636	if( strcmp( codeset, id ) != 0 )
462	637	{
	638	+ dfputc(( '\n', stderr ));
463	639	fprintf( errmsg( MSG_CODESET_CLASH ), input, linenum, id );
464	640	fprintf( errmsg( MSG_HAD_CODESET ), CODESET_DECLARED, codeset );
465	641	}
466	642	free( id );
467	643	}
468		- dfprintf(( stderr, "; declare %s", keyword ));
	644	+ dfprintf(( stderr, "\n%s:%u:declare %s", input, linenum, keyword ));
469	645	}
470	646	}
471	647

		@@ -1087,4 +1263,4 @@ struct msgdict *mc_source( const char *input )
1087	1263	return head;
1088	1264	}
1089	1265
1090		-/* $RCSfile$Revision: 1.9 $: end of file */
	1266	+/* $RCSfile$Revision: 1.10 $: end of file */

--- /dev/null

+++ b/mcutfsig.c

		@@ -0,0 +1,118 @@
	1	+/*
	2	+ * mcutfsig.c
	3	+ *
	4	+ * $Id$
	5	+ *
	6	+ * Copyright (C) 2007, Keith Marshall
	7	+ *
	8	+ * This file implements the `mc_utf_signature' function, which is used
	9	+ * by `gencat', to identify message definition source files which appear
	10	+ * to exhibit any recognisable standard of Unicode encoding.
	11	+ *
	12	+ * Written by Keith Marshall <keithmarshall@users.sourceforge.net>
	13	+ * Last Revision: 22-May-2007
	14	+ *
	15	+ *
	16	+ * This is free software. It is provided AS IS, in the hope that it may
	17	+ * be useful, but WITHOUT WARRANTY OF ANY KIND, not even an IMPLIED WARRANTY
	18	+ * of MERCHANTABILITY, nor of FITNESS FOR ANY PARTICULAR PURPOSE.
	19	+ *
	20	+ * Permission is granted to redistribute this software, either "as is" or
	21	+ * in modified form, under the terms of the GNU General Public License, as
	22	+ * published by the Free Software Foundation; either version 2, or (at your
	23	+ * option) any later version.
	24	+ *
	25	+ * You should have received a copy of the GNU General Public License
	26	+ * along with this software; see the file COPYING. If not, write to the
	27	+ * Free Software Foundation, 51 Franklin St - Fifth Floor, Boston,
	28	+ * MA 02110-1301, USA.
	29	+ *
	30	+ */
	31	+#include <mcutfsig.h>
	32	+
	33	+unsigned short mc_utf_signature( unsigned char *stream )
	34	+{
	35	+ /* Inspect the first few bytes of the specified data stream;
	36	+ * attempt to identify a potential Unicode encoding signature,
	37	+ * defaulting to non-specific single byte encoding units.
	38	+ */
	39	+ unsigned short signature = 1;
	40	+ /*
	41	+ * The first character in the input stream must not be NUL,
	42	+ * and must be a member of the POSIX Portable Character Set;
	43	+ * if it isn't, then it may indicate a Unicode stream.
	44	+ */
	45	+ if( *stream == 0 )
	46	+ {
	47	+ /* An initial NUL byte anticipates a big-endian Unicode stream;
	48	+ * one such byte implies UTF-16, without a Byte Order Mark, while
	49	+ * two such followed by the big-endian form of the BOM, or three
	50	+ * without a BOM, indicates UTF-32.
	51	+ */
	52	+ int count = 4;
	53	+ while( count-- && (*stream++ == '\0') )
	54	+ ++signature;
	55	+ signature += UTF_BIG_ENDIAN;
	56	+ }
	57	+ if( (*stream & 0xfe) == 0xfe )
	58	+ {
	59	+ /* This looks like it might be a Unicode Byte Order Mark;
	60	+ * identify the UTF encoding standard, if any, which it represents.
	61	+ */
	62	+ unsigned bom = *stream++ << 8; bom |= *stream++;
	63	+ switch( bom )
	64	+ {
	65	+ case 0xfffe:
	66	+ /*
	67	+ * This is the BOM signature for a little-endian Unicode stream;
	68	+ * the first byte has already been included in the initial size
	69	+ * assigned for the encoding unit; adjust this to accommodate the
	70	+ * second byte, and incorporate the little-endian flag.
	71	+ */
	72	+ signature += UTF_WITH_BYTE_ORDER_MARK + UTF_LITTLE_ENDIAN + 1;
	73	+ if( *stream == '\0' )
	74	+ {
	75	+ int count = 4;
	76	+ while( count-- && (*stream++ == '\0') )
	77	+ ++signature;
	78	+ }
	79	+ break;
	80	+
	81	+ case 0xfeff:
	82	+ /*
	83	+ * This is the BOM signature for a big-endian Unicode stream;
	84	+ * if preceded by two NULs, (already counted), then it is UTF-32,
	85	+ * else it is UTF-16. In either case, adding an additional one
	86	+ * to the accumulated size of the encoding unit yields the
	87	+ * desired result, since the first byte of the BOM, and
	88	+ * any leading NULs, have already been counted.
	89	+ */
	90	+ signature += UTF_WITH_BYTE_ORDER_MARK + UTF_BIG_ENDIAN + 1;
	91	+ break;
	92	+
	93	+ case 0xffbb:
	94	+ /*
	95	+ * Provided it's followed by one further `0xbf' byte, this is the
	96	+ * BOM used as a signature for a UTF-8 encoded stream; it becomes
	97	+ * invalid, if there were any leading NUL bytes.
	98	+ */
	99	+ if( (signature == 1) && (*stream++ == (unsigned char)('\xbf')) )
	100	+ signature |= UTF_WITH_BYTE_ORDER_MARK;
	101	+ }
	102	+ }
	103	+ else if( (signature == 1) && (*++stream == 0) )
	104	+ {
	105	+ /* NUL as the second byte in the input stream indicates a probable
	106	+ * little-endian Unicode input stream, although this is not indicated
	107	+ * by a Byte Order Mark; count the trailing NULs, to determine if we
	108	+ * should interpret it as UTF-16LE, or as UTF-32LE.
	109	+ */
	110	+ int count = 4;
	111	+ while( count-- && (*stream++ == '\0') )
	112	+ ++signature;
	113	+ signature += UTF_LITTLE_ENDIAN;
	114	+ }
	115	+ return signature;
	116	+}
	117	+
	118	+/* $RCSfile$Revision$: end of file */

mingw-catgets Fork