sumom****@users*****
sumom****@users*****
2012年 8月 11日 (土) 19:44:02 JST
Index: julius4/mkbingram/Makefile.in diff -u julius4/mkbingram/Makefile.in:1.7 julius4/mkbingram/Makefile.in:1.8 --- julius4/mkbingram/Makefile.in:1.7 Fri Jul 27 17:44:57 2012 +++ julius4/mkbingram/Makefile.in Sat Aug 11 19:44:02 2012 @@ -3,7 +3,7 @@ # Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology # All rights reserved # -# $Id: Makefile.in,v 1.7 2012/07/27 08:44:57 sumomo Exp $ +# $Id: Makefile.in,v 1.8 2012/08/11 10:44:02 sumomo Exp $ # SHELL=/bin/sh .SUFFIXES: @@ -21,7 +21,7 @@ exec_prefix=@exec_prefix@ INSTALL=@INSTALL@ -OBJ=mkbingram.o +OBJ=mkbingram.o charconv.o TARGET=mkbingram @ EXEEXT@ all: $(TARGET) Index: julius4/mkbingram/charconv.c diff -u /dev/null julius4/mkbingram/charconv.c:1.1 --- /dev/null Sat Aug 11 19:44:02 2012 +++ julius4/mkbingram/charconv.c Sat Aug 11 19:44:02 2012 @@ -0,0 +1,224 @@ +/* + * Copyright (c) 1991-2012 Kawahara Lab., Kyoto University + * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology + * Copyright (c) 2005-2012 Julius project team, Nagoya Institute of Technology + * All rights reserved + */ + +#include <stdio.h> +#include <string.h> + +#define strmatch !strcmp + +#if defined(_WIN32) + +/* winnls */ +#include <windows.h> +#include <winnls.h> +#define UNICODE_BUFFER_SIZE 4096 ///< Buffer length for unicode conversion +static unsigned int from_cp; ///< Source codepage +static unsigned int to_cp; ///< Target codepage +static wchar_t unibuf[UNICODE_BUFFER_SIZE]; ///< buffer for unicode conversion + +#else + +/* iconv */ +#include <iconv.h> +#include <errno.h> +#include <stdlib.h> +static iconv_t cd = (iconv_t)-1; ///< Converstion descriptor + +#endif + +static int convert_enabled = 0; ///< 1 if charset converstion is enabled + + +#if defined(_WIN32) + +static int +str2code(char *codestr, unsigned int *code) +{ + if (strmatch(codestr, "euc-jp") + || strmatch(codestr, "euc") + || strmatch(codestr, "eucjp")) { + /* input = Shift_jis (codepage 932) */ + *code = CODE_JPN_EUC; + } else if (strmatch(codestr, "ansi")) { + /* ANSI codepage (MBCS) ex. shift-jis in Windows XP Japanese edition.*/ + *code = CP_ACP; + } else if (strmatch(codestr, "mac")) { + /* Macintosh codepage */ + *code = CP_MACCP; + } else if (strmatch(codestr, "oem")) { + /* OEM localized default codepage */ + *code = CP_OEMCP; + } else if (strmatch(codestr, "utf-7")) { + /* UTF-7 codepage */ + *code = CP_UTF7; + } else if (strmatch(codestr, "utf-8")) { + /* UTF-8 codepage */ + *code = CP_UTF8; + } else if (strmatch(codestr, "sjis") + || strmatch(codestr, "sjis-win") + || strmatch(codestr, "shift-jis") + || strmatch(codestr, "shift_jis")) { + /* sjis codepage = 932 */ + *code = 932; + } else if (codestr[0] >= '0' && codestr[0] <= '9') { + /* codepage number */ + *code = atoi(codestr); + if (! IsValidCodePage(*code)) { + jlog("Error: charconv_win32: codepage \"%d\" not found\n", codestr); + return -1; + } + } else { + fprintf(stderr, "Error: str2code: unknown source codepage \"%s\"\n", codestr); + fprintf(stderr, "Error: str2code: valids are \"euc-jp\", \"ansi\", \"mac\", \"oem\", \"utf-7\", \"utf-8\", \"sjis\" and codepage number\n"); + return -1; + } + + return 0; +} +#endif + + +/** + * Setup charset conversion. + * + * @param fromcode [in] input charset name (only libjcode accepts NULL) + * @param tocode [in] output charset name, or NULL when disable conversion + * + * @return 0 on success, -1 on failure. + */ +int +charconv_setup(char *fromcode, char *tocode) +{ + convert_enabled = 0; + + if (fromcode == NULL || tocode == NULL) { + fprintf(stderr, "Error: charconv_setup: input code or output code not specified\n"); + return -1; + } + +#if defined(_WIN32) + if (str2code(fromcode, &from_cp) == -1) { + fpritnf(stderr, "Error: charconv_setup: unknown codepage specified\n"); + return -1; + } + if (str2code(tocode, &to_cp) == -1) { + fpritnf(stderr, "Error: charconv_setup: unknown codepage specified\n"); + return -1; + } +#else + /* clear already allocated descriptor */ + if (cd != (iconv_t)-1) { + if (iconv_close(cd) < 0) { + fprintf(stderr, "Error: charconv_setup: failed to close iconv\n"); + return -1; + } + cd = (iconv_t)-1; + } + /* allocate conversion descriptor */ + cd = iconv_open(tocode, fromcode); + if (cd == (iconv_t)-1) { + /* allocation failed */ + fprintf(stderr, "Error: charconv_setup: unknown charset name in \"%s\" or \"%s\"\n", fromcode, tocode); + fprintf(stderr, "Error: charconv_setup: do \"iconv --list\" to get the list of available charset names.\n"); + return -1; + } + +#endif + + convert_enabled = 1; + + return(0); +} + +/** + * Apply charset conversion to a string. + * + * @param instr [in] source string + * @param outstr [in] destination buffer + * @param maxoutlen [in] allocated length of outstr in byte. + * + * @return either of instr or outstr, that holds the result string. + * + */ +char * +charconv(char *instr, char *outstr, int maxoutlen) +{ + +#if defined(_WIN32) + + int unilen, newlen; + char *srcbuf; + + /* if diabled return instr itself */ + if (convert_enabled == 0) return(instr); /* no conversion */ + + srcbuf = instr; + + /* get length of unicode string */ + unilen = MultiByteToWideChar(from_cp, 0, srcbuf, -1, NULL, 0); + if (unilen <= 0) { + jlog("Error: charconv: conversion error?\n"); + return(instr); + } + if (unilen > UNICODE_BUFFER_SIZE) { + jlog("Error: charconv: unicode buffer size exceeded (%d > %d)!\n", unilen, UNICODE_BUFFER_SIZE); + return(instr); + } + /* convert source string to unicode */ + MultiByteToWideChar(from_cp, 0, srcbuf, -1, unibuf, unilen); + /* get length of target string */ + newlen = WideCharToMultiByte(to_cp, 0, unibuf, -1, outstr, 0, NULL, NULL); + if (newlen <= 0) { + jlog("Error: charconv: conversion error?\n"); + return(instr); + } + if (newlen > maxoutlen) { + jlog("Error: charconv: target buffer size exceeded (%d > %d)!\n", newlen, maxoutlen); + return(instr); + } + /* convert unicode to target string */ + WideCharToMultiByte(to_cp, 0, unibuf, -1, outstr, newlen, NULL, NULL); + return(outstr); + +#else + + char *src, *dst; + size_t srclen, dstlen; + size_t ret; + + /* if diabled return instr itself */ + if (convert_enabled == 0) return(instr); /* no conversion */ + + if (cd == (iconv_t)-1) { + fprintf(stderr, "Error: charconv: conversion descriptor not allocated\n"); + return(instr); + } + + srclen = strlen(instr)+1; + dstlen = maxoutlen; + src = instr; + dst = outstr; + ret = iconv(cd, &src, &srclen, &dst, &dstlen); + if (ret == -1) { + switch(errno) { + case EILSEQ: + fprintf(stderr, "Error: charconv: invalid multibyte sequence in the input\n"); exit(-1); + break; + case EINVAL: + fprintf(stderr, "Error: charconv: incomplete multibyte sequence in the input\n"); exit(-1); + break; + case E2BIG: + fprintf(stderr, "Error: charconv: converted string size exceeded buffer (>%d)\n", maxoutlen); exit(-1); + break; + } + } + + return(outstr); + +#endif + +} Index: julius4/mkbingram/charconv.h diff -u /dev/null julius4/mkbingram/charconv.h:1.1 --- /dev/null Sat Aug 11 19:44:02 2012 +++ julius4/mkbingram/charconv.h Sat Aug 11 19:44:02 2012 @@ -0,0 +1,7 @@ +#ifndef __CHARCONV_H__ +#define __CHARCONV_H__ + +int charconv_setup(char *fromcode, char *tocode); +char *charconv(char *instr, char *outstr, int maxoutlen); + +#endif /* __CHARCONV_H__ */ Index: julius4/mkbingram/mkbingram.c diff -u julius4/mkbingram/mkbingram.c:1.5 julius4/mkbingram/mkbingram.c:1.6 --- julius4/mkbingram/mkbingram.c:1.5 Fri Jul 27 17:44:57 2012 +++ julius4/mkbingram/mkbingram.c Sat Aug 11 19:44:02 2012 @@ -18,7 +18,7 @@ * @author Akinobu LEE * @date Thu Mar 24 12:22:27 2005 * - * $Revision: 1.5 $ + * $Revision: 1.6 $ * */ /* @@ -30,13 +30,15 @@ /* mkbingram --- make binary n-gram for JULIUS from ARPA standard format */ -/* $Id: mkbingram.c,v 1.5 2012/07/27 08:44:57 sumomo Exp $ */ +/* $Id: mkbingram.c,v 1.6 2012/08/11 10:44:02 sumomo Exp $ */ #include <sent/stddefs.h> #include <sent/ngram2.h> #include <sys/stat.h> #include <time.h> +#include "charconv.h" + static NGRAM_INFO *ngram; void @@ -48,6 +50,7 @@ printf(" -nlr file forward N-gram in ARPA format\n"); printf(" -nrl file backward N-gram in ARPA format\n"); printf(" -d bingramfile Julius binary N-gram file input\n"); + printf(" -c from to convert character code\n"); printf(" -swap swap \"%s\" and \"%s\"\n", BEGIN_WORD_DEFAULT, END_WORD_DEFAULT); printf("\n When both \"-nlr\" and \"-nrl\" are specified, \n"); printf(" Julius will use the BACKWARD N-gram as main LM\n"); @@ -66,9 +69,13 @@ time_t now; char *binfile, *lrfile, *rlfile, *outfile; int i; + char *from_code, *to_code, *buf; + boolean charconv_enabled = FALSE; boolean force_swap = FALSE; + WORD_ID w; binfile = lrfile = rlfile = outfile = NULL; + from_code = to_code = NULL; if (argc <= 1) { usage(argv[0]); return -1; @@ -106,6 +113,21 @@ usage(argv[0]); return -1; } + } else if (argv[i][1] == 'c') { + if (++i >= argc) { + printf("Error: no argument for option \"%s\"\n", argv[i]); + usage(argv[0]); + return -1; + } + from_code = strcpy((char*)mymalloc(strlen(argv[i])+1), argv[i]); + if (++i >= argc) { + printf("Error: no argument for option \"%s\"\n", argv[i]); + usage(argv[0]); + free(from_code); + return -1; + } + to_code = strcpy((char*)mymalloc(strlen(argv[i])+1),argv[i]); + charconv_enabled = TRUE; } else if (argv[i][1] == 's') { force_swap = TRUE; } @@ -184,6 +206,21 @@ } print_ngram_info(stdout, ngram); + + if (charconv_enabled == TRUE) { + /* do character conversion */ + if (charconv_setup(from_code, to_code) == -1) { + fprintf(stderr, "failed to setup character convertsion\n"); + return -1; + } + buf = (char *)mymalloc(4096); + for (w = 0; w < ngram->max_word_num; w++) { + charconv(ngram->wname[w], buf, 4096); + ngram->wname[w] = mybmalloc2(strlen(buf)+1, &(ngram->mroot)); + strcpy(ngram->wname[w], buf); + } + free(buf); + } /* write in JULIUS binary format */ if ((fp = fopen_writefile(outfile)) == NULL) {