Kouhei Sutou	2018-10-29 15:44:25 +0900 (Mon, 29 Oct 2018)

  Revision: 3744cc101832044d4f5a9e2ccd9ff63fb3dcaf40
  https://github.com/groonga/groonga/commit/3744cc101832044d4f5a9e2ccd9ff63fb3dcaf40

  Message:
    TokenFilterStem: add algorithm option

  Added files:
    test/command/suite/token_filters/stem/french.expected
    test/command/suite/token_filters/stem/french.test
  Modified files:
    plugins/token_filters/stem.c

  Modified: plugins/token_filters/stem.c (+88 -10)
===================================================================
--- plugins/token_filters/stem.c    2018-10-29 15:44:07 +0900 (e918ed8a1)
+++ plugins/token_filters/stem.c    2018-10-29 15:44:25 +0900 (dd61dbe37)
@@ -1,6 +1,7 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
   Copyright(C) 2014 Brazil
+  Copyright(C) 2018 Kouhei Sutou <kou****@clear*****>
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -31,16 +32,93 @@
 #include <libstemmer.h>
 
 typedef struct {
+  grn_obj algorithm;
+} grn_stem_token_filter_options;
+
+typedef struct {
+  grn_stem_token_filter_options *options;
   struct sb_stemmer *stemmer;
   grn_tokenizer_token token;
   grn_obj buffer;
 } grn_stem_token_filter;
 
+static void
+stem_options_init(grn_ctx *ctx, grn_stem_token_filter_options *options)
+{
+  GRN_TEXT_INIT(&(options->algorithm), 0);
+  GRN_TEXT_SETS(ctx, &(options->algorithm), "english");
+  GRN_TEXT_PUTC(ctx, &(options->algorithm), '\0');
+}
+
 static void *
-stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
+stem_open_options(grn_ctx *ctx,
+                  grn_obj *tokenizer,
+                  grn_obj *raw_options,
+                  void *user_data)
 {
+  grn_stem_token_filter_options *options;
+
+  options = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter_options));
+  if (!options) {
+    GRN_PLUGIN_ERROR(ctx,
+                     GRN_NO_MEMORY_AVAILABLE,
+                     "[token-filter][stem] "
+                     "failed to allocate memory for options");
+    return NULL;
+  }
+
+  stem_options_init(ctx, options);
+
+  GRN_OPTION_VALUES_EACH_BEGIN(ctx, raw_options, i, name, name_length) {
+    grn_raw_string name_raw;
+    name_raw.value = name;
+    name_raw.length = name_length;
+
+    if (GRN_RAW_STRING_EQUAL_CSTRING(name_raw, "algorithm")) {
+      const char *algorithm;
+      unsigned int length;
+      length = grn_vector_get_element(ctx,
+                                      raw_options,
+                                      i,
+                                      &algorithm,
+                                      NULL,
+                                      NULL);
+      GRN_TEXT_SET(ctx, &(options->algorithm), algorithm, length);
+      GRN_TEXT_PUTC(ctx, &(options->algorithm), '\0');
+    }
+  } GRN_OPTION_VALUES_EACH_END();
+
+  return options;
+}
+
+static void
+stem_close_options(grn_ctx *ctx, void *data)
+{
+  grn_stem_token_filter_options *options = data;
+  GRN_OBJ_FIN(ctx, &(options->algorithm));
+  GRN_PLUGIN_FREE(ctx, options);
+}
+
+static void *
+stem_init(grn_ctx *ctx, grn_tokenizer_query *query)
+{
+  grn_obj *lexicon;
+  unsigned int i;
+  grn_stem_token_filter_options *options;
   grn_stem_token_filter *token_filter;
 
+  lexicon = grn_tokenizer_query_get_lexicon(ctx, query);
+  i = grn_tokenizer_query_get_token_filter_index(ctx, query);
+  options = grn_table_cache_token_filter_options(ctx,
+                                                 lexicon,
+                                                 i,
+                                                 stem_open_options,
+                                                 stem_close_options,
+                                                 NULL);
+  if (ctx->rc != GRN_SUCCESS) {
+    return NULL;
+  }
+
   token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stem_token_filter));
   if (!token_filter) {
     GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
@@ -48,10 +126,11 @@ stem_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
                      "failed to allocate grn_stem_token_filter");
     return NULL;
   }
+  token_filter->options = options;
 
   {
-    /* TODO: Support other languages. */
-    const char *algorithm = "english";
+    const char *algorithm = GRN_TEXT_VALUE(&(token_filter->options->algorithm));
+    /* TODO: Support other encoding. */
     const char *encoding = "UTF_8";
     token_filter->stemmer = sb_stemmer_new(algorithm, encoding);
     if (!token_filter->stemmer) {
@@ -261,15 +340,14 @@ GRN_PLUGIN_INIT(grn_ctx *ctx)
 grn_rc
 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
 {
-  grn_rc rc;
+  grn_obj *token_filter;
 
-  rc = grn_token_filter_register(ctx,
-                                 "TokenFilterStem", -1,
-                                 stem_init,
-                                 stem_filter,
-                                 stem_fin);
+  token_filter = grn_token_filter_create(ctx, "TokenFilterStem", -1);
+  grn_token_filter_set_init_func(ctx, token_filter, stem_init);
+  grn_token_filter_set_filter_func(ctx, token_filter, stem_filter);
+  grn_token_filter_set_fin_func(ctx, token_filter, stem_fin);
 
-  return rc;
+  return ctx->rc;
 }
 
 grn_rc

  Added: test/command/suite/token_filters/stem/french.expected (+49 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stem/french.expected    2018-10-29 15:44:25 +0900 (bf2362501)
@@ -0,0 +1,49 @@
+plugin_register token_filters/stem
+[[0,0.0,0.0],true]
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto --token_filters 'TokenFilterStem("algorithm", "french")'
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+[[0,0.0,0.0],true]
+load --table Memos
+[
+{"content": "maintenait"},
+{"content": "maintenant"}
+]
+[[0,0.0,0.0],2]
+select Memos --match_columns content --query "maintenir"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        2
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "maintenait"
+      ],
+      [
+        2,
+        "maintenant"
+      ]
+    ]
+  ]
+]

  Added: test/command/suite/token_filters/stem/french.test (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stem/french.test    2018-10-29 15:44:25 +0900 (ef48bb533)
@@ -0,0 +1,20 @@
+#@on-error omit
+plugin_register token_filters/stem
+#@on-error default
+
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto \
+  --token_filters 'TokenFilterStem("algorithm", "french")'
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+
+load --table Memos
+[
+{"content": "maintenait"},
+{"content": "maintenant"}
+]
+
+select Memos --match_columns content --query "maintenir"
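
Note (not part of this commit): the value of the new "algorithm" option is
handed straight to libstemmer's sb_stemmer_new(), so the accepted names are
whatever Snowball algorithms the linked libstemmer provides, and an unknown
name makes sb_stemmer_new() return NULL, which stem_init() reports as an
error. A minimal standalone sketch, assuming a system libstemmer linked with
-lstemmer, that lists the valid names and shows the failure case:

  #include <stdio.h>
  #include <libstemmer.h>

  int
  main(void)
  {
    /* The names printed here are the valid values for
       TokenFilterStem("algorithm", ...). */
    const char **names = sb_stemmer_list();
    for (size_t i = 0; names[i]; i++) {
      printf("%s\n", names[i]);
    }

    /* An unknown name fails, mirroring the error path in stem_init(). */
    struct sb_stemmer *stemmer = sb_stemmer_new("no-such-algorithm", "UTF_8");
    if (!stemmer) {
      printf("sb_stemmer_new() returned NULL for an unknown algorithm\n");
    } else {
      sb_stemmer_delete(stemmer);
    }
    return 0;
  }

When no option is given, stem_options_init() keeps the previous default, so
TokenFilterStem without arguments still stems with the "english" algorithm.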
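
The new test relies on the French Snowball algorithm reducing "maintenait",
"maintenant", and the query term "maintenir" to a common stem, which is why
the select in french.test matches both records. A second standalone sketch
(same libstemmer assumption as above) that prints the stems the index terms
and the query term are reduced to:

  #include <stdio.h>
  #include <string.h>
  #include <libstemmer.h>

  int
  main(void)
  {
    const char *algorithm = "french"; /* the option value used in french.test */
    const char *encoding = "UTF_8";   /* the only encoding stem.c supports */
    struct sb_stemmer *stemmer = sb_stemmer_new(algorithm, encoding);
    if (!stemmer) {
      fprintf(stderr, "failed to create a %s stemmer\n", algorithm);
      return 1;
    }

    const char *terms[] = {"maintenait", "maintenant", "maintenir"};
    for (size_t i = 0; i < sizeof(terms) / sizeof(terms[0]); i++) {
      const sb_symbol *stemmed =
        sb_stemmer_stem(stemmer, (const sb_symbol *)terms[i],
                        (int)strlen(terms[i]));
      /* The stemmed form may not be NUL-terminated; print it with the
         length reported by sb_stemmer_length(). */
      printf("%s -> %.*s\n", terms[i],
             sb_stemmer_length(stemmer), (const char *)stemmed);
    }

    sb_stemmer_delete(stemmer);
    return 0;
  }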