Kouhei Sutou
null+****@clear*****
Thu Oct 2 23:03:13 JST 2014
Kouhei Sutou 2014-10-02 23:03:13 +0900 (Thu, 02 Oct 2014) New Revision: c6fc5197b22f5ae01ceca31d2b0d485b157e6760 https://github.com/groonga/groonga/commit/c6fc5197b22f5ae01ceca31d2b0d485b157e6760 Merged fec9865: Merge pull request #209 from groonga/support-token-filters Message: Add stop-word token filter Added files: plugins/token_filters/Makefile.am plugins/token_filters/stop_word.c plugins/token_filters/stop_word_sources.am Copied files: plugins/token_filters/CMakeLists.txt (from plugins/CMakeLists.txt) Modified files: configure.ac plugins/CMakeLists.txt plugins/Makefile.am Modified: configure.ac (+4 -0) =================================================================== --- configure.ac 2014-10-02 22:53:26 +0900 (8768ca3) +++ configure.ac 2014-10-02 23:03:13 +0900 (2be82a1) @@ -234,6 +234,7 @@ AC_CONFIG_FILES([ plugins/table/Makefile plugins/query_expanders/Makefile plugins/ruby/Makefile + plugins/token_filters/Makefile examples/Makefile examples/dictionary/Makefile examples/dictionary/edict/Makefile @@ -1243,6 +1244,9 @@ AC_SUBST(table_pluginsdir) ruby_pluginsdir="\${pluginsdir}/ruby" AC_SUBST(ruby_pluginsdir) +token_filter_pluginsdir="\${pluginsdir}/token_filters" +AC_SUBST(token_filter_pluginsdir) + AC_MSG_CHECKING(for the suffix of plugin shared libraries) shrext_cmds=$(./libtool --config | grep '^shrext_cmds=') eval $shrext_cmds Modified: plugins/CMakeLists.txt (+2 -1) =================================================================== --- plugins/CMakeLists.txt 2014-10-02 22:53:26 +0900 (a2a8f28) +++ plugins/CMakeLists.txt 2014-10-02 23:03:13 +0900 (42a3045) @@ -1,4 +1,4 @@ -# Copyright(C) 2012 Brazil +# Copyright(C) 2012-2014 Brazil # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -18,3 +18,4 @@ add_subdirectory(tokenizers) add_subdirectory(table) add_subdirectory(query_expanders) add_subdirectory(ruby) +add_subdirectory(token_filters) Modified: plugins/Makefile.am (+2 -1) 
=================================================================== --- plugins/Makefile.am 2014-10-02 22:53:26 +0900 (75a0059) +++ plugins/Makefile.am 2014-10-02 23:03:13 +0900 (1d1abf1) @@ -3,7 +3,8 @@ SUBDIRS = \ suggest \ table \ query_expanders \ - ruby + ruby \ + token_filters EXTRA_DIST = \ CMakeLists.txt Copied: plugins/token_filters/CMakeLists.txt (+16 -6) 56% =================================================================== --- plugins/CMakeLists.txt 2014-10-02 22:53:26 +0900 (a2a8f28) +++ plugins/token_filters/CMakeLists.txt 2014-10-02 23:03:13 +0900 (3adbf24) @@ -1,4 +1,4 @@ -# Copyright(C) 2012 Brazil +# Copyright(C) 2014 Brazil # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -13,8 +13,18 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -add_subdirectory(suggest) -add_subdirectory(tokenizers) -add_subdirectory(table) -add_subdirectory(query_expanders) -add_subdirectory(ruby) +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../../lib + ) + +set(TOKEN_FILTERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/token_filters") + +read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/stop_word_sources.am + STOP_WORD_SOURCES) +set_source_files_properties(${STOP_WORD_SOURCES} + PROPERTIES + COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}") +set_target_properties(stop_word_token_filter PROPERTIES + PREFIX "" + OUTPUT_NAME "stop_word") +install(TARGETS stop_word_token_filter DESTINATION "${TOKEN_FILTERS_DIR}") Added: plugins/token_filters/Makefile.am (+20 -0) 100644 =================================================================== --- /dev/null +++ plugins/token_filters/Makefile.am 2014-10-02 23:03:13 +0900 (8d77466) @@ -0,0 +1,20 @@ +EXTRA_DIST = \ + CMakeLists.txt + +AM_CPPFLAGS = \ + -I$(top_builddir) \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/lib + +AM_LDFLAGS = \ + -avoid-version \ + -module \ + 
-no-undefined + +LIBS = \ + $(top_builddir)/lib/libgroonga.la + +token_filter_plugins_LTLIBRARIES = +token_filter_plugins_LTLIBRARIES += stop_word.la + +include stop_word_sources.am Added: plugins/token_filters/stop_word.c (+153 -0) 100644 =================================================================== --- /dev/null +++ plugins/token_filters/stop_word.c 2014-10-02 23:03:13 +0900 (1f976de) @@ -0,0 +1,153 @@ +/* -*- c-basic-offset: 2 -*- */ +/* + Copyright(C) 2014 Brazil + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License version 2.1 as published by the Free Software Foundation. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <str.h> + +#include <groonga.h> +#include <groonga/token_filter.h> + +#include <string.h> + +#define COLUMN_NAME "stop_word" + +typedef struct { + grn_obj *table; + grn_token_mode mode; + grn_obj *column; + grn_obj value; + grn_tokenizer_token token; +} grn_stop_word_token_filter; + +static grn_obj * +stop_word_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_stop_word_token_filter *token_filter; + + token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stop_word_token_filter)); + if (!token_filter) { + GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, + "[token-filter][stop-word] " + "failed to allocate grn_stop_word_token_filter"); + return NULL; + } + + token_filter->table = args[0]; + token_filter->mode = GRN_UINT32_VALUE(args[1]); + token_filter->column = grn_obj_column(ctx, + token_filter->table, + 
COLUMN_NAME, + strlen(COLUMN_NAME)); + if (!token_filter->column) { + char table_name[GRN_TABLE_MAX_KEY_SIZE]; + unsigned int table_name_size; + + table_name_size = grn_obj_name(ctx, + token_filter->table, + table_name, + GRN_TABLE_MAX_KEY_SIZE); + GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR, + "[token-filter][stop-word] " + "column for judging stop word doesn't exit: <%.*s.%s>", + table_name_size, + table_name, + COLUMN_NAME); + GRN_PLUGIN_FREE(ctx, token_filter); + return NULL; + } + + user_data->ptr = token_filter; + + GRN_BOOL_INIT(&(token_filter->value), 0); + grn_tokenizer_token_init(ctx, &(token_filter->token)); + + return NULL; +} + +static grn_obj * +stop_word_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_stop_word_token_filter *token_filter = user_data->ptr; + grn_obj *current_token = args[0]; + int status = GRN_INT32_VALUE(args[1]); + + if (token_filter->mode == GRN_TOKEN_GET) { + grn_id id; + id = grn_table_get(ctx, + token_filter->table, + GRN_TEXT_VALUE(current_token), + GRN_TEXT_LEN(current_token)); + if (id != GRN_ID_NIL) { + GRN_BULK_REWIND(&(token_filter->value)); + grn_obj_get_value(ctx, + token_filter->column, + id, + &(token_filter->value)); + if (GRN_BOOL_VALUE(&(token_filter->value))) { + status |= GRN_TOKENIZER_TOKEN_SKIP; + } + } + } + + grn_tokenizer_token_push(ctx, + &(token_filter->token), + GRN_TEXT_VALUE(current_token), + GRN_TEXT_LEN(current_token), + status); + + return NULL; +} + +static grn_obj * +stop_word_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) +{ + grn_stop_word_token_filter *token_filter = user_data->ptr; + if (!token_filter) { + return NULL; + } + grn_tokenizer_token_fin(ctx, &(token_filter->token)); + grn_obj_unlink(ctx, token_filter->column); + grn_obj_unlink(ctx, &(token_filter->value)); + GRN_PLUGIN_FREE(ctx, token_filter); + return NULL; +} + +grn_rc +GRN_PLUGIN_INIT(grn_ctx *ctx) +{ + return ctx->rc; +} + +grn_rc +GRN_PLUGIN_REGISTER(grn_ctx *ctx) +{ + 
grn_rc rc; + + rc = grn_token_filter_register(ctx, + "TokenFilterStopWord", -1, + stop_word_init, + stop_word_next, + stop_word_fin); + + return rc; +} + +grn_rc +GRN_PLUGIN_FIN(grn_ctx *ctx) +{ + return GRN_SUCCESS; +} Added: plugins/token_filters/stop_word_sources.am (+2 -0) 100644 =================================================================== --- /dev/null +++ plugins/token_filters/stop_word_sources.am 2014-10-02 23:03:13 +0900 (bab8955) @@ -0,0 +1,2 @@ +stop_word_la_SOURCES = \ + stop_word.c -------------- next part -------------- HTMLの添付ファイルを保管しました...Descargar