[Groonga-commit] groonga/groonga at c6fc519 [master] Add stop-word token filter

Back to archive index

Kouhei Sutou null+****@clear*****
Thu Oct 2 23:03:13 JST 2014


Kouhei Sutou	2014-10-02 23:03:13 +0900 (Thu, 02 Oct 2014)

  New Revision: c6fc5197b22f5ae01ceca31d2b0d485b157e6760
  https://github.com/groonga/groonga/commit/c6fc5197b22f5ae01ceca31d2b0d485b157e6760

  Merged fec9865: Merge pull request #209 from groonga/support-token-filters

  Message:
    Add stop-word token filter

  Added files:
    plugins/token_filters/Makefile.am
    plugins/token_filters/stop_word.c
    plugins/token_filters/stop_word_sources.am
  Copied files:
    plugins/token_filters/CMakeLists.txt
      (from plugins/CMakeLists.txt)
  Modified files:
    configure.ac
    plugins/CMakeLists.txt
    plugins/Makefile.am

  Modified: configure.ac (+4 -0)
===================================================================
--- configure.ac    2014-10-02 22:53:26 +0900 (8768ca3)
+++ configure.ac    2014-10-02 23:03:13 +0900 (2be82a1)
@@ -234,6 +234,7 @@ AC_CONFIG_FILES([
   plugins/table/Makefile
   plugins/query_expanders/Makefile
   plugins/ruby/Makefile
+  plugins/token_filters/Makefile
   examples/Makefile
   examples/dictionary/Makefile
   examples/dictionary/edict/Makefile
@@ -1243,6 +1244,9 @@ AC_SUBST(table_pluginsdir)
 ruby_pluginsdir="\${pluginsdir}/ruby"
 AC_SUBST(ruby_pluginsdir)
 
+token_filter_pluginsdir="\${pluginsdir}/token_filters"
+AC_SUBST(token_filter_pluginsdir)
+
 AC_MSG_CHECKING(for the suffix of plugin shared libraries)
 shrext_cmds=$(./libtool --config | grep '^shrext_cmds=')
 eval $shrext_cmds

  Modified: plugins/CMakeLists.txt (+2 -1)
===================================================================
--- plugins/CMakeLists.txt    2014-10-02 22:53:26 +0900 (a2a8f28)
+++ plugins/CMakeLists.txt    2014-10-02 23:03:13 +0900 (42a3045)
@@ -1,4 +1,4 @@
-# Copyright(C) 2012 Brazil
+# Copyright(C) 2012-2014 Brazil
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -18,3 +18,4 @@ add_subdirectory(tokenizers)
 add_subdirectory(table)
 add_subdirectory(query_expanders)
 add_subdirectory(ruby)
+add_subdirectory(token_filters)

  Modified: plugins/Makefile.am (+2 -1)
===================================================================
--- plugins/Makefile.am    2014-10-02 22:53:26 +0900 (75a0059)
+++ plugins/Makefile.am    2014-10-02 23:03:13 +0900 (1d1abf1)
@@ -3,7 +3,8 @@ SUBDIRS =					\
 	suggest					\
 	table					\
 	query_expanders				\
-	ruby
+	ruby					\
+	token_filters
 
 EXTRA_DIST =					\
 	CMakeLists.txt

  Copied: plugins/token_filters/CMakeLists.txt (+16 -6) 56%
===================================================================
--- plugins/CMakeLists.txt    2014-10-02 22:53:26 +0900 (a2a8f28)
+++ plugins/token_filters/CMakeLists.txt    2014-10-02 23:03:13 +0900 (3adbf24)
@@ -1,4 +1,4 @@
-# Copyright(C) 2012 Brazil
+# Copyright(C) 2014 Brazil
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -13,8 +13,18 @@
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
-add_subdirectory(suggest)
-add_subdirectory(tokenizers)
-add_subdirectory(table)
-add_subdirectory(query_expanders)
-add_subdirectory(ruby)
+include_directories(
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../lib
+  )
+
+set(TOKEN_FILTERS_DIR "${GRN_RELATIVE_PLUGINS_DIR}/token_filters")
+
+read_file_list(${CMAKE_CURRENT_SOURCE_DIR}/stop_word_sources.am
+  STOP_WORD_SOURCES)
+set_source_files_properties(${STOP_WORD_SOURCES}
+  PROPERTIES
+  COMPILE_FLAGS "${GRN_C_COMPILE_FLAGS}")
+set_target_properties(stop_word_token_filter PROPERTIES
+  PREFIX ""
+  OUTPUT_NAME "stop_word")
+install(TARGETS stop_word_token_filter DESTINATION "${TOKEN_FILTERS_DIR}")

  Added: plugins/token_filters/Makefile.am (+20 -0) 100644
===================================================================
--- /dev/null
+++ plugins/token_filters/Makefile.am    2014-10-02 23:03:13 +0900 (8d77466)
@@ -0,0 +1,20 @@
+EXTRA_DIST =					\
+	CMakeLists.txt
+
+AM_CPPFLAGS =					\
+	-I$(top_builddir)			\
+	-I$(top_srcdir)/include			\
+	-I$(top_srcdir)/lib
+
+AM_LDFLAGS =					\
+	-avoid-version				\
+	-module					\
+	-no-undefined
+
+LIBS =						\
+	$(top_builddir)/lib/libgroonga.la
+
+token_filter_plugins_LTLIBRARIES =
+token_filter_plugins_LTLIBRARIES += stop_word.la
+
+include stop_word_sources.am

  Added: plugins/token_filters/stop_word.c (+153 -0) 100644
===================================================================
--- /dev/null
+++ plugins/token_filters/stop_word.c    2014-10-02 23:03:13 +0900 (1f976de)
@@ -0,0 +1,153 @@
+/* -*- c-basic-offset: 2 -*- */
+/*
+  Copyright(C) 2014 Brazil
+
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License version 2.1 as published by the Free Software Foundation.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <str.h>
+
+#include <groonga.h>
+#include <groonga/token_filter.h>
+
+#include <string.h>
+
+#define COLUMN_NAME "stop_word"
+
+typedef struct {
+  grn_obj *table;
+  grn_token_mode mode;
+  grn_obj *column;
+  grn_obj value;
+  grn_tokenizer_token token;
+} grn_stop_word_token_filter;
+
+static grn_obj *
+stop_word_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_stop_word_token_filter *token_filter;
+
+  token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(grn_stop_word_token_filter));
+  if (!token_filter) {
+    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
+                     "[token-filter][stop-word] "
+                     "failed to allocate grn_stop_word_token_filter");
+    return NULL;
+  }
+
+  token_filter->table = args[0];
+  token_filter->mode = GRN_UINT32_VALUE(args[1]);
+  token_filter->column = grn_obj_column(ctx,
+                                        token_filter->table,
+                                        COLUMN_NAME,
+                                        strlen(COLUMN_NAME));
+  if (!token_filter->column) {
+    char table_name[GRN_TABLE_MAX_KEY_SIZE];
+    unsigned int table_name_size;
+
+    table_name_size = grn_obj_name(ctx,
+                                   token_filter->table,
+                                   table_name,
+                                   GRN_TABLE_MAX_KEY_SIZE);
+    GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR,
+                     "[token-filter][stop-word] "
+                     "column for judging stop word doesn't exit: <%.*s.%s>",
+                     table_name_size,
+                     table_name,
+                     COLUMN_NAME);
+    GRN_PLUGIN_FREE(ctx, token_filter);
+    return NULL;
+  }
+
+  user_data->ptr = token_filter;
+
+  GRN_BOOL_INIT(&(token_filter->value), 0);
+  grn_tokenizer_token_init(ctx, &(token_filter->token));
+
+  return NULL;
+}
+
+static grn_obj *
+stop_word_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_stop_word_token_filter *token_filter = user_data->ptr;
+  grn_obj *current_token = args[0];
+  int status = GRN_INT32_VALUE(args[1]);
+
+  if (token_filter->mode == GRN_TOKEN_GET) {
+    grn_id id;
+    id = grn_table_get(ctx,
+                       token_filter->table,
+                       GRN_TEXT_VALUE(current_token),
+                       GRN_TEXT_LEN(current_token));
+    if (id != GRN_ID_NIL) {
+      GRN_BULK_REWIND(&(token_filter->value));
+      grn_obj_get_value(ctx,
+                        token_filter->column,
+                        id,
+                        &(token_filter->value));
+      if (GRN_BOOL_VALUE(&(token_filter->value))) {
+        status |= GRN_TOKENIZER_TOKEN_SKIP;
+      }
+    }
+  }
+
+  grn_tokenizer_token_push(ctx,
+                           &(token_filter->token),
+                           GRN_TEXT_VALUE(current_token),
+                           GRN_TEXT_LEN(current_token),
+                           status);
+
+  return NULL;
+}
+
+static grn_obj *
+stop_word_fin(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
+{
+  grn_stop_word_token_filter *token_filter = user_data->ptr;
+  if (!token_filter) {
+    return NULL;
+  }
+  grn_tokenizer_token_fin(ctx, &(token_filter->token));
+  grn_obj_unlink(ctx, token_filter->column);
+  grn_obj_unlink(ctx, &(token_filter->value));
+  GRN_PLUGIN_FREE(ctx, token_filter);
+  return NULL;
+}
+
+grn_rc
+GRN_PLUGIN_INIT(grn_ctx *ctx)
+{
+  return ctx->rc;
+}
+
+grn_rc
+GRN_PLUGIN_REGISTER(grn_ctx *ctx)
+{
+  grn_rc rc;
+
+  rc = grn_token_filter_register(ctx,
+                                 "TokenFilterStopWord", -1,
+                                 stop_word_init,
+                                 stop_word_next,
+                                 stop_word_fin);
+
+  return rc;
+}
+
+grn_rc
+GRN_PLUGIN_FIN(grn_ctx *ctx)
+{
+  return GRN_SUCCESS;
+}

  Added: plugins/token_filters/stop_word_sources.am (+2 -0) 100644
===================================================================
--- /dev/null
+++ plugins/token_filters/stop_word_sources.am    2014-10-02 23:03:13 +0900 (bab8955)
@@ -0,0 +1,2 @@
+stop_word_la_SOURCES =				\
+	stop_word.c
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Descargar 



More information about the Groonga-commit mailing list
Back to archive index