[Groonga-commit] groonga/groonga at 2215657 [master] Add new tokenizer API

Back to archive index

Kouhei Sutou null+****@clear*****
Wed May 9 10:50:40 JST 2018


Kouhei Sutou	2018-05-09 10:50:40 +0900 (Wed, 09 May 2018)

  New Revision: 2215657d0d97c498fd0dc23b272e1e03d911be0b
  https://github.com/groonga/groonga/commit/2215657d0d97c498fd0dc23b272e1e03d911be0b

  Message:
    Add new tokenizer API
    
    grn_tokenizer_register() is deprecated.
    Use grn_tokenizer_create() and grn_tokenizer_set_*() API to register a
    new tokenizer.

  Modified files:
    include/groonga/tokenizer.h
    lib/grn_db.h
    lib/grn_token_cursor.h
    lib/token_cursor.c

  Modified: include/groonga/tokenizer.h (+36 -4)
===================================================================
--- include/groonga/tokenizer.h    2018-05-09 10:49:16 +0900 (e62a94776)
+++ include/groonga/tokenizer.h    2018-05-09 10:50:40 +0900 (75f6cd3e4)
@@ -263,11 +263,43 @@ GRN_PLUGIN_EXPORT const char *grn_tokenizer_tokenized_delimiter_next(grn_ctx *ct
   GRN_SUCCESS on success, an error code on failure. See "groonga.h" for more
   details of grn_proc_func and grn_user_data, that is used as an argument of
   grn_proc_func.
+
+  Deprecated since 8.0.2. Use grn_tokenizer_create() and
+  grn_tokenizer_set_*_func().
  */
-GRN_PLUGIN_EXPORT grn_rc grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
-                                                unsigned int plugin_name_length,
-                                                grn_proc_func *init, grn_proc_func *next,
-                                                grn_proc_func *fin);
+GRN_PLUGIN_EXPORT grn_rc
+grn_tokenizer_register(grn_ctx *ctx, const char *plugin_name_ptr,
+                       unsigned int plugin_name_length,
+                       grn_proc_func *init, grn_proc_func *next,
+                       grn_proc_func *fin);
+
+GRN_PLUGIN_EXPORT grn_obj *
+grn_tokenizer_create(grn_ctx *ctx,
+                     const char *name,
+                     int name_length);
+
+typedef void *grn_tokenizer_init_func(grn_ctx *ctx,
+                                      grn_tokenizer_query *query);
+typedef void grn_tokenizer_next_func(grn_ctx *ctx,
+                                     grn_tokenizer_query *query,
+                                     grn_token *token,
+                                     void *user_data);
+typedef void grn_tokenizer_fin_func(grn_ctx *ctx,
+                                    void *user_data);
+
+
+GRN_PLUGIN_EXPORT grn_rc
+grn_tokenizer_set_init_func(grn_ctx *ctx,
+                            grn_obj *tokenizer,
+                            grn_tokenizer_init_func *init);
+GRN_PLUGIN_EXPORT grn_rc
+grn_tokenizer_set_next_func(grn_ctx *ctx,
+                            grn_obj *tokenizer,
+                            grn_tokenizer_next_func *next);
+GRN_PLUGIN_EXPORT grn_rc
+grn_tokenizer_set_fin_func(grn_ctx *ctx,
+                           grn_obj *tokenizer,
+                           grn_tokenizer_fin_func *fin);
 
 #ifdef __cplusplus
 }  /* extern "C" */

  Modified: lib/grn_db.h (+5 -0)
===================================================================
--- lib/grn_db.h    2018-05-09 10:49:16 +0900 (95a5a381e)
+++ lib/grn_db.h    2018-05-09 10:50:40 +0900 (891f9922e)
@@ -268,6 +268,11 @@ struct _grn_proc {
       grn_command_run_func *run;
     } command;
     struct {
+      grn_tokenizer_init_func *init;
+      grn_tokenizer_next_func *next;
+      grn_tokenizer_fin_func  *fin;
+    } tokenizer;
+    struct {
       grn_token_filter_init_func   *init;
       grn_token_filter_filter_func *filter;
       grn_token_filter_fin_func    *fin;

  Modified: lib/grn_token_cursor.h (+9 -3)
===================================================================
--- lib/grn_token_cursor.h    2018-05-09 10:49:16 +0900 (f9d064724)
+++ lib/grn_token_cursor.h    2018-05-09 10:50:40 +0900 (352e39563)
@@ -19,6 +19,7 @@
 #pragma once
 
 #include "grn_ctx.h"
+#include "grn_token.h"
 #include "grn_tokenizer.h"
 #include "grn_db.h"
 
@@ -50,14 +51,19 @@ typedef struct {
   grn_bool force_prefix;
   grn_obj_flags table_flags;
   grn_encoding encoding;
-  grn_obj *tokenizer;
-  grn_proc_ctx pctx;
+  struct {
+    grn_obj *object;
+    grn_proc_ctx pctx;
+    grn_tokenizer_query query;
+    void *user_data;
+    grn_token current_token;
+    grn_token next_token;
+  } tokenizer;
   struct {
     grn_obj *objects;
     void **data;
   } token_filter;
   uint32_t variant;
-  grn_obj *nstr;
 } grn_token_cursor;
 
 #define GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER (0x01L<<0)

  Modified: lib/token_cursor.c (+111 -77)
===================================================================
--- lib/token_cursor.c    2018-05-09 10:49:16 +0900 (e21e3c9c1)
+++ lib/token_cursor.c    2018-05-09 10:50:40 +0900 (b9acbf170)
@@ -1,6 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
 /*
-  Copyright(C) 2009-2017 Brazil
+  Copyright(C) 2009-2018 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -75,59 +75,80 @@ grn_token_cursor_open(grn_ctx *ctx, grn_obj *table,
   token_cursor->table = table;
   token_cursor->mode = mode;
   token_cursor->encoding = encoding;
-  token_cursor->tokenizer = tokenizer;
+  token_cursor->tokenizer.object = tokenizer;
+  grn_tokenizer_query_init(ctx, &(token_cursor->tokenizer.query));
+  grn_tokenizer_query_set_lexicon(ctx, &(token_cursor->tokenizer.query), table);
+  grn_tokenizer_query_set_flags(ctx, &(token_cursor->tokenizer.query), flags);
+  grn_tokenizer_query_set_mode(ctx, &(token_cursor->tokenizer.query), mode);
+  grn_token_init(ctx, &(token_cursor->tokenizer.current_token));
+  grn_token_init(ctx, &(token_cursor->tokenizer.next_token));
   token_cursor->token_filter.objects = token_filters;
   token_cursor->token_filter.data = NULL;
   token_cursor->orig = (const unsigned char *)str;
   token_cursor->orig_blen = str_len;
   token_cursor->curr = NULL;
-  token_cursor->nstr = NULL;
   token_cursor->curr_size = 0;
   token_cursor->pos = -1;
   token_cursor->status = GRN_TOKEN_CURSOR_DOING;
   token_cursor->force_prefix = GRN_FALSE;
   if (tokenizer) {
-    grn_obj str_, flags_, mode_;
-    GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
-    GRN_TEXT_SET_REF(&str_, str, str_len);
-    GRN_UINT32_INIT(&flags_, 0);
-    GRN_UINT32_SET(ctx, &flags_, flags);
-    GRN_UINT32_INIT(&mode_, 0);
-    GRN_UINT32_SET(ctx, &mode_, mode);
-    token_cursor->pctx.caller = NULL;
-    token_cursor->pctx.user_data.ptr = NULL;
-    token_cursor->pctx.proc = (grn_proc *)tokenizer;
-    token_cursor->pctx.hooks = NULL;
-    token_cursor->pctx.currh = NULL;
-    token_cursor->pctx.phase = PROC_INIT;
-    grn_ctx_push(ctx, &mode_);
-    grn_ctx_push(ctx, &str_);
-    grn_ctx_push(ctx, &flags_);
-    ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx, 1, &table, &token_cursor->pctx.user_data);
-    grn_obj_close(ctx, &flags_);
-    grn_obj_close(ctx, &str_);
-    grn_obj_close(ctx, &mode_);
+    grn_proc *tokenizer_proc = (grn_proc *)tokenizer;
+    if (tokenizer_proc->callbacks.tokenizer.init) {
+      grn_tokenizer_query *query = &(token_cursor->tokenizer.query);
+      grn_tokenizer_query_set_raw_string(ctx, query, str, str_len);
+      if (ctx->rc != GRN_SUCCESS) {
+        goto exit;
+      }
+      token_cursor->tokenizer.user_data =
+        tokenizer_proc->callbacks.tokenizer.init(ctx, query);
+    } else if (tokenizer_proc->funcs[PROC_INIT]) {
+      grn_obj str_, flags_, mode_;
+      GRN_TEXT_INIT(&str_, GRN_OBJ_DO_SHALLOW_COPY);
+      GRN_TEXT_SET_REF(&str_, str, str_len);
+      GRN_UINT32_INIT(&flags_, 0);
+      GRN_UINT32_SET(ctx, &flags_, flags);
+      GRN_UINT32_INIT(&mode_, 0);
+      GRN_UINT32_SET(ctx, &mode_, mode);
+      token_cursor->tokenizer.pctx.caller = NULL;
+      token_cursor->tokenizer.pctx.user_data.ptr = NULL;
+      token_cursor->tokenizer.pctx.proc = (grn_proc *)tokenizer;
+      token_cursor->tokenizer.pctx.hooks = NULL;
+      token_cursor->tokenizer.pctx.currh = NULL;
+      token_cursor->tokenizer.pctx.phase = PROC_INIT;
+      grn_ctx_push(ctx, &mode_);
+      grn_ctx_push(ctx, &str_);
+      grn_ctx_push(ctx, &flags_);
+      ((grn_proc *)tokenizer)->funcs[PROC_INIT](ctx,
+                                                1,
+                                                &table,
+                                                &(token_cursor->tokenizer.pctx.user_data));
+      grn_obj_close(ctx, &flags_);
+      grn_obj_close(ctx, &str_);
+      grn_obj_close(ctx, &mode_);
+    }
   } else {
-    int nflags = 0;
-    token_cursor->nstr = grn_string_open_(ctx, str, str_len,
-                                          table,
-                                          nflags,
-                                          token_cursor->encoding);
-    if (token_cursor->nstr) {
-      const char *normalized;
-      grn_string_get_normalized(ctx, token_cursor->nstr,
-                                &normalized, &(token_cursor->curr_size), NULL);
-      token_cursor->curr = (const unsigned char *)normalized;
-    } else {
-      ERR(GRN_TOKENIZER_ERROR,
-          "[token-cursor][open] failed to grn_string_open()");
+    grn_tokenizer_query *query = &(token_cursor->tokenizer.query);
+    grn_obj *string;
+    const char *normalized;
+
+    grn_tokenizer_query_set_raw_string(ctx, query, str, str_len);
+    if (ctx->rc != GRN_SUCCESS) {
+      goto exit;
     }
+    string = grn_tokenizer_query_get_normalized_string(ctx, query);
+    grn_string_get_normalized(ctx,
+                              string,
+                              &normalized,
+                              &(token_cursor->curr_size),
+                              NULL);
+    token_cursor->curr = (const unsigned char *)normalized;
   }
 
   if (ctx->rc == GRN_SUCCESS) {
     grn_token_cursor_open_initialize_token_filters(ctx, token_cursor);
   }
 
+exit :
   if (ctx->rc) {
     grn_token_cursor_close(ctx, token_cursor);
     token_cursor = NULL;
@@ -137,14 +158,12 @@ grn_token_cursor_open(grn_ctx *ctx, grn_obj *table,
 
 static int
 grn_token_cursor_next_apply_token_filters(grn_ctx *ctx,
-                                          grn_token_cursor *token_cursor,
-                                          grn_obj *current_token_data,
-                                          grn_obj *status)
+                                          grn_token_cursor *token_cursor)
 {
   grn_obj *token_filters = token_cursor->token_filter.objects;
   unsigned int i, n_token_filters;
-  grn_token current_token;
-  grn_token next_token;
+  grn_token *current_token = &(token_cursor->tokenizer.current_token);
+  grn_token *next_token = &(token_cursor->tokenizer.next_token);
 
   if (token_filters) {
     n_token_filters = GRN_BULK_VSIZE(token_filters) / sizeof(grn_obj *);
@@ -152,17 +171,7 @@ grn_token_cursor_next_apply_token_filters(grn_ctx *ctx,
     n_token_filters = 0;
   }
 
-  grn_token_init(ctx, &current_token);
-  GRN_TEXT_SET(ctx, &(current_token.data),
-               GRN_TEXT_VALUE(current_token_data),
-               GRN_TEXT_LEN(current_token_data));
-  current_token.status = GRN_INT32_VALUE(status);
-  grn_token_init(ctx, &next_token);
-  GRN_TEXT_SET(ctx, &(next_token.data),
-               GRN_TEXT_VALUE(&(current_token.data)),
-               GRN_TEXT_LEN(&(current_token.data)));
-  next_token.status = current_token.status;
-
+  grn_token_copy(ctx, next_token, current_token);
   for (i = 0; i < n_token_filters; i++) {
     grn_obj *token_filter_object = GRN_PTR_VALUE_AT(token_filters, i);
     grn_proc *token_filter = (grn_proc *)token_filter_object;
@@ -171,26 +180,25 @@ grn_token_cursor_next_apply_token_filters(grn_ctx *ctx,
 #define SKIP_FLAGS\
     (GRN_TOKEN_SKIP |\
      GRN_TOKEN_SKIP_WITH_POSITION)
-    if (current_token.status & SKIP_FLAGS) {
+    if (grn_token_get_status(ctx, current_token) & SKIP_FLAGS) {
       break;
     }
 #undef SKIP_FLAGS
 
     token_filter->callbacks.token_filter.filter(ctx,
-                                                &current_token,
-                                                &next_token,
+                                                current_token,
+                                                next_token,
                                                 data);
-    GRN_TEXT_SET(ctx, &(current_token.data),
-                 GRN_TEXT_VALUE(&(next_token.data)),
-                 GRN_TEXT_LEN(&(next_token.data)));
-    current_token.status = next_token.status;
+    grn_token_copy(ctx, current_token, next_token);
   }
 
-  token_cursor->curr =
-    (const unsigned char *)GRN_TEXT_VALUE(&(current_token.data));
-  token_cursor->curr_size = GRN_TEXT_LEN(&(current_token.data));
+  {
+    size_t size;
+    token_cursor->curr = grn_token_get_data_raw(ctx, current_token, &size);
+    token_cursor->curr_size = size;
+  }
 
-  return current_token.status;
+  return grn_token_get_status(ctx, current_token);
 }
 
 grn_id
@@ -199,15 +207,33 @@ grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor)
   int status;
   grn_id tid = GRN_ID_NIL;
   grn_obj *table = token_cursor->table;
-  grn_obj *tokenizer = token_cursor->tokenizer;
+  grn_obj *tokenizer = token_cursor->tokenizer.object;
+  grn_tokenizer_query *query = &(token_cursor->tokenizer.query);
+  grn_token *current_token = &(token_cursor->tokenizer.current_token);
+  void *user_data = token_cursor->tokenizer.user_data;
   while (token_cursor->status != GRN_TOKEN_CURSOR_DONE) {
     if (tokenizer) {
-      grn_obj *curr_, *stat_;
-      ((grn_proc *)tokenizer)->funcs[PROC_NEXT](ctx, 1, &table, &token_cursor->pctx.user_data);
-      stat_ = grn_ctx_pop(ctx);
-      curr_ = grn_ctx_pop(ctx);
-      status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor,
-                                                         curr_, stat_);
+      grn_proc *tokenizer_proc = (grn_proc *)tokenizer;
+      if (tokenizer_proc->callbacks.tokenizer.next) {
+        tokenizer_proc->callbacks.tokenizer.next(ctx,
+                                                 query,
+                                                 current_token,
+                                                 user_data);
+      } else if (tokenizer_proc->funcs[PROC_NEXT]) {
+        grn_obj *data, *status;
+        tokenizer_proc->funcs[PROC_NEXT](ctx,
+                                         1,
+                                         &table,
+                                         &token_cursor->tokenizer.pctx.user_data);
+        status = grn_ctx_pop(ctx);
+        data = grn_ctx_pop(ctx);
+        grn_token_set_data(ctx,
+                           current_token,
+                           GRN_TEXT_VALUE(data),
+                           GRN_TEXT_LEN(data));
+        grn_token_set_status(ctx, current_token, GRN_UINT32_VALUE(status));
+      }
+      status = grn_token_cursor_next_apply_token_filters(ctx, token_cursor);
       token_cursor->status =
         ((status & GRN_TOKEN_LAST) ||
          (token_cursor->mode == GRN_TOKENIZE_GET &&
@@ -236,7 +262,7 @@ grn_token_cursor_next(grn_ctx *ctx, grn_token_cursor *token_cursor)
           char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
           int tokenizer_name_length;
           tokenizer_name_length =
-            grn_obj_name(ctx, token_cursor->tokenizer,
+            grn_obj_name(ctx, token_cursor->tokenizer.object,
                          tokenizer_name, GRN_TABLE_MAX_KEY_SIZE);
           GRN_LOG(ctx, GRN_WARN,
                   "[token_next] ignore an empty token: <%.*s>: <%.*s>",
@@ -371,14 +397,22 @@ grn_rc
 grn_token_cursor_close(grn_ctx *ctx, grn_token_cursor *token_cursor)
 {
   if (token_cursor) {
-    if (token_cursor->tokenizer) {
-      ((grn_proc *)token_cursor->tokenizer)->funcs[PROC_FIN](ctx, 1, &token_cursor->table,
-                                                             &token_cursor->pctx.user_data);
+    if (token_cursor->tokenizer.object) {
+      grn_proc *tokenizer_proc = (grn_proc *)(token_cursor->tokenizer.object);
+      if (tokenizer_proc->callbacks.tokenizer.fin) {
+        void *user_data = token_cursor->tokenizer.user_data;
+        tokenizer_proc->callbacks.tokenizer.fin(ctx, user_data);
+      } else if (tokenizer_proc->funcs[PROC_FIN]) {
+        tokenizer_proc->funcs[PROC_FIN](ctx,
+                                        1,
+                                        &token_cursor->table,
+                                        &token_cursor->tokenizer.pctx.user_data);
+      }
     }
+    grn_token_fin(ctx, &(token_cursor->tokenizer.current_token));
+    grn_token_fin(ctx, &(token_cursor->tokenizer.next_token));
+    grn_tokenizer_query_fin(ctx, &(token_cursor->tokenizer.query));
     grn_token_cursor_close_token_filters(ctx, token_cursor);
-    if (token_cursor->nstr) {
-      grn_obj_close(ctx, token_cursor->nstr);
-    }
     GRN_FREE(token_cursor);
     return GRN_SUCCESS;
   } else {
-------------- next part --------------
An HTML attachment was scrubbed...
URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180509/d500b793/attachment-0001.htm 



More information about the Groonga-commit mailing list
Back to archive index