[Groonga-commit] groonga/groonga at 5f42bd9 [master] tokenizers regexp: skip the last one character token

Back to archive index

Kouhei Sutou null+****@clear*****
Thu Apr 2 13:03:17 JST 2015


Kouhei Sutou	2015-04-02 13:03:17 +0900 (Thu, 02 Apr 2015)

  New Revision: 5f42bd915b68925aff9f367febe94c5e14cd4b42
  https://github.com/groonga/groonga/commit/5f42bd915b68925aff9f367febe94c5e14cd4b42

  Message:
    tokenizers regexp: skip the last one character token

  Modified files:
    lib/tokenizers.c

  Modified: lib/tokenizers.c (+26 -6)
===================================================================
--- lib/tokenizers.c    2015-04-02 12:26:25 +0900 (1c07c52)
+++ lib/tokenizers.c    2015-04-02 13:03:17 +0900 (ea85cc6)
@@ -478,6 +478,7 @@ typedef struct {
   } get;
   grn_bool is_begin;
   grn_bool is_end;
+  grn_bool is_first_token;
   grn_bool is_overlapping;
   const char *next;
   const char *end;
@@ -515,6 +516,7 @@ regexp_init(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
 
   tokenizer->is_begin = GRN_TRUE;
   tokenizer->is_end   = GRN_FALSE;
+  tokenizer->is_first_token = GRN_TRUE;
   tokenizer->is_overlapping = GRN_FALSE;
 
   grn_string_get_normalized(ctx, tokenizer->query->normalized_query,
@@ -659,15 +661,31 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
   }
   tokenizer->is_overlapping = (n_characters > 1);
 
-  if (tokenizer->next == end) {
-    tokenizer->is_end = GRN_TRUE;
-    if (mode == GRN_TOKEN_GET) {
-      if (!tokenizer->get.have_end) {
+  if (mode == GRN_TOKEN_GET) {
+    if ((end - tokenizer->next) < ngram_unit) {
+      if (tokenizer->get.have_end) {
+        if (tokenizer->next == end) {
+          tokenizer->is_end = GRN_TRUE;
+        }
+        if (status & GRN_TOKEN_UNMATURED) {
+          if (tokenizer->is_first_token) {
+            status |= GRN_TOKEN_FORCE_PREFIX;
+          } else {
+            status |= GRN_TOKEN_SKIP;
+          }
+        }
+      } else {
+        tokenizer->is_end = GRN_TRUE;
         status |= GRN_TOKEN_LAST | GRN_TOKEN_REACH_END;
-      } else if (status & GRN_TOKEN_UNMATURED) {
-        status |= GRN_TOKEN_FORCE_PREFIX;
+        if (status & GRN_TOKEN_UNMATURED) {
+          status |= GRN_TOKEN_FORCE_PREFIX;
+        }
       }
     }
+  } else {
+    if (tokenizer->next == end) {
+      tokenizer->is_end = GRN_TRUE;
+    }
   }
 
   grn_tokenizer_token_push(ctx,
@@ -675,6 +693,8 @@ regexp_next(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
                            GRN_TEXT_VALUE(buffer),
                            GRN_TEXT_LEN(buffer),
                            status);
+  tokenizer->is_first_token = GRN_FALSE;
+
   return NULL;
 }
 
-------------- next part --------------
HTMLの添付ファイルを保管しました...
Descargar 



More information about the Groonga-commit mailing list
Back to archive index