[Groonga-commit] groonga/groonga at e681cca [support-token-filters] Ignore a query whose tokens are all skipped

Kouhei Sutou null+****@clear*****
Thu Oct 2 23:04:36 JST 2014


Kouhei Sutou	2014-10-02 23:04:36 +0900 (Thu, 02 Oct 2014)

  New Revision: e681cca8f838a91c92008f3d6136dd563197aa42
  https://github.com/groonga/groonga/commit/e681cca8f838a91c92008f3d6136dd563197aa42

  Message:
    Ignore a query whose tokens are all skipped
    
    Is it a good approach?

  Added files:
    test/command/suite/token_filters/stop_word/skip.expected
    test/command/suite/token_filters/stop_word/skip.test
  Modified files:
    lib/db.c
    lib/ii.c
    lib/token.c
    lib/token.h
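
Taken together, the hunks below work like this: lib/token.h adds a GRN_TOKEN_DONE_SKIP token status; lib/token.c makes grn_token_next() report it when the token stream ends on skipped tokens without producing a token ID; the loops in lib/db.c and lib/ii.c treat it as a terminal status alongside GRN_TOKEN_DONE; and grn_ii_select() now skips grn_ii_resolve_sel_and() when every token in the query was skipped, so such a query is ignored instead of being ANDed into the current result set.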

  Modified: lib/db.c (+1 -1)
===================================================================
--- lib/db.c    2014-10-02 23:03:13 +0900 (2f1f8e0)
+++ lib/db.c    2014-10-02 23:04:36 +0900 (0011676)
@@ -9872,7 +9872,7 @@ grn_table_tokenize(grn_ctx *ctx, grn_obj *table,
       goto exit;
     }
   }
-  while (token->status != GRN_TOKEN_DONE) {
+  while (token->status != GRN_TOKEN_DONE && token->status != GRN_TOKEN_DONE_SKIP) {
     grn_id tid;
     if ((tid = grn_token_next(ctx, token))) {
       GRN_RECORD_PUT(ctx, buf, tid);
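
For context, a minimal sketch of the consumer pattern this hunk establishes: every loop driving grn_token_next() now has to treat GRN_TOKEN_DONE_SKIP as a terminal status alongside GRN_TOKEN_DONE. The helper name drain_token_ids is hypothetical; the types and macros are the ones used in the hunk.

    /* Drain all token IDs from an open grn_token stream into buf.
     * GRN_TOKEN_DONE_SKIP also ends the stream: it means the last
     * token(s) were skipped, e.g. by a stop-word token filter. */
    static void
    drain_token_ids(grn_ctx *ctx, grn_token *token, grn_obj *buf)
    {
      while (token->status != GRN_TOKEN_DONE &&
             token->status != GRN_TOKEN_DONE_SKIP) {
        grn_id tid = grn_token_next(ctx, token);
        if (tid) {
          GRN_RECORD_PUT(ctx, buf, tid);
        }
      }
    }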

  Modified: lib/ii.c (+14 -4)
===================================================================
--- lib/ii.c    2014-10-02 23:03:13 +0900 (67e5f28)
+++ lib/ii.c    2014-10-02 23:04:36 +0900 (4f83a79)
@@ -5355,7 +5355,8 @@ token_compare(const void *a, const void *b)
 
 inline static grn_rc
 token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, unsigned int string_len,
-                 token_info **tis, uint32_t *n, grn_operator mode)
+                 token_info **tis, uint32_t *n, grn_bool *only_skip_token,
+                 grn_operator mode)
 {
   token_info *ti;
   const char *key;
@@ -5364,6 +5365,7 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string,
   unsigned int token_flags = GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER;
   grn_token *token = grn_token_open(ctx, lexicon, string, string_len,
                                     GRN_TOKEN_GET, token_flags);
+  *only_skip_token = GRN_FALSE;
   if (!token) { return GRN_NO_MEMORY_AVAILABLE; }
   if (mode == GRN_OP_UNSPLIT) {
     if ((ti = token_info_open(ctx, lexicon, ii, (char *)token->orig, token->orig_blen, 0, EX_BOTH))) {
@@ -5408,6 +5410,9 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string,
       ti = token_info_open(ctx, lexicon, ii, (char *)token->orig,
                            token->orig_blen, 0, ef);
       break;
+    case GRN_TOKEN_DONE_SKIP :
+      *only_skip_token = GRN_TRUE;
+      goto exit;
     default :
       goto exit;
     }
@@ -5416,6 +5421,8 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string,
     while (token->status == GRN_TOKEN_DOING) {
       tid = grn_token_next(ctx, token);
       switch (token->status) {
+      case GRN_TOKEN_DONE_SKIP :
+        continue;
       case GRN_TOKEN_DOING :
         key = _grn_table_key(ctx, lexicon, tid, &size);
         ti = token_info_open(ctx, lexicon, ii, key, size, token->pos, EX_NONE);
@@ -5659,7 +5666,7 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii,
     return GRN_NO_MEMORY_AVAILABLE;
   }
   if (!(max_size = optarg->max_size)) { max_size = 1048576; }
-  while (token->status != GRN_TOKEN_DONE) {
+  while (token->status != GRN_TOKEN_DONE && token->status != GRN_TOKEN_DONE_SKIP) {
     if ((tid = grn_token_next(ctx, token))) {
       if (grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&w1, NULL)) { (*w1)++; }
     }
@@ -5858,6 +5865,7 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   int rep, orp, weight, max_interval = 0;
   token_info *ti, **tis = NULL, **tip, **tie;
   uint32_t n = 0, rid, sid, nrid, nsid;
+  grn_bool only_skip_token = GRN_FALSE;
   grn_operator mode = GRN_OP_EXACT;
   grn_wv_mode wvm = grn_wv_none;
   grn_obj *lexicon = ii->lexicon;
@@ -5886,7 +5894,7 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_
   if (!(tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) {
     return GRN_NO_MEMORY_AVAILABLE;
   }
-  if (token_info_build(ctx, lexicon, ii, string, string_len, tis, &n, mode) || !n) { goto exit; }
+  if (token_info_build(ctx, lexicon, ii, string, string_len, tis, &n, &only_skip_token, mode) || !n) { goto exit; }
   switch (mode) {
   case GRN_OP_NEAR2 :
     token_info_clear_offset(tis, n);
@@ -6012,7 +6020,9 @@ exit :
     if (*tip) { token_info_close(ctx, *tip); }
   }
   if (tis) { GRN_FREE(tis); }
-  grn_ii_resolve_sel_and(ctx, s, op);
+  if (!only_skip_token) {
+    grn_ii_resolve_sel_and(ctx, s, op);
+  }
   //  grn_hash_cursor_clear(r);
   bt_close(ctx, bt);
 #ifdef DEBUG
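
Condensed, the new flag's path through grn_ii_select() looks like the skeleton below (heavily elided; names are the ones in the hunks above, and the comments are interpretive):

    static grn_rc
    select_sketch(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii,
                  const char *string, unsigned int string_len,
                  grn_hash *s, grn_operator op, grn_operator mode,
                  token_info **tis)
    {
      grn_bool only_skip_token = GRN_FALSE;
      uint32_t n = 0;
      /* token_info_build() now reports whether the first call to
       * grn_token_next() ended in GRN_TOKEN_DONE_SKIP, i.e. every
       * token in the query string was skipped. */
      if (token_info_build(ctx, lexicon, ii, string, string_len,
                           tis, &n, &only_skip_token, mode) || !n) {
        goto exit;
      }
      /* ... run the actual search ... */
    exit :
      if (!only_skip_token) {
        /* Normal case: resolve the pending AND against result set s.
         * An all-skipped query deliberately skips this step, since
         * ANDing an empty match would empty s. */
        grn_ii_resolve_sel_and(ctx, s, op);
      }
      return ctx->rc;
    }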

  Modified: lib/token.c (+13 -5)
===================================================================
--- lib/token.c    2014-10-02 23:03:13 +0900 (618f852)
+++ lib/token.c    2014-10-02 23:04:36 +0900 (25eca07)
@@ -680,12 +680,20 @@ grn_token_next(grn_ctx *ctx, grn_token *token)
                         (status & GRN_TOKENIZER_TOKEN_REACH_END)))
         ? GRN_TOKEN_DONE : GRN_TOKEN_DOING;
       token->force_prefix = 0;
-      if (status & GRN_TOKENIZER_TOKEN_SKIP) {
-        token->pos++;
-        continue;
-      } else if (status & GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION) {
-        continue;
+#define SKIP_FLAGS \
+      (GRN_TOKENIZER_TOKEN_SKIP | GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION)
+      if (status & SKIP_FLAGS) {
+        if (status & GRN_TOKENIZER_TOKEN_SKIP) {
+          token->pos++;
+        }
+        if (token->status == GRN_TOKEN_DONE && tid == GRN_ID_NIL) {
+          token->status = GRN_TOKEN_DONE_SKIP;
+          break;
+        } else {
+          continue;
+        }
       }
+#undef SKIP_FLAGS
       if (token->curr_size == 0) {
         char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE];
         int tokenizer_name_length;
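
Restated as a sketch: when the tokenizer sets GRN_TOKENIZER_TOKEN_SKIP or GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION, the loop above now does the equivalent of the following. handle_skipped_token is a hypothetical helper; a return value of 1 means "stop the token loop", 0 means "fetch the next token".

    static int
    handle_skipped_token(grn_token *token, unsigned int status, grn_id tid)
    {
      if (status & GRN_TOKENIZER_TOKEN_SKIP) {
        token->pos++;  /* a plain SKIP still consumes a position */
      }
      if (token->status == GRN_TOKEN_DONE && tid == GRN_ID_NIL) {
        /* The tokenizer reached the end and this call never produced
         * a real token ID: the stream ended on skipped tokens only. */
        token->status = GRN_TOKEN_DONE_SKIP;
        return 1;
      }
      return 0;
    }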

  Modified: lib/token.h (+1 -0)
===================================================================
--- lib/token.h    2014-10-02 23:03:13 +0900 (69b8352)
+++ lib/token.h    2014-10-02 23:04:36 +0900 (938597e)
@@ -42,6 +42,7 @@ extern "C" {
 typedef enum {
   GRN_TOKEN_DOING = 0,
   GRN_TOKEN_DONE,
+  GRN_TOKEN_DONE_SKIP,
   GRN_TOKEN_NOT_FOUND
 } grn_token_status;
 

  Added: test/command/suite/token_filters/stop_word/skip.expected (+57 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stop_word/skip.expected    2014-10-02 23:04:36 +0900 (5c389ee)
@@ -0,0 +1,57 @@
+register token_filters/stop_word
+[[0,0.0,0.0],true]
+table_create Memos TABLE_NO_KEY
+[[0,0.0,0.0],true]
+column_create Memos content COLUMN_SCALAR ShortText
+[[0,0.0,0.0],true]
+table_create Terms TABLE_PAT_KEY ShortText   --default_tokenizer TokenBigram   --normalizer NormalizerAuto   --token_filters TokenFilterStopWord
+[[0,0.0,0.0],true]
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+[[0,0.0,0.0],true]
+column_create Terms stop_word COLUMN_SCALAR Bool
+[[0,0.0,0.0],true]
+load --table Terms
+[
+{"_key": "and", "stop_word": true}
+]
+[[0,0.0,0.0],1]
+load --table Memos
+[
+{"content": "Hello"},
+{"content": "Hello and Good-bye"},
+{"content": "Good-bye"}
+]
+[[0,0.0,0.0],3]
+select Memos --match_columns content --query "Hello and"
+[
+  [
+    0,
+    0.0,
+    0.0
+  ],
+  [
+    [
+      [
+        2
+      ],
+      [
+        [
+          "_id",
+          "UInt32"
+        ],
+        [
+          "content",
+          "ShortText"
+        ]
+      ],
+      [
+        1,
+        "Hello"
+      ],
+      [
+        2,
+        "Hello and Good-bye"
+      ]
+    ]
+  ]
+]
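
Reading the expected output: the space-separated query "Hello and" is evaluated term by term, and the "and" term consists only of the registered stop word, so at search time all of its tokens are skipped. token_info_build() then reports only_skip_token, grn_ii_select() leaves the result set built from "Hello" untouched, and records 1 and 2 match.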

  Added: test/command/suite/token_filters/stop_word/skip.test (+25 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/token_filters/stop_word/skip.test    2014-10-02 23:04:36 +0900 (e3ab1f6)
@@ -0,0 +1,25 @@
+register token_filters/stop_word
+
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR ShortText
+
+table_create Terms TABLE_PAT_KEY ShortText \
+  --default_tokenizer TokenBigram \
+  --normalizer NormalizerAuto \
+  --token_filters TokenFilterStopWord
+column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content
+column_create Terms stop_word COLUMN_SCALAR Bool
+
+load --table Terms
+[
+{"_key": "and", "stop_word": true}
+]
+
+load --table Memos
+[
+{"content": "Hello"},
+{"content": "Hello and Good-bye"},
+{"content": "Good-bye"}
+]
+
+select Memos --match_columns content --query "Hello and"