[Groonga-commit] groonga/groonga at 852039c [ii-regexp-support-dot-asterisk] ii: support index search for "XXX.*YYY" regular expression search

Back to archive index

Kouhei Sutou null+****@clear*****
Tue Feb 21 19:25:41 JST 2017


Kouhei Sutou	2017-02-21 19:25:41 +0900 (Tue, 21 Feb 2017)

  New Revision: 852039c0339219f3037a5a63900d76ccbbeb784d
  https://github.com/groonga/groonga/commit/852039c0339219f3037a5a63900d76ccbbeb784d

  Message:
    ii: support index search for "XXX.*YYY" regular expression search
    
    It doesn't work yet.

  Added files:
    test/command/suite/select/filter/index/regexp/dot_asterisk.test
  Modified files:
    lib/expr.c
    lib/ii.c

  Modified: lib/expr.c (+20 -1)
===================================================================
--- lib/expr.c    2017-02-21 14:47:23 +0900 (1d6e319)
+++ lib/expr.c    2017-02-21 19:25:41 +0900 (f0d4ae4)
@@ -4379,6 +4379,7 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp)
   const char *regexp_raw;
   const char *regexp_raw_end;
   grn_bool escaping = GRN_FALSE;
+  grn_bool dot = GRN_FALSE;
 
   if (!(regexp->header.domain == GRN_DB_SHORT_TEXT ||
         regexp->header.domain == GRN_DB_TEXT ||
@@ -4432,12 +4433,24 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp)
       } else {
         switch (regexp_raw[0]) {
         case '.' :
+          escaping = GRN_FALSE;
+          if (dot) {
+            return GRN_FALSE;
+          }
+          dot = GRN_TRUE;
+          break;
+        case '*' :
+          escaping = GRN_FALSE;
+          if (!dot) {
+            return GRN_FALSE;
+          }
+          dot = GRN_FALSE;
+          break;
         case '[' :
         case ']' :
         case '|' :
         case '?' :
         case '+' :
-        case '*' :
         case '{' :
         case '}' :
         case '^' :
@@ -4447,9 +4460,15 @@ is_index_searchable_regexp(grn_ctx *ctx, grn_obj *regexp)
           escaping = GRN_FALSE;
           return GRN_FALSE;
         case '\\' :
+          if (dot) {
+            return GRN_FALSE;
+          }
           escaping = GRN_TRUE;
           break;
         default :
+          if (dot) {
+            return GRN_FALSE;
+          }
           escaping = GRN_FALSE;
           break;
         }

  Modified: lib/ii.c (+408 -18)
===================================================================
--- lib/ii.c    2017-02-21 14:47:23 +0900 (eca9b0c)
+++ lib/ii.c    2017-02-21 19:25:41 +0900 (66f1c34)
@@ -1,5 +1,6 @@
 /* -*- c-basic-offset: 2 -*- */
-/* Copyright(C) 2009-2016 Brazil
+/*
+  Copyright(C) 2009-2017 Brazil
 
   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
@@ -7634,23 +7635,321 @@ grn_ii_term_extract(grn_ctx *ctx, grn_ii *ii, const char *string,
   return rc;
 }
 
+typedef struct {
+  btr *bt;
+  grn_ii *ii;
+  token_info **tis;
+  uint32_t n_tis;
+  int max_interval;
+  grn_operator mode;
+  grn_posting posting;
+  const char *string;
+  unsigned int string_len;
+  grn_bool done;
+} grn_ii_select_cursor;
+
+grn_rc
+grn_ii_select_cursor_close(grn_ctx *ctx,
+                           grn_ii_select_cursor *cursor)
+{
+  token_info **tip;
+
+  if (!cursor) {
+    return GRN_SUCCESS;
+  }
+
+  for (tip = cursor->tis; tip < cursor->tis + cursor->n_tis; tip++) {
+    if (*tip) {
+      token_info_close(ctx, *tip);
+    }
+  }
+  if (cursor->tis) {
+    GRN_FREE(cursor->tis);
+  }
+  bt_close(ctx, cursor->bt);
+  GRN_FREE(cursor);
+
+  return GRN_SUCCESS;
+}
+
+grn_ii_select_cursor *
+grn_ii_select_cursor_open(grn_ctx *ctx,
+                          grn_ii *ii,
+                          const char *string,
+                          unsigned int string_len,
+                          grn_select_optarg *optarg)
+{
+  grn_operator mode = GRN_OP_EXACT;
+  grn_ii_select_cursor *cursor;
+
+  if (string_len == 0) {
+    ERR(GRN_INVALID_ARGUMENT,
+        "[ii][select][cursor][open] empty string");
+    return NULL;
+  }
+
+  if (optarg) {
+    mode = optarg->mode;
+  }
+  switch (mode) {
+  case GRN_OP_EXACT :
+  case GRN_OP_FUZZY :
+  case GRN_OP_NEAR :
+  case GRN_OP_NEAR2 :
+    break;
+  default :
+    ERR(GRN_INVALID_ARGUMENT,
+        "[ii][select][cursor][open] "
+        "EXACT, FUZZY, NEAR and NEAR2 are only supported mode: %s",
+        grn_operator_to_string(mode));
+    break;
+  }
+
+  cursor = GRN_CALLOC(sizeof(grn_ii_select_cursor));
+  if (!cursor) {
+    ERR(ctx->rc,
+        "[ii][select][cursor][open] failed to allocate cursor: %s",
+        ctx->errbuf);
+    return NULL;
+  }
+
+  cursor->ii = ii;
+  cursor->mode = mode;
+
+  if (!(cursor->tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) {
+    ERR(ctx->rc,
+        "[ii][select][cursor][open] failed to allocate token info container: %s",
+        ctx->errbuf);
+    GRN_FREE(cursor);
+    return NULL;
+  }
+  cursor->n_tis = 0;
+  if (cursor->mode == GRN_OP_FUZZY) {
+    grn_bool only_skip_token = GRN_FALSE;
+    grn_id previous_min = GRN_ID_NIL;
+    if (token_info_build_fuzzy(ctx, ii->lexicon, ii, string, string_len,
+                               cursor->tis, &(cursor->n_tis),
+                               &only_skip_token, previous_min,
+                               cursor->mode, &(optarg->fuzzy)) != GRN_SUCCESS) {
+      grn_ii_select_cursor_close(ctx, cursor);
+      return NULL;
+    }
+  } else {
+    grn_bool only_skip_token = GRN_FALSE;
+    grn_id previous_min = GRN_ID_NIL;
+    if (token_info_build(ctx, ii->lexicon, ii, string, string_len,
+                         cursor->tis, &(cursor->n_tis),
+                         &only_skip_token, previous_min,
+                         cursor->mode) != GRN_SUCCESS) {
+      grn_ii_select_cursor_close(ctx, cursor);
+      return NULL;
+    }
+  }
+  if (cursor->n_tis == 0) {
+    grn_ii_select_cursor_close(ctx, cursor);
+    return NULL;
+  }
+
+  switch (cursor->mode) {
+  case GRN_OP_NEAR2 :
+    token_info_clear_offset(cursor->tis, cursor->n_tis);
+    cursor->mode = GRN_OP_NEAR;
+    /* fallthru */
+  case GRN_OP_NEAR :
+    if (!(cursor->bt = bt_open(ctx, cursor->n_tis))) {
+      ERR(ctx->rc,
+          "[ii][select][cursor][open] failed to allocate btree: %s",
+          ctx->errbuf);
+      grn_ii_select_cursor_close(ctx, cursor);
+      return NULL;
+    }
+    cursor->max_interval = optarg->max_interval;
+    break;
+  default :
+    break;
+  }
+  qsort(cursor->tis, cursor->n_tis, sizeof(token_info *), token_compare);
+  GRN_LOG(ctx, GRN_LOG_INFO,
+          "[ii][select][cursor][open] n=%d <%.*s>",
+          cursor->n_tis,
+          string_len, string);
+
+  cursor->posting.rest = 0;
+
+  cursor->string = string;
+  cursor->string_len = string_len;
+
+  cursor->done = GRN_FALSE;
+
+  return cursor;
+}
+
+grn_posting *
+grn_ii_select_cursor_next(grn_ctx *ctx,
+                          grn_ii_select_cursor *cursor)
+{
+  btr *bt = cursor->bt;
+  token_info **tis = cursor->tis;
+  token_info **tie = tis + cursor->n_tis;
+  uint32_t n_tis = cursor->n_tis;
+  int max_interval = cursor->max_interval;
+  grn_operator mode = cursor->mode;
+
+  if (cursor->done) {
+    return NULL;
+  }
+
+  for (;;) {
+    grn_id rid;
+    grn_id sid;
+    grn_id next_rid;
+    grn_id next_sid;
+    token_info **tip;
+
+    rid = (*tis)->p->rid;
+    sid = (*tis)->p->sid;
+    for (tip = tis + 1, next_rid = rid, next_sid = sid + 1;
+         tip < tie;
+         tip++) {
+      token_info *ti = *tip;
+      if (token_info_skip(ctx, ti, rid, sid)) { return NULL; }
+      if (ti->p->rid != rid || ti->p->sid != sid) {
+        next_rid = ti->p->rid;
+        next_sid = ti->p->sid;
+        break;
+      }
+    }
+
+    if (tip == tie) {
+      int n_occurs = 0;
+      int pos = 0;
+      int score = 0;
+      int tscore = 0;
+
+#define SKIP_OR_BREAK(pos) {\
+  if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; }    \
+  if (ti->p->rid != rid || ti->p->sid != sid) { \
+    next_rid = ti->p->rid; \
+    next_sid = ti->p->sid; \
+    break; \
+  } \
+}
+      if (n_tis == 1) {
+        n_occurs = (*tis)->p->tf;
+        tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight;
+        pos = (*tis)->p->pos;
+      } else if (mode == GRN_OP_NEAR) {
+        bt_zap(bt);
+        for (tip = tis; tip < tie; tip++) {
+          token_info *ti = *tip;
+          SKIP_OR_BREAK(pos);
+          bt_push(bt, ti);
+        }
+        if (tip == tie) {
+          for (;;) {
+            token_info *ti;
+            int min;
+            int max;
+
+            ti = bt->min;
+            min = ti->pos;
+            max = bt->max->pos;
+            if (min > max) {
+              char ii_name[GRN_TABLE_MAX_KEY_SIZE];
+              int ii_name_size;
+              ii_name_size = grn_obj_name(ctx,
+                                          (grn_obj *)(cursor->ii),
+                                          ii_name,
+                                          GRN_TABLE_MAX_KEY_SIZE);
+              ERR(GRN_FILE_CORRUPT,
+                  "[ii][select][cursor][near] "
+                  "max position must be larger than min position: "
+                  "min:<%d> max:<%d> ii:<%.*s> string:<%.*s>",
+                  min, max,
+                  ii_name_size, ii_name,
+                  cursor->string_len,
+                  cursor->string);
+              return NULL;
+            }
+            if ((max_interval < 0) || (max - min <= max_interval)) {
+              n_occurs++;
+              if (ti->pos == max + 1) {
+                break;
+              }
+              SKIP_OR_BREAK(max + 1);
+            } else {
+              if (ti->pos == max - max_interval) {
+                break;
+              }
+              SKIP_OR_BREAK(max - max_interval);
+            }
+            bt_pop(bt);
+          }
+        }
+      } else {
+        int count = 0;
+        for (tip = tis; ; tip++) {
+          token_info *ti;
+
+          if (tip == tie) { tip = tis; }
+          ti = *tip;
+          SKIP_OR_BREAK(pos);
+          if (ti->pos == pos) {
+            score += ti->p->weight + ti->cursors->bins[0]->weight;
+            count++;
+          } else {
+            score = ti->p->weight + ti->cursors->bins[0]->weight;
+            count = 1;
+            pos = ti->pos;
+          }
+          if (count == n_tis) {
+            tscore += score;
+            score = 0;
+            count = 0;
+            pos++;
+            n_occurs++;
+          }
+        }
+      }
+      if (n_occurs > 0) {
+        cursor->posting.rid = rid;
+        cursor->posting.sid = sid;
+        cursor->posting.pos = pos;
+        cursor->posting.tf = n_occurs;
+        cursor->posting.weight = tscore;
+        if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) {
+          cursor->done = GRN_TRUE;
+        }
+        return &(cursor->posting);
+      }
+#undef SKIP_OR_BREAK
+    }
+    if (token_info_skip(ctx, *tis, next_rid, next_sid)) {
+      return NULL;
+    }
+  }
+}
+
 static grn_rc
 grn_ii_parse_regexp_query(grn_ctx *ctx,
                           const char *log_tag,
                           const char *string, unsigned int string_len,
-                          grn_obj *parsed_string)
+                          grn_obj *parsed_strings)
 {
   grn_bool escaping = GRN_FALSE;
   int nth_char = 0;
   const char *current = string;
   const char *string_end = string + string_len;
+  grn_obj buffer;
 
+  GRN_TEXT_INIT(&buffer, 0);
   while (current < string_end) {
     const char *target;
     int char_len;
 
     char_len = grn_charlen(ctx, current, string_end);
     if (char_len == 0) {
+      GRN_OBJ_FIN(ctx, &buffer);
       ERR(GRN_INVALID_ARGUMENT,
           "%s invalid encoding character: <%.*s|%#x|>",
           log_tag,
@@ -7682,15 +7981,40 @@ grn_ii_parse_regexp_query(grn_ctx *ctx,
         }
       }
     } else {
-      if (char_len == 1 && *target == '\\') {
-        escaping = GRN_TRUE;
-        continue;
+      if (char_len == 1) {
+        if (*target == '\\') {
+          escaping = GRN_TRUE;
+          continue;
+        } else if (*target == '.' &&
+                   grn_charlen(ctx, current, string_end) == 1 &&
+                   *current == '*') {
+          if (GRN_TEXT_LEN(&buffer) > 0) {
+            grn_vector_add_element(ctx,
+                                   parsed_strings,
+                                   GRN_TEXT_VALUE(&buffer),
+                                   GRN_TEXT_LEN(&buffer),
+                                   0,
+                                   GRN_DB_TEXT);
+            GRN_BULK_REWIND(&buffer);
+          }
+          nth_char++;
+          continue;
+        }
       }
     }
 
-    GRN_TEXT_PUT(ctx, parsed_string, target, char_len);
+    GRN_TEXT_PUT(ctx, &buffer, target, char_len);
     nth_char++;
   }
+  if (GRN_TEXT_LEN(&buffer) > 0) {
+    grn_vector_add_element(ctx,
+                           parsed_strings,
+                           GRN_TEXT_VALUE(&buffer),
+                           GRN_TEXT_LEN(&buffer),
+                           0,
+                           GRN_DB_TEXT);
+  }
+  GRN_OBJ_FIN(ctx, &buffer);
 
   return GRN_SUCCESS;
 }
@@ -7701,25 +8025,91 @@ grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii,
                      grn_hash *s, grn_operator op, grn_select_optarg *optarg)
 {
   grn_rc rc;
-  grn_obj parsed_string;
+  grn_obj parsed_strings;
+  unsigned int n_parsed_strings;
 
-  GRN_TEXT_INIT(&parsed_string, 0);
+  GRN_TEXT_INIT(&parsed_strings, GRN_OBJ_VECTOR);
   rc = grn_ii_parse_regexp_query(ctx, "[ii][select][regexp]",
-                                 string, string_len, &parsed_string);
+                                 string, string_len, &parsed_strings);
   if (rc != GRN_SUCCESS) {
-    GRN_OBJ_FIN(ctx, &parsed_string);
+    GRN_OBJ_FIN(ctx, &parsed_strings);
     return rc;
   }
 
   if (optarg) {
-    optarg->mode = GRN_OP_MATCH;
-  }
+    optarg->mode = GRN_OP_EXACT;
+  }
+
+  n_parsed_strings = grn_vector_size(ctx, &parsed_strings);
+  if (n_parsed_strings == 1) {
+    const char *parsed_string;
+    unsigned int parsed_string_len;
+    parsed_string_len = grn_vector_get_element(ctx,
+                                               &parsed_strings,
+                                               0,
+                                               &parsed_string,
+                                               NULL,
+                                               NULL);
+    rc = grn_ii_select(ctx, ii,
+                       parsed_string,
+                       parsed_string_len,
+                       s, op, optarg);
+  } else {
+    int i;
+    grn_ii_select_cursor **cursors;
+
+    cursors = GRN_MALLOC(sizeof(grn_ii_select_cursor *) * n_parsed_strings);
+    for (i = 0; i < n_parsed_strings; i++) {
+      const char *parsed_string;
+      unsigned int parsed_string_len;
+      parsed_string_len = grn_vector_get_element(ctx,
+                                                 &parsed_strings,
+                                                 0,
+                                                 &parsed_string,
+                                                 NULL,
+                                                 NULL);
+      cursors[i] = grn_ii_select_cursor_open(ctx,
+                                             ii,
+                                             parsed_string,
+                                             parsed_string_len,
+                                             optarg);
+    }
 
-  rc = grn_ii_select(ctx, ii,
-                     GRN_TEXT_VALUE(&parsed_string),
-                     GRN_TEXT_LEN(&parsed_string),
-                     s, op, optarg);
-  GRN_OBJ_FIN(ctx, &parsed_string);
+    for (;;) {
+      grn_posting *posting;
+      uint32_t pos;
+
+      posting = grn_ii_select_cursor_next(ctx, cursors[0]);
+      if (!posting) {
+        break;
+      }
+
+      pos = posting->pos;
+      for (i = 1; i < n_parsed_strings; i++) {
+        grn_posting *posting_i;
+
+        while ((posting_i = grn_ii_select_cursor_next(ctx, cursors[i]))) {
+          if (posting_i->rid == posting->rid &&
+              posting_i->sid == posting->sid &&
+              posting_i->pos > pos) {
+            break;
+          }
+          if (posting_i->rid > posting->rid) {
+            break;
+          }
+        }
+
+        if (posting_i->rid != posting->rid || posting_i->sid != posting->sid) {
+          break;
+        }
+      }
+
+      if (i == n_parsed_strings) {
+        
+      }
+    }
+  }
+  GRN_OBJ_FIN(ctx, &parsed_strings);
 
   if (optarg) {
     optarg->mode = GRN_OP_REGEXP;
@@ -8270,7 +8660,7 @@ grn_ii_estimate_size_for_query_regexp(grn_ctx *ctx, grn_ii *ii,
   }
 
   if (optarg) {
-    optarg->mode = GRN_OP_MATCH;
+    optarg->mode = GRN_OP_EXACT;
   }
 
   size = grn_ii_estimate_size_for_query(ctx, ii,

  Added: test/command/suite/select/filter/index/regexp/dot_asterisk.test (+21 -0) 100644
===================================================================
--- /dev/null
+++ test/command/suite/select/filter/index/regexp/dot_asterisk.test    2017-02-21 19:25:41 +0900 (a85e2c0)
@@ -0,0 +1,21 @@
+table_create Memos TABLE_NO_KEY
+column_create Memos content COLUMN_SCALAR Text
+
+table_create RegexpTokens TABLE_PAT_KEY ShortText \
+  --normalizer NormalizerAuto \
+  --default_tokenizer TokenRegexp
+column_create RegexpTokens memos_content COLUMN_INDEX|WITH_POSITION \
+  Memos content
+
+load --table Memos
+[
+{"content": "Groonga"},
+{"content": "Rroonga"},
+{"content": "PGroonga"}
+]
+
+log_level --level info
+#@add-important-log-levels info
+select Memos --filter 'content @~ "g.*ga"'
+#@remove-important-log-levels info
+log_level --level notice
-------------- next part --------------
HTML����������������������������...
Descargar 



More information about the Groonga-commit mailing list
Back to archive index