Kouhei Sutou
null+****@clear*****
Thu Oct 2 23:04:36 JST 2014
Kouhei Sutou 2014-10-02 23:04:36 +0900 (Thu, 02 Oct 2014) New Revision: e681cca8f838a91c92008f3d6136dd563197aa42 https://github.com/groonga/groonga/commit/e681cca8f838a91c92008f3d6136dd563197aa42 Message: Ignore query that all tokens are skipped Is it a good approach? Added files: test/command/suite/token_filters/stop_word/skip.expected test/command/suite/token_filters/stop_word/skip.test Modified files: lib/db.c lib/ii.c lib/token.c lib/token.h Modified: lib/db.c (+1 -1) =================================================================== --- lib/db.c 2014-10-02 23:03:13 +0900 (2f1f8e0) +++ lib/db.c 2014-10-02 23:04:36 +0900 (0011676) @@ -9872,7 +9872,7 @@ grn_table_tokenize(grn_ctx *ctx, grn_obj *table, goto exit; } } - while (token->status != GRN_TOKEN_DONE) { + while (token->status != GRN_TOKEN_DONE && token->status != GRN_TOKEN_DONE_SKIP) { grn_id tid; if ((tid = grn_token_next(ctx, token))) { GRN_RECORD_PUT(ctx, buf, tid); Modified: lib/ii.c (+14 -4) =================================================================== --- lib/ii.c 2014-10-02 23:03:13 +0900 (67e5f28) +++ lib/ii.c 2014-10-02 23:04:36 +0900 (4f83a79) @@ -5355,7 +5355,8 @@ token_compare(const void *a, const void *b) inline static grn_rc token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, unsigned int string_len, - token_info **tis, uint32_t *n, grn_operator mode) + token_info **tis, uint32_t *n, grn_bool *only_skip_token, + grn_operator mode) { token_info *ti; const char *key; @@ -5364,6 +5365,7 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, unsigned int token_flags = GRN_TOKEN_ENABLE_TOKENIZED_DELIMITER; grn_token *token = grn_token_open(ctx, lexicon, string, string_len, GRN_TOKEN_GET, token_flags); + *only_skip_token = GRN_FALSE; if (!token) { return GRN_NO_MEMORY_AVAILABLE; } if (mode == GRN_OP_UNSPLIT) { if ((ti = token_info_open(ctx, lexicon, ii, (char *)token->orig, token->orig_blen, 0, EX_BOTH))) { @@ -5408,6 +5410,9 @@ 
token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, ti = token_info_open(ctx, lexicon, ii, (char *)token->orig, token->orig_blen, 0, ef); break; + case GRN_TOKEN_DONE_SKIP : + *only_skip_token = GRN_TRUE; + goto exit; default : goto exit; } @@ -5416,6 +5421,8 @@ token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, while (token->status == GRN_TOKEN_DOING) { tid = grn_token_next(ctx, token); switch (token->status) { + case GRN_TOKEN_DONE_SKIP : + continue; case GRN_TOKEN_DOING : key = _grn_table_key(ctx, lexicon, tid, &size); ti = token_info_open(ctx, lexicon, ii, key, size, token->pos, EX_NONE); @@ -5659,7 +5666,7 @@ grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii, return GRN_NO_MEMORY_AVAILABLE; } if (!(max_size = optarg->max_size)) { max_size = 1048576; } - while (token->status != GRN_TOKEN_DONE) { + while (token->status != GRN_TOKEN_DONE && token->status != GRN_TOKEN_DONE_SKIP) { if ((tid = grn_token_next(ctx, token))) { if (grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&w1, NULL)) { (*w1)++; } } @@ -5858,6 +5865,7 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ int rep, orp, weight, max_interval = 0; token_info *ti, **tis = NULL, **tip, **tie; uint32_t n = 0, rid, sid, nrid, nsid; + grn_bool only_skip_token = GRN_FALSE; grn_operator mode = GRN_OP_EXACT; grn_wv_mode wvm = grn_wv_none; grn_obj *lexicon = ii->lexicon; @@ -5886,7 +5894,7 @@ grn_ii_select(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_ if (!(tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) { return GRN_NO_MEMORY_AVAILABLE; } - if (token_info_build(ctx, lexicon, ii, string, string_len, tis, &n, mode) || !n) { goto exit; } + if (token_info_build(ctx, lexicon, ii, string, string_len, tis, &n, &only_skip_token, mode) || !n) { goto exit; } switch (mode) { case GRN_OP_NEAR2 : token_info_clear_offset(tis, n); @@ -6012,7 +6020,9 @@ exit : if (*tip) { token_info_close(ctx, *tip); } } 
if (tis) { GRN_FREE(tis); } - grn_ii_resolve_sel_and(ctx, s, op); + if (!only_skip_token) { + grn_ii_resolve_sel_and(ctx, s, op); + } // grn_hash_cursor_clear(r); bt_close(ctx, bt); #ifdef DEBUG Modified: lib/token.c (+13 -5) =================================================================== --- lib/token.c 2014-10-02 23:03:13 +0900 (618f852) +++ lib/token.c 2014-10-02 23:04:36 +0900 (25eca07) @@ -680,12 +680,20 @@ grn_token_next(grn_ctx *ctx, grn_token *token) (status & GRN_TOKENIZER_TOKEN_REACH_END))) ? GRN_TOKEN_DONE : GRN_TOKEN_DOING; token->force_prefix = 0; - if (status & GRN_TOKENIZER_TOKEN_SKIP) { - token->pos++; - continue; - } else if (status & GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION) { - continue; +#define SKIP_FLAGS \ + (GRN_TOKENIZER_TOKEN_SKIP | GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION) + if (status & SKIP_FLAGS) { + if (status & GRN_TOKENIZER_TOKEN_SKIP) { + token->pos++; + } + if (token->status == GRN_TOKEN_DONE && tid == GRN_ID_NIL) { + token->status = GRN_TOKEN_DONE_SKIP; + break; + } else { + continue; + } } +#undef SKIP_FLAGS if (token->curr_size == 0) { char tokenizer_name[GRN_TABLE_MAX_KEY_SIZE]; int tokenizer_name_length; Modified: lib/token.h (+1 -0) =================================================================== --- lib/token.h 2014-10-02 23:03:13 +0900 (69b8352) +++ lib/token.h 2014-10-02 23:04:36 +0900 (938597e) @@ -42,6 +42,7 @@ extern "C" { typedef enum { GRN_TOKEN_DOING = 0, GRN_TOKEN_DONE, + GRN_TOKEN_DONE_SKIP, GRN_TOKEN_NOT_FOUND } grn_token_status; Added: test/command/suite/token_filters/stop_word/skip.expected (+57 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/token_filters/stop_word/skip.expected 2014-10-02 23:04:36 +0900 (5c389ee) @@ -0,0 +1,57 @@ +register token_filters/stop_word +[[0,0.0,0.0],true] +table_create Memos TABLE_NO_KEY +[[0,0.0,0.0],true] +column_create Memos content COLUMN_SCALAR ShortText +[[0,0.0,0.0],true] +table_create Terms TABLE_PAT_KEY 
ShortText --default_tokenizer TokenBigram --normalizer NormalizerAuto --token_filters TokenFilterStopWord +[[0,0.0,0.0],true] +column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content +[[0,0.0,0.0],true] +column_create Terms stop_word COLUMN_SCALAR Bool +[[0,0.0,0.0],true] +load --table Terms +[ +{"_key": "and", "stop_word": true} +] +[[0,0.0,0.0],1] +load --table Memos +[ +{"content": "Hello"}, +{"content": "Hello and Good-bye"}, +{"content": "Good-bye"} +] +[[0,0.0,0.0],3] +select Memos --match_columns content --query "Hello and" +[ + [ + 0, + 0.0, + 0.0 + ], + [ + [ + [ + 2 + ], + [ + [ + "_id", + "UInt32" + ], + [ + "content", + "ShortText" + ] + ], + [ + 1, + "Hello" + ], + [ + 2, + "Hello and Good-bye" + ] + ] + ] +] Added: test/command/suite/token_filters/stop_word/skip.test (+25 -0) 100644 =================================================================== --- /dev/null +++ test/command/suite/token_filters/stop_word/skip.test 2014-10-02 23:04:36 +0900 (e3ab1f6) +register token_filters/stop_word + +table_create Memos TABLE_NO_KEY +column_create Memos content COLUMN_SCALAR ShortText + +table_create Terms TABLE_PAT_KEY ShortText \ + --default_tokenizer TokenBigram \ + --normalizer NormalizerAuto \ + --token_filters TokenFilterStopWord +column_create Terms memos_content COLUMN_INDEX|WITH_POSITION Memos content +column_create Terms stop_word COLUMN_SCALAR Bool + +load --table Terms +[ +{"_key": "and", "stop_word": true} +] + +load --table Memos +[ +{"content": "Hello"}, +{"content": "Hello and Good-bye"}, +{"content": "Good-bye"} +] + +select Memos --match_columns content --query "Hello and" -------------- next part -------------- An HTML attachment was scrubbed... (original mailing-list footer was mis-encoded; download link removed)