Kouhei Sutou
null+****@clear*****
Mon Sep 10 12:10:05 JST 2018
Kouhei Sutou 2018-09-10 12:10:05 +0900 (Mon, 10 Sep 2018) Revision: f1a9779f66a113410aaf26c49a02fabe3b662f7c https://github.com/groonga/groonga/commit/f1a9779f66a113410aaf26c49a02fabe3b662f7c Message: tokenize table_tokenize: support outputting metadata Modified files: lib/proc/proc_tokenize.c Modified: lib/proc/proc_tokenize.c (+109 -5) =================================================================== --- lib/proc/proc_tokenize.c 2018-09-10 12:09:48 +0900 (7094979ac) +++ lib/proc/proc_tokenize.c 2018-09-10 12:10:05 +0900 (60f570bed) @@ -1,6 +1,7 @@ /* -*- c-basic-offset: 2 -*- */ /* Copyright(C) 2009-2018 Brazil + Copyright(C) 2018 Kouhei Sutou <kou at clear-code.com> This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -67,9 +68,33 @@ typedef struct { uint64_t source_offset; uint32_t source_length; uint32_t source_first_character_length; + grn_obj metadata; } tokenize_token; static void +init_tokens(grn_ctx *ctx, + grn_obj *tokens) +{ + GRN_VALUE_FIX_SIZE_INIT(tokens, GRN_OBJ_VECTOR, GRN_ID_NIL); +} + +static void +fin_tokens(grn_ctx *ctx, + grn_obj *tokens) +{ + int i; + int n_tokens; + + n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token); + for (i = 0; i < n_tokens; i++) { + tokenize_token *token; + token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i; + GRN_OBJ_FIN(ctx, &(token->metadata)); + } + GRN_OBJ_FIN(ctx, tokens); +} + +static void output_tokens(grn_ctx *ctx, grn_obj *tokens, grn_obj *lexicon, @@ -78,6 +103,7 @@ output_tokens(grn_ctx *ctx, int i, n_tokens, n_elements; grn_obj estimated_size; grn_bool have_source_location = GRN_FALSE; + grn_bool have_metadata = GRN_FALSE; n_tokens = GRN_BULK_VSIZE(tokens) / sizeof(tokenize_token); n_elements = 3; @@ -90,12 +116,17 @@ output_tokens(grn_ctx *ctx, token = ((tokenize_token *)(GRN_BULK_HEAD(tokens))) + i; if (token->source_offset > 0 || token->source_length > 0) { have_source_location = GRN_TRUE; - break; + } + if
(grn_vector_size(ctx, &(token->metadata)) > 0) { + have_metadata = GRN_TRUE; } } if (have_source_location) { n_elements += 3; } + if (have_metadata) { + n_elements += 1; + } grn_ctx_output_array_open(ctx, "TOKENS", n_tokens); for (i = 0; i < n_tokens; i++) { @@ -136,6 +167,45 @@ output_tokens(grn_ctx *ctx, grn_ctx_output_uint32(ctx, token->source_first_character_length); } + if (have_metadata) { + size_t i; + size_t n_metadata; + grn_obj value; + + n_metadata = grn_vector_size(ctx, &(token->metadata)) / 2; + GRN_VOID_INIT(&value); + grn_ctx_output_cstr(ctx, "metadata"); + grn_ctx_output_map_open(ctx, "METADATA", n_metadata); + for (i = 0; i < n_metadata; i++) { + const char *raw_name; + unsigned int raw_name_length; + const char *raw_value; + unsigned int raw_value_length; + grn_id value_domain; + + raw_name_length = grn_vector_get_element(ctx, + &(token->metadata), + i * 2, + &raw_name, + NULL, + NULL); + grn_ctx_output_str(ctx, raw_name, raw_name_length); + + raw_value_length = grn_vector_get_element(ctx, + &(token->metadata), + i * 2 + 1, + &raw_value, + NULL, + &value_domain); + grn_obj_reinit(ctx, &value, value_domain, 0); + grn_bulk_write(ctx, &value, raw_value, raw_value_length); + grn_ctx_output_obj(ctx, &value, NULL); + } + grn_ctx_output_map_close(ctx); + + GRN_OBJ_FIN(ctx, &value); + } + grn_ctx_output_map_close(ctx); } @@ -171,6 +241,7 @@ tokenize(grn_ctx *ctx, grn_id token_id = grn_token_cursor_next(ctx, token_cursor); grn_token *token; tokenize_token *current_token; + if (token_id == GRN_ID_NIL) { continue; } @@ -184,6 +255,38 @@ tokenize(grn_ctx *ctx, current_token->source_length = grn_token_get_source_length(ctx, token); current_token->source_first_character_length = grn_token_get_source_first_character_length(ctx, token); + + { + grn_obj *metadata; + size_t n_metadata; + size_t i; + grn_obj name; + grn_obj value; + + GRN_TEXT_INIT(&(current_token->metadata), GRN_OBJ_VECTOR); + metadata = grn_token_get_metadata(ctx, token); + n_metadata = 
grn_token_metadata_get_size(ctx, metadata); + GRN_TEXT_INIT(&name, 0); + GRN_VOID_INIT(&value); + for (i = 0; i < n_metadata; i++) { + grn_token_metadata_at(ctx, metadata, i, &name, &value); + if (GRN_TEXT_LEN(&name) == 0) { + continue; + } + grn_vector_add_element(ctx, + &(current_token->metadata), + GRN_BULK_HEAD(&name), + GRN_BULK_VSIZE(&name), + 0, + name.header.domain); + grn_vector_add_element(ctx, + &(current_token->metadata), + GRN_BULK_HEAD(&value), + GRN_BULK_VSIZE(&value), + 0, + value.header.domain); + } + } } grn_token_cursor_close(ctx, token_cursor); } @@ -269,7 +372,7 @@ command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *u { grn_obj tokens; - GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL); + init_tokens(ctx, &tokens); if (mode_raw.length == 0 || GRN_RAW_STRING_EQUAL_CSTRING(mode_raw, "GET")) { tokenize(ctx, lexicon, &string_raw, GRN_TOKEN_GET, flags, &tokens); @@ -283,7 +386,7 @@ command_table_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *u (int)mode_raw.length, mode_raw.value); } - GRN_OBJ_FIN(ctx, &tokens); + fin_tokens(ctx, &tokens); } #undef MODE_NAME_EQUAL @@ -378,7 +481,8 @@ command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_da { grn_obj tokens; - GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL); + init_tokens(ctx, &tokens); + fin_tokens(ctx, &tokens); if (mode_raw.length == 0 || GRN_RAW_STRING_EQUAL_CSTRING(mode_raw, "ADD")) { tokenize(ctx, lexicon, &string_raw, GRN_TOKEN_ADD, flags, &tokens); @@ -395,7 +499,7 @@ command_tokenize(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_da (int)mode_raw.length, mode_raw.value); } - GRN_OBJ_FIN(ctx, &tokens); + fin_tokens(ctx, &tokens); } #undef MODE_NAME_EQUAL -------------- next part -------------- An HTML attachment was scrubbed... URL: https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20180910/395d567f/attachment-0001.htm