No issues found
1 /*
2 * Copyright (C) 2011 Nokia <ivan.frade@nokia.com>
3 *
4 * Author: Carlos Garnacho <carlos@lanedo.com>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 * 02110-1301 USA
20 */
21
22 /* FTS3/4 Tokenizer using TrackerParser */
23
24 #include <assert.h>
25 #include <string.h>
26 #include "tracker-fts-tokenizer.h"
27 #include "tracker-fts-config.h"
28 #include "tracker-parser.h"
29 #include "fts3_tokenizer.h"
30
31 typedef struct TrackerTokenizer TrackerTokenizer;
32 typedef struct TrackerCursor TrackerCursor;
33
34 struct TrackerTokenizer {
35 sqlite3_tokenizer base;
36 TrackerLanguage *language;
37 int max_word_length;
38 int max_words;
39 gboolean enable_stemmer;
40 gboolean enable_unaccent;
41 gboolean ignore_numbers;
42 gboolean ignore_stop_words;
43 };
44
45 struct TrackerCursor {
46 sqlite3_tokenizer_cursor base;
47
48 TrackerTokenizer *tokenizer;
49 TrackerParser *parser;
50 guint n_words;
51 };
52
53 /*
54 ** Create a new tokenizer instance.
55 */
56 static int trackerCreate(
57 int argc, /* Number of entries in argv[] */
58 const char * const *argv, /* Tokenizer creation arguments */
59 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
60 ){
61 TrackerTokenizer *p;
62 TrackerFTSConfig *config;
63
64 p = (TrackerTokenizer *)sqlite3_malloc(sizeof(TrackerTokenizer));
65 if( !p ){
66 return SQLITE_NOMEM;
67 }
68 memset(p, 0, sizeof(TrackerTokenizer));
69 p->language = tracker_language_new (NULL);
70
71 config = tracker_fts_config_new ();
72
73 p->max_word_length = tracker_fts_config_get_max_word_length (config);
74 p->enable_stemmer = tracker_fts_config_get_enable_stemmer (config);
75 p->enable_unaccent = tracker_fts_config_get_enable_unaccent (config);
76 p->ignore_numbers = tracker_fts_config_get_ignore_numbers (config);
77
78 /* disable stop words if TRACKER_FTS_STOP_WORDS is set to 0 - used by tests
79 * otherwise, get value from the conf file */
80 p->ignore_stop_words = (g_strcmp0 (g_getenv ("TRACKER_FTS_STOP_WORDS"), "0") == 0 ?
81 FALSE : tracker_fts_config_get_ignore_stop_words (config));
82
83 p->max_words = tracker_fts_config_get_max_words_to_index (config);
84
85 g_object_unref (config);
86
87 *ppTokenizer = (sqlite3_tokenizer *)p;
88
89 return SQLITE_OK;
90 }
91
92 /*
93 ** Destroy a tokenizer
94 */
95 static int trackerDestroy(sqlite3_tokenizer *pTokenizer){
96 TrackerTokenizer *p = (TrackerTokenizer *)pTokenizer;
97 g_object_unref (p->language);
98 sqlite3_free(p);
99 return SQLITE_OK;
100 }
101
102 /*
103 ** Prepare to begin tokenizing a particular string. The input
104 ** string to be tokenized is pInput[0..nBytes-1]. A cursor
105 ** used to incrementally tokenize this string is returned in
106 ** *ppCursor.
107 */
108 static int trackerOpen(
109 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
110 const char *zInput, /* Input string */
111 int nInput, /* Length of zInput in bytes */
112 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
113 ){
114 TrackerTokenizer *p = (TrackerTokenizer *)pTokenizer;
115 TrackerParser *parser;
116 TrackerCursor *pCsr;
117
118 if ( nInput<0 ){
119 nInput = strlen(zInput);
120 }
121
122 parser = tracker_parser_new (p->language);
123 tracker_parser_reset (parser, zInput, nInput,
124 p->max_word_length,
125 p->enable_stemmer,
126 p->enable_unaccent,
127 p->ignore_stop_words,
128 TRUE,
129 p->ignore_numbers);
130
131 pCsr = (TrackerCursor *)sqlite3_malloc(sizeof(TrackerCursor));
132 memset(pCsr, 0, sizeof(TrackerCursor));
133 pCsr->tokenizer = p;
134 pCsr->parser = parser;
135
136 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
137 return SQLITE_OK;
138 }
139
140 /*
141 ** Close a tokenization cursor.
142 */
143 static int trackerClose(sqlite3_tokenizer_cursor *pCursor){
144 TrackerCursor *pCsr = (TrackerCursor *)pCursor;
145
146 tracker_parser_free (pCsr->parser);
147 sqlite3_free(pCsr);
148 return SQLITE_OK;
149 }
150
151 /*
152 ** Extract the next token from a tokenization cursor.
153 */
154 static int trackerNext(
155 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
156 const char **ppToken, /* OUT: *ppToken is the token text */
157 int *pnBytes, /* OUT: Number of bytes in token */
158 int *piStartOffset, /* OUT: Starting offset of token */
159 int *piEndOffset, /* OUT: Ending offset of token */
160 int *piPosition /* OUT: Position integer of token */
161 ){
162 TrackerCursor *cursor = (TrackerCursor *) pCursor;
163 TrackerTokenizer *p;
164 const gchar *pToken;
165 gboolean stop_word;
166 int pos, start, end, len;
167
168 p = cursor->tokenizer;
169
170 if (cursor->n_words > p->max_words){
171 return SQLITE_DONE;
172 }
173
174 do {
175 pToken = tracker_parser_next (cursor->parser,
176 &pos,
177 &start, &end,
178 &stop_word,
179 &len);
180
181 if (!pToken){
182 return SQLITE_DONE;
183 }
184 } while (stop_word && p->ignore_stop_words);
185
186 *ppToken = pToken;
187 *piStartOffset = start;
188 *piEndOffset = end;
189 *piPosition = pos;
190 *pnBytes = len;
191
192 cursor->n_words++;
193
194 return SQLITE_OK;
195 }
196
197 /*
198 ** The set of routines that implement the simple tokenizer
199 */
200 static const sqlite3_tokenizer_module trackerTokenizerModule = {
201 0, /* iVersion */
202 trackerCreate, /* xCreate */
203 trackerDestroy, /* xDestroy */
204 trackerOpen, /* xOpen */
205 trackerClose, /* xClose */
206 trackerNext, /* xNext */
207 };
208
209 /*
210 ** Set *ppModule to point at the implementation of the tracker tokenizer.
211 */
212 gboolean tracker_tokenizer_initialize (sqlite3 *db) {
213 const sqlite3_tokenizer_module *pTokenizer;
214 int rc = SQLITE_OK;
215 sqlite3_stmt *stmt;
216
217 pTokenizer = &trackerTokenizerModule;
218 rc = sqlite3_prepare_v2(db, "SELECT fts3_tokenizer(?, ?)",
219 -1, &stmt, 0);
220
221 if (rc != SQLITE_OK) {
222 return FALSE;
223 }
224
225 sqlite3_bind_text(stmt, 1, "TrackerTokenizer", -1, SQLITE_STATIC);
226 sqlite3_bind_blob(stmt, 2, &pTokenizer, sizeof(pTokenizer), SQLITE_STATIC);
227 sqlite3_step(stmt);
228 rc = sqlite3_finalize(stmt);
229
230 return (rc == SQLITE_OK);
231 }