tracker-0.16.2/src/libtracker-fts/tracker-fts-tokenizer.c

No issues found

  1 /*
  2  * Copyright (C) 2011 Nokia <ivan.frade@nokia.com>
  3  *
  4  * Author: Carlos Garnacho <carlos@lanedo.com>
  5  *
  6  * This library is free software; you can redistribute it and/or
  7  * modify it under the terms of the GNU Lesser General Public
  8  * License as published by the Free Software Foundation; either
  9  * version 2.1 of the License, or (at your option) any later version.
 10  *
 11  * This library is distributed in the hope that it will be useful,
 12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14  * Lesser General Public License for more details.
 15  *
 16  * You should have received a copy of the GNU Lesser General Public
 17  * License along with this library; if not, write to the Free Software
 18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 19  * 02110-1301  USA
 20  */
 21 
 22 /* FTS3/4 Tokenizer using TrackerParser */
 23 
 24 #include <assert.h>
 25 #include <string.h>
 26 #include "tracker-fts-tokenizer.h"
 27 #include "tracker-fts-config.h"
 28 #include "tracker-parser.h"
 29 #include "fts3_tokenizer.h"
 30 
 31 typedef struct TrackerTokenizer TrackerTokenizer;
 32 typedef struct TrackerCursor TrackerCursor;
 33 
 34 struct TrackerTokenizer {
 35   sqlite3_tokenizer base;
 36   TrackerLanguage *language;
 37   int max_word_length;
 38   int max_words;
 39   gboolean enable_stemmer;
 40   gboolean enable_unaccent;
 41   gboolean ignore_numbers;
 42   gboolean ignore_stop_words;
 43 };
 44 
 45 struct TrackerCursor {
 46   sqlite3_tokenizer_cursor base;
 47 
 48   TrackerTokenizer *tokenizer;
 49   TrackerParser *parser;
 50   guint n_words;
 51 };
 52 
 53 /*
 54 ** Create a new tokenizer instance.
 55 */
 56 static int trackerCreate(
 57   int argc,                            /* Number of entries in argv[] */
 58   const char * const *argv,            /* Tokenizer creation arguments */
 59   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
 60 ){
 61   TrackerTokenizer *p;
 62   TrackerFTSConfig *config;
 63 
 64   p = (TrackerTokenizer *)sqlite3_malloc(sizeof(TrackerTokenizer));
 65   if( !p ){
 66     return SQLITE_NOMEM;
 67   }
 68   memset(p, 0, sizeof(TrackerTokenizer));
 69   p->language = tracker_language_new (NULL);
 70 
 71   config = tracker_fts_config_new ();
 72 
 73   p->max_word_length = tracker_fts_config_get_max_word_length (config);
 74   p->enable_stemmer = tracker_fts_config_get_enable_stemmer (config);
 75   p->enable_unaccent = tracker_fts_config_get_enable_unaccent (config);
 76   p->ignore_numbers = tracker_fts_config_get_ignore_numbers (config);
 77 
 78   /* disable stop words if TRACKER_FTS_STOP_WORDS is set to 0 - used by tests
 79    *  otherwise, get value from the conf file */
 80   p->ignore_stop_words = (g_strcmp0 (g_getenv ("TRACKER_FTS_STOP_WORDS"), "0") == 0 ?
 81                           FALSE : tracker_fts_config_get_ignore_stop_words (config));
 82 
 83   p->max_words = tracker_fts_config_get_max_words_to_index (config);
 84 
 85   g_object_unref (config);
 86 
 87   *ppTokenizer = (sqlite3_tokenizer *)p;
 88 
 89   return SQLITE_OK;
 90 }
 91 
 92 /*
 93 ** Destroy a tokenizer
 94 */
 95 static int trackerDestroy(sqlite3_tokenizer *pTokenizer){
 96   TrackerTokenizer *p = (TrackerTokenizer *)pTokenizer;
 97   g_object_unref (p->language);
 98   sqlite3_free(p);
 99   return SQLITE_OK;
100 }
101 
102 /*
103 ** Prepare to begin tokenizing a particular string.  The input
104 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
105 ** used to incrementally tokenize this string is returned in 
106 ** *ppCursor.
107 */
108 static int trackerOpen(
109   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
110   const char *zInput,                    /* Input string */
111   int nInput,                            /* Length of zInput in bytes */
112   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
113 ){
114   TrackerTokenizer *p = (TrackerTokenizer *)pTokenizer;
115   TrackerParser *parser;
116   TrackerCursor *pCsr;
117 
118   if ( nInput<0 ){
119     nInput = strlen(zInput);
120   }
121 
122   parser = tracker_parser_new (p->language);
123   tracker_parser_reset (parser, zInput, nInput,
124 			p->max_word_length,
125 			p->enable_stemmer,
126 			p->enable_unaccent,
127 			p->ignore_stop_words,
128 			TRUE,
129 			p->ignore_numbers);
130 
131   pCsr = (TrackerCursor *)sqlite3_malloc(sizeof(TrackerCursor));
132   memset(pCsr, 0, sizeof(TrackerCursor));
133   pCsr->tokenizer = p;
134   pCsr->parser = parser;
135 
136   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
137   return SQLITE_OK;
138 }
139 
140 /*
141 ** Close a tokenization cursor.
142 */
143 static int trackerClose(sqlite3_tokenizer_cursor *pCursor){
144   TrackerCursor *pCsr = (TrackerCursor *)pCursor;
145 
146   tracker_parser_free (pCsr->parser);
147   sqlite3_free(pCsr);
148   return SQLITE_OK;
149 }
150 
151 /*
152 ** Extract the next token from a tokenization cursor.
153 */
154 static int trackerNext(
155   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
156   const char **ppToken,               /* OUT: *ppToken is the token text */
157   int *pnBytes,                       /* OUT: Number of bytes in token */
158   int *piStartOffset,                 /* OUT: Starting offset of token */
159   int *piEndOffset,                   /* OUT: Ending offset of token */
160   int *piPosition                     /* OUT: Position integer of token */
161 ){
162   TrackerCursor *cursor = (TrackerCursor *) pCursor;
163   TrackerTokenizer *p;
164   const gchar *pToken;
165   gboolean stop_word;
166   int pos, start, end, len;
167 
168   p  = cursor->tokenizer;
169 
170   if (cursor->n_words > p->max_words){
171     return SQLITE_DONE;
172   }
173 
174   do {
175     pToken = tracker_parser_next (cursor->parser,
176 				  &pos,
177 				  &start, &end,
178 				  &stop_word,
179 				  &len);
180 
181     if (!pToken){
182       return SQLITE_DONE;
183     }
184   } while (stop_word && p->ignore_stop_words);
185 
186   *ppToken = pToken;
187   *piStartOffset = start;
188   *piEndOffset = end;
189   *piPosition = pos;
190   *pnBytes = len;
191 
192   cursor->n_words++;
193 
194   return SQLITE_OK;
195 }
196 
197 /*
198 ** The set of routines that implement the simple tokenizer
199 */
200 static const sqlite3_tokenizer_module trackerTokenizerModule = {
201   0,                           /* iVersion */
202   trackerCreate,               /* xCreate  */
203   trackerDestroy,              /* xDestroy */
204   trackerOpen,                 /* xOpen    */
205   trackerClose,                /* xClose   */
206   trackerNext,                 /* xNext    */
207 };
208 
209 /*
210 ** Set *ppModule to point at the implementation of the tracker tokenizer.
211 */
212 gboolean tracker_tokenizer_initialize (sqlite3 *db) {
213   const sqlite3_tokenizer_module *pTokenizer;
214   int rc = SQLITE_OK;
215   sqlite3_stmt *stmt;
216 
217   pTokenizer = &trackerTokenizerModule;
218   rc = sqlite3_prepare_v2(db, "SELECT fts3_tokenizer(?, ?)",
219                           -1, &stmt, 0);
220 
221   if (rc != SQLITE_OK) {
222 	  return FALSE;
223   }
224 
225   sqlite3_bind_text(stmt, 1, "TrackerTokenizer", -1, SQLITE_STATIC);
226   sqlite3_bind_blob(stmt, 2, &pTokenizer, sizeof(pTokenizer), SQLITE_STATIC);
227   sqlite3_step(stmt);
228   rc = sqlite3_finalize(stmt);
229 
230   return (rc == SQLITE_OK);
231 }