1 /*
2 * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
3 * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18 * 02110-1301 USA
19 */
20
21 #include "config.h"
22
23 #include <stdio.h>
24 #include <string.h>
25
26 /* libunistring versions prior to 9.1.2 need this hack */
27 #define _UNUSED_PARAMETER_
28 #include <unistr.h>
29 #include <uniwbrk.h>
30 #include <unictype.h>
31 #include <unicase.h>
32
33 #include "tracker-parser.h"
34 #include "tracker-parser-utils.h"
35
36 /* Type of words detected */
37 typedef enum {
38 TRACKER_PARSER_WORD_TYPE_ASCII,
39 TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
40 TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
41 } TrackerParserWordType;
42
43 /* Max possible length of a UTF-8 encoded string (just a safety limit) */
44 #define WORD_BUFFER_LENGTH 512
45
46 struct TrackerParser {
47 const gchar *txt;
48 gint txt_size;
49
50 TrackerLanguage *language;
51 guint max_word_length;
52 gboolean enable_stemmer;
53 gboolean enable_unaccent;
54 gboolean ignore_stop_words;
55 gboolean ignore_reserved_words;
56 gboolean ignore_numbers;
57 gboolean enable_forced_wordbreaks;
58
59 /* Private members */
60 gchar *word;
61 gint word_length;
62 guint word_position;
63
64 /* Cursor, as index of the input array of bytes */
65 gsize cursor;
66 /* libunistring flags array */
67 gchar *word_break_flags;
68 /* general category of the start character in words */
69 uc_general_category_t allowed_start;
70 };
71
72 static gboolean
73 get_word_info (TrackerParser *parser,
74 gsize *p_word_length,
75 gboolean *p_is_allowed_word_start,
76 TrackerParserWordType *p_word_type)
77 {
78 ucs4_t first_unichar;
79 gint first_unichar_len;
80 gboolean ascii_only;
81
82 /* Defaults */
83 *p_is_allowed_word_start = TRUE;
84
85 /* Get first character of the word as UCS4 */
86 first_unichar_len = u8_strmbtouc (&first_unichar,
87 &(parser->txt[parser->cursor]));
pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
(emitted by gcc)
88 if (first_unichar_len <= 0) {
89 /* This should only happen if NIL was passed to u8_strmbtouc,
90 * so better just force stop here */
91 return FALSE;
92 } else {
93 /* If first character has length 1, it's ASCII-7 */
94 ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
95 }
96
97 /* Consider word starts with a forced wordbreak */
98 if (parser->enable_forced_wordbreaks &&
99 IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) {
100 *p_word_length = first_unichar_len;
101 } else {
102 gsize i;
103
104 /* Find next word break, and in the same loop checking if only ASCII
105 * characters */
106 i = parser->cursor + first_unichar_len;
107 while (1) {
108 /* Text bounds reached? */
109 if (i >= parser->txt_size)
110 break;
111 /* Proper unicode word break detected? */
112 if (parser->word_break_flags[i])
113 break;
114 /* Forced word break detected? */
115 if (parser->enable_forced_wordbreaks &&
116 IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i]))
117 break;
118
119 if (ascii_only &&
120 !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
121 ascii_only = FALSE;
122 }
123
124 i++;
125 }
126
127 /* Word end is the first byte after the word, which is either the
128 * start of next word or the end of the string */
129 *p_word_length = i - parser->cursor;
130 }
131
132 /* We only want the words where the first character
133 * in the word is either a letter, a number or a symbol.
134 * This is needed because the word break algorithm also
135 * considers word breaks after for example commas or other
136 * punctuation marks.
137 * Note that looking at the first character in the string
138 * should be compatible with all Unicode normalization
139 * methods.
140 */
141 if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
142 !uc_is_general_category (first_unichar,
143 parser->allowed_start)) {
144 *p_is_allowed_word_start = FALSE;
145 return TRUE;
146 }
147
148 /* Decide word type */
149 if (ascii_only) {
150 *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
151 } else if (IS_CJK_UCS4 (first_unichar)) {
152 *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
153 } else {
154 *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
155 }
156 return TRUE;
157 }
158
159 static gboolean
160 parser_unaccent_nfkd_word (gchar *word,
161 gsize *word_length)
162 {
163 /* The input word in this method MUST be normalized in NFKD form */
164 gsize i;
165 gsize j;
166
167 g_return_val_if_fail (word, FALSE);
168 g_return_val_if_fail (word_length, FALSE);
169 g_return_val_if_fail (*word_length > 0, FALSE);
170
171 i = 0;
172 j = 0;
173 while (i < *word_length) {
174 ucs4_t unichar;
175 gint utf8_len;
176
177 /* Get next character of the word as UCS4 */
178 utf8_len = u8_strmbtouc (&unichar, &word[i]);
pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
(emitted by gcc)
179
180 /* Invalid UTF-8 character or end of original string. */
181 if (utf8_len <= 0) {
182 break;
183 }
184
185 /* If the given unichar is a combining diacritical mark,
186 * just update the original index, not the output one */
187 if (IS_CDM_UCS4 ((guint32) unichar)) {
188 i += utf8_len;
189 continue;
190 }
191
192 /* If already found a previous combining
193 * diacritical mark, indexes are different so
194 * need to copy characters. As output and input
195 * buffers may overlap, need to use memmove
196 * instead of memcpy */
197 if (i != j) {
198 memmove (&word[j], &word[i], utf8_len);
199 }
200
201 /* Update both indexes */
202 i += utf8_len;
203 j += utf8_len;
204 }
205
206 /* Force proper string end */
207 word[j] = '\0';
208
209 /* Set new output length */
210 *word_length = j;
211
212 return TRUE;
213 }
214
215 static gchar *
216 process_word_utf8 (TrackerParser *parser,
217 const gchar *word,
218 gint length,
219 TrackerParserWordType type,
220 gboolean *stop_word)
221 {
222 gchar word_buffer [WORD_BUFFER_LENGTH];
223 gchar *normalized = NULL;
224 gchar *stemmed = NULL;
225 size_t new_word_length;
226
227 g_return_val_if_fail (parser != NULL, NULL);
228 g_return_val_if_fail (word != NULL, NULL);
229
230 /* If length is set as -1, the input word MUST be NIL-terminated.
231 * Otherwise, this restriction is not needed as the length to process
232 * is given as input argument */
233 if (length < 0) {
234 length = strlen (word);
235 }
236
237 /* Log original word */
238 tracker_parser_message_hex ("ORIGINAL word",
239 word, length);
240
241 /* Normalization and case-folding ONLY for non-ASCII */
242 if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
243 /* Leave space for last NIL */
244 new_word_length = WORD_BUFFER_LENGTH - 1;
245
246 /* Casefold and NFKD normalization in output.
247 * NOTE: if the output buffer is not big enough, u8_casefold will
248 * return a newly-allocated buffer. */
249 normalized = u8_casefold ((const uint8_t *)word,
250 length,
251 uc_locale_language (),
252 UNINORM_NFKD,
253 word_buffer,
254 &new_word_length);
pointer targets in passing argument 5 of 'u8_casefold' differ in signedness
(emitted by gcc)
255
256 /* Case folding + Normalization failed, ignore this word */
257 g_return_val_if_fail (normalized != NULL, NULL);
258
259 /* If output buffer is not the same as the one passed to
260 * u8_casefold, we know it was newly-allocated, so need
261 * to resize it in 1 byte to add last NIL */
262 if (normalized != word_buffer) {
263 normalized = g_realloc (normalized, new_word_length + 1);
264 }
265
266 /* Log after Normalization */
267 tracker_parser_message_hex (" After Casefolding and NFKD normalization",
268 normalized, new_word_length);
269 } else {
270 /* For ASCII-only, just tolower() each character */
271 gsize i;
272
273 normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
274
275 for (i = 0; i < length; i++) {
276 normalized[i] = g_ascii_tolower (word[i]);
277 }
278
279 new_word_length = length;
280
281 /* Log after tolower */
282 tracker_parser_message_hex (" After Lowercasing",
283 normalized, new_word_length);
284 }
285
286 /* Set output NIL */
287 normalized[new_word_length] = '\0';
288
289 /* UNAC stripping needed? (for non-CJK and non-ASCII) */
290 if (parser->enable_unaccent &&
291 type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
292 parser_unaccent_nfkd_word (normalized, &new_word_length)) {
293 /* Log after UNAC stripping */
294 tracker_parser_message_hex (" After UNAC stripping",
295 normalized, new_word_length);
296 }
297
298 /* Check if stop word */
299 if (parser->ignore_stop_words) {
300 *stop_word = tracker_language_is_stop_word (parser->language,
301 normalized);
302 }
303
304 /* Stemming needed? */
305 if (parser->enable_stemmer) {
306 stemmed = tracker_language_stem_word (parser->language,
307 normalized,
308 new_word_length);
309
310 /* Log after stemming */
311 tracker_parser_message_hex (" After stemming",
312 stemmed, strlen (stemmed));
313 }
314
315 /* If stemmed wanted and succeeded, free previous and return it */
316 if (stemmed) {
317 if (normalized != word_buffer) {
318 g_free (normalized);
319 }
320 return stemmed;
321 }
322
323 /* It may be the case that no stripping and no stemming was needed, and
324 * that the output buffer in stack was enough for case-folding and
325 * normalization. In this case, need to strdup() the string to return it */
326 return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
327 }
328
329 static gboolean
330 parser_next (TrackerParser *parser,
331 gint *byte_offset_start,
332 gint *byte_offset_end,
333 gboolean *stop_word)
334 {
335 gsize word_length = 0;
336 gchar *processed_word = NULL;
337
338 *byte_offset_start = 0;
339 *byte_offset_end = 0;
340
341 g_return_val_if_fail (parser, FALSE);
342
343 /* Loop to look for next valid word */
344 while (!processed_word &&
345 parser->cursor < parser->txt_size) {
346 TrackerParserWordType type;
347 gsize truncated_length;
348 gboolean is_allowed;
349
350 /* Get word info */
351 if (!get_word_info (parser,
352 &word_length,
353 &is_allowed,
354 &type)) {
355 /* Quit loop just in case */
356 parser->cursor = parser->txt_size;
357 break;
358 }
359
360 /* Ignore the word if not an allowed word start */
361 if (!is_allowed) {
362 /* Ignore this word and keep on looping */
363 parser->cursor += word_length;
364 continue;
365 }
366
367 /* Ignore the word if longer than the maximum allowed */
368 if (word_length >= parser->max_word_length) {
369 /* Ignore this word and keep on looping */
370 parser->cursor += word_length;
371 continue;
372 }
373
374 /* check if word is reserved and ignore it if so */
375 if (parser->ignore_reserved_words &&
376 tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
377 word_length)) {
378 /* Ignore this word and keep on looping */
379 parser->cursor += word_length;
380 continue;
381 }
382
383 /* compute truncated word length if needed (to avoid extremely
384 * long words)*/
385 truncated_length = (word_length < WORD_BUFFER_LENGTH ?
386 word_length :
387 WORD_BUFFER_LENGTH - 1);
388
389 /* Process the word here. If it fails, we can still go
390 * to the next one. Returns newly allocated string
391 * always */
392 processed_word = process_word_utf8 (parser,
393 &(parser->txt[parser->cursor]),
394 truncated_length,
395 type,
396 stop_word);
397 if (!processed_word) {
398 /* Ignore this word and keep on looping */
399 parser->cursor += word_length;
400 continue;
401 }
402 }
403
404 /* If we got a word here, set output */
405 if (processed_word) {
406 /* Set outputs */
407 *byte_offset_start = parser->cursor;
408 *byte_offset_end = parser->cursor + word_length;
409
410 /* Update cursor */
411 parser->cursor += word_length;
412
413 parser->word_length = strlen (processed_word);
414 parser->word = processed_word;
415
416 return TRUE;
417 }
418
419 /* No more words... */
420 return FALSE;
421 }
422
423 TrackerParser *
424 tracker_parser_new (TrackerLanguage *language)
425 {
426 TrackerParser *parser;
427
428 g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
429
430 parser = g_new0 (TrackerParser, 1);
431
432 parser->language = g_object_ref (language);
433
434 return parser;
435 }
436
437 void
438 tracker_parser_free (TrackerParser *parser)
439 {
440 g_return_if_fail (parser != NULL);
441
442 if (parser->language) {
443 g_object_unref (parser->language);
444 }
445
446 g_free (parser->word_break_flags);
447
448 g_free (parser->word);
449
450 g_free (parser);
451 }
452
453 void
454 tracker_parser_reset (TrackerParser *parser,
455 const gchar *txt,
456 gint txt_size,
457 guint max_word_length,
458 gboolean enable_stemmer,
459 gboolean enable_unaccent,
460 gboolean ignore_stop_words,
461 gboolean ignore_reserved_words,
462 gboolean ignore_numbers)
463 {
464 g_return_if_fail (parser != NULL);
465 g_return_if_fail (txt != NULL);
466
467 parser->max_word_length = max_word_length;
468 parser->enable_stemmer = enable_stemmer;
469 parser->enable_unaccent = enable_unaccent;
470 parser->ignore_stop_words = ignore_stop_words;
471 parser->ignore_reserved_words = ignore_reserved_words;
472 parser->ignore_numbers = ignore_numbers;
473
474 /* Note: We're forcing some unicode characters to behave
475 * as wordbreakers: e.g, the '.' The main reason for this
476 * is to enable FTS searches matching file extension. */
477 parser->enable_forced_wordbreaks = TRUE;
478
479 parser->txt_size = txt_size;
480 parser->txt = txt;
481
482 g_free (parser->word);
483 parser->word = NULL;
484
485 parser->word_position = 0;
486
487 parser->cursor = 0;
488
489 g_free (parser->word_break_flags);
490
491 /* Create array of flags, same size as original text. */
492 parser->word_break_flags = g_malloc (txt_size);
493
494 /* Get wordbreak flags in the whole string */
495 u8_wordbreaks ((const uint8_t *)txt,
496 (size_t) txt_size,
497 (char *)parser->word_break_flags);
498
499 /* Prepare a custom category which is a combination of the
500 * desired ones */
501 parser->allowed_start = UC_LETTER;
502 if (!parser->ignore_numbers) {
503 parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
504 }
505 }
506
507 const gchar *
508 tracker_parser_next (TrackerParser *parser,
509 gint *position,
510 gint *byte_offset_start,
511 gint *byte_offset_end,
512 gboolean *stop_word,
513 gint *word_length)
514 {
515 const gchar *str;
516 gint byte_start = 0, byte_end = 0;
517
518 str = NULL;
519
520 g_free (parser->word);
521 parser->word = NULL;
522
523 *stop_word = FALSE;
524
525 if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
526 str = parser->word;
527 }
528
529 if (!*stop_word) {
530 parser->word_position++;
531 }
532
533 *word_length = parser->word_length;
534 *position = parser->word_position;
535 *byte_offset_start = byte_start;
536 *byte_offset_end = byte_end;
537
538 return str;
539 }