Location	Tool	Test ID	Function	Issue
tracker-parser-libunistring.c:87:36	gcc	pointer-sign	get_word_info	pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
tracker-parser-libunistring.c:178:3	gcc	pointer-sign	parser_unaccent_nfkd_word	pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
tracker-parser-libunistring.c:254:29	gcc	pointer-sign	process_word_utf8	pointer targets in passing argument 5 of 'u8_casefold' differ in signedness
  1 /*
  2  * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
  3  * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
  4  *
  5  * This library is free software; you can redistribute it and/or
  6  * modify it under the terms of the GNU Lesser General Public
  7  * License as published by the Free Software Foundation; either
  8  * version 2.1 of the License, or (at your option) any later version.
  9  *
 10  * This library is distributed in the hope that it will be useful,
 11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13  * Lesser General Public License for more details.
 14  *
 15  * You should have received a copy of the GNU Lesser General Public
 16  * License along with this library; if not, write to the Free Software
 17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 18  * 02110-1301  USA
 19  */
 20 
 21 #include "config.h"
 22 
 23 #include <stdio.h>
 24 #include <string.h>
 25 
 26 /* libunistring versions prior to 9.1.2 need this hack */
 27 #define _UNUSED_PARAMETER_
 28 #include <unistr.h>
 29 #include <uniwbrk.h>
 30 #include <unictype.h>
 31 #include <unicase.h>
 32 
 33 #include "tracker-parser.h"
 34 #include "tracker-parser-utils.h"
 35 
 36 /* Type of words detected */
 37 typedef enum {
 38 	TRACKER_PARSER_WORD_TYPE_ASCII,
 39 	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,
 40 	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,
 41 } TrackerParserWordType;
 42 
 43 /* Max possible length of a UTF-8 encoded string (just a safety limit) */
 44 #define WORD_BUFFER_LENGTH 512
 45 
 46 struct TrackerParser {
 47 	const gchar           *txt;
 48 	gint                   txt_size;
 49 
 50 	TrackerLanguage       *language;
 51 	guint                  max_word_length;
 52 	gboolean               enable_stemmer;
 53 	gboolean               enable_unaccent;
 54 	gboolean               ignore_stop_words;
 55 	gboolean               ignore_reserved_words;
 56 	gboolean               ignore_numbers;
 57 	gboolean               enable_forced_wordbreaks;
 58 
 59 	/* Private members */
 60 	gchar                 *word;
 61 	gint                   word_length;
 62 	guint                  word_position;
 63 
 64 	/* Cursor, as index of the input array of bytes */
 65 	gsize                  cursor;
 66 	/* libunistring flags array */
 67 	gchar                 *word_break_flags;
 68 	/* general category of the  start character in words */
 69 	uc_general_category_t  allowed_start;
 70 };
 71 
 72 static gboolean
 73 get_word_info (TrackerParser         *parser,
 74                gsize                 *p_word_length,
 75                gboolean              *p_is_allowed_word_start,
 76                TrackerParserWordType *p_word_type)
 77 {
 78 	ucs4_t first_unichar;
 79 	gint first_unichar_len;
 80 	gboolean ascii_only;
 81 
 82 	/* Defaults */
 83 	*p_is_allowed_word_start = TRUE;
 84 
 85 	/* Get first character of the word as UCS4 */
 86 	first_unichar_len = u8_strmbtouc (&first_unichar,
 87 	                                  &(parser->txt[parser->cursor]));
   pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
   (emitted by gcc) 88 	if (first_unichar_len <= 0) {
 89 		/* This should only happen if NIL was passed to u8_strmbtouc,
 90 		 *  so better just force stop here */
 91 		return FALSE;
 92 	} else  {
 93 		/* If first character has length 1, it's ASCII-7 */
 94 		ascii_only = first_unichar_len == 1 ? TRUE : FALSE;
 95 	}
 96 
 97 	/* Consider word starts with a forced wordbreak */
 98 	if (parser->enable_forced_wordbreaks &&
 99 	    IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) {
100 		*p_word_length = first_unichar_len;
101 	} else {
102 		gsize i;
103 
104 		/* Find next word break, and in the same loop checking if only ASCII
105 		 *  characters */
106 		i = parser->cursor + first_unichar_len;
107 		while (1) {
108 			/* Text bounds reached? */
109 			if (i >= parser->txt_size)
110 				break;
111 			/* Proper unicode word break detected? */
112 			if (parser->word_break_flags[i])
113 				break;
114 			/* Forced word break detected? */
115 			if (parser->enable_forced_wordbreaks &&
116 			    IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i]))
117 				break;
118 
119 			if (ascii_only &&
120 			    !IS_ASCII_UCS4 ((guint32)parser->txt[i])) {
121 				ascii_only = FALSE;
122 			}
123 
124 			i++;
125 		}
126 
127 		/* Word end is the first byte after the word, which is either the
128 		 *  start of next word or the end of the string */
129 		*p_word_length = i - parser->cursor;
130 	}
131 
132 	/* We only want the words where the first character
133 	 *  in the word is either a letter, a number or a symbol.
134 	 * This is needed because the word break algorithm also
135 	 *  considers word breaks after for example commas or other
136 	 *  punctuation marks.
137 	 * Note that looking at the first character in the string
138 	 *  should be compatible with all Unicode normalization
139 	 *  methods.
140 	 */
141 	if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) &&
142 	    !uc_is_general_category (first_unichar,
143 	                             parser->allowed_start)) {
144 		*p_is_allowed_word_start = FALSE;
145 		return TRUE;
146 	}
147 
148 	/* Decide word type */
149 	if (ascii_only) {
150 		*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
151 	} else if (IS_CJK_UCS4 (first_unichar)) {
152 		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
153 	} else {
154 		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
155 	}
156 	return TRUE;
157 }
158 
159 static gboolean
160 parser_unaccent_nfkd_word (gchar *word,
161 			   gsize *word_length)
162 {
163 	/* The input word in this method MUST be normalized in NFKD form */
164 	gsize i;
165 	gsize j;
166 
167 	g_return_val_if_fail (word, FALSE);
168 	g_return_val_if_fail (word_length, FALSE);
169 	g_return_val_if_fail (*word_length > 0, FALSE);
170 
171 	i = 0;
172 	j = 0;
173 	while (i < *word_length) {
174 		ucs4_t unichar;
175 		gint utf8_len;
176 
177 		/* Get next character of the word as UCS4 */
178 		utf8_len = u8_strmbtouc (&unichar, &word[i]);
   pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
   (emitted by gcc)179 
180 		/* Invalid UTF-8 character or end of original string. */
181 		if (utf8_len <= 0) {
182 			break;
183 		}
184 
185 		/* If the given unichar is a combining diacritical mark,
186 		 * just update the original index, not the output one */
187 		if (IS_CDM_UCS4 ((guint32) unichar)) {
188 			i += utf8_len;
189 			continue;
190 		}
191 
192 		/* If already found a previous combining
193 		 * diacritical mark, indexes are different so
194 		 * need to copy characters. As output and input
195 		 * buffers may overlap, need to use memmove
196 		 * instead of memcpy */
197 		if (i != j) {
198 			memmove (&word[j], &word[i], utf8_len);
199 		}
200 
201 		/* Update both indexes */
202 		i += utf8_len;
203 		j += utf8_len;
204 	}
205 
206 	/* Force proper string end */
207 	word[j] = '\0';
208 
209 	/* Set new output length */
210 	*word_length = j;
211 
212 	return TRUE;
213 }
214 
215 static gchar *
216 process_word_utf8 (TrackerParser         *parser,
217                    const gchar           *word,
218                    gint                   length,
219                    TrackerParserWordType  type,
220                    gboolean              *stop_word)
221 {
222 	gchar word_buffer [WORD_BUFFER_LENGTH];
223 	gchar *normalized = NULL;
224 	gchar *stemmed = NULL;
225 	size_t new_word_length;
226 
227 	g_return_val_if_fail (parser != NULL, NULL);
228 	g_return_val_if_fail (word != NULL, NULL);
229 
230 	/* If length is set as -1, the input word MUST be NIL-terminated.
231 	 * Otherwise, this restriction is not needed as the length to process
232 	 * is given as input argument */
233 	if (length < 0) {
234 		length = strlen (word);
235 	}
236 
237 	/* Log original word */
238 	tracker_parser_message_hex ("ORIGINAL word",
239 	                            word, length);
240 
241 	/* Normalization and case-folding ONLY for non-ASCII */
242 	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
243 		/* Leave space for last NIL */
244 		new_word_length = WORD_BUFFER_LENGTH - 1;
245 
246 		/* Casefold and NFKD normalization in output.
247 		 * NOTE: if the output buffer is not big enough, u8_casefold will
248 		 * return a newly-allocated buffer. */
249 		normalized = u8_casefold ((const uint8_t *)word,
250 		                          length,
251 		                          uc_locale_language (),
252 		                          UNINORM_NFKD,
253 		                          word_buffer,
254 		                          &new_word_length);
   pointer targets in passing argument 5 of 'u8_casefold' differ in signedness
   (emitted by gcc)255 
256 		/* Case folding + Normalization failed, ignore this word */
257 		g_return_val_if_fail (normalized != NULL, NULL);
258 
259 		/* If output buffer is not the same as the one passed to
260 		 * u8_casefold, we know it was newly-allocated, so need
261 		 * to resize it in 1 byte to add last NIL */
262 		if (normalized != word_buffer) {
263 			normalized = g_realloc (normalized, new_word_length + 1);
264 		}
265 
266 		/* Log after Normalization */
267 		tracker_parser_message_hex (" After Casefolding and NFKD normalization",
268 		                            normalized, new_word_length);
269 	} else {
270 		/* For ASCII-only, just tolower() each character */
271 		gsize i;
272 
273 		normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;
274 
275 		for (i = 0; i < length; i++) {
276 			normalized[i] = g_ascii_tolower (word[i]);
277 		}
278 
279 		new_word_length = length;
280 
281 		/* Log after tolower */
282 		tracker_parser_message_hex (" After Lowercasing",
283 		                            normalized, new_word_length);
284 	}
285 
286 	/* Set output NIL */
287 	normalized[new_word_length] = '\0';
288 
289 	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
290 	if (parser->enable_unaccent &&
291 	    type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
292 	    parser_unaccent_nfkd_word (normalized, &new_word_length)) {
293 		/* Log after UNAC stripping */
294 		tracker_parser_message_hex ("  After UNAC stripping",
295 		                            normalized, new_word_length);
296 	}
297 
298 	/* Check if stop word */
299 	if (parser->ignore_stop_words) {
300 		*stop_word = tracker_language_is_stop_word (parser->language,
301 		                                            normalized);
302 	}
303 
304 	/* Stemming needed? */
305 	if (parser->enable_stemmer) {
306 		stemmed = tracker_language_stem_word (parser->language,
307 		                                      normalized,
308 		                                      new_word_length);
309 
310 		/* Log after stemming */
311 		tracker_parser_message_hex ("   After stemming",
312 		                            stemmed, strlen (stemmed));
313 	}
314 
315 	/* If stemmed wanted and succeeded, free previous and return it */
316 	if (stemmed) {
317 		if (normalized != word_buffer) {
318 			g_free (normalized);
319 		}
320 		return stemmed;
321 	}
322 
323 	/* It may be the case that no stripping and no stemming was needed, and
324 	 * that the output buffer in stack was enough for case-folding and
325 	 * normalization. In this case, need to strdup() the string to return it */
326 	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
327 }
328 
329 static gboolean
330 parser_next (TrackerParser *parser,
331              gint          *byte_offset_start,
332              gint          *byte_offset_end,
333              gboolean      *stop_word)
334 {
335 	gsize word_length = 0;
336 	gchar *processed_word = NULL;
337 
338 	*byte_offset_start = 0;
339 	*byte_offset_end = 0;
340 
341 	g_return_val_if_fail (parser, FALSE);
342 
343 	/* Loop to look for next valid word */
344 	while (!processed_word &&
345 	       parser->cursor < parser->txt_size) {
346 		TrackerParserWordType type;
347 		gsize truncated_length;
348 		gboolean is_allowed;
349 
350 		/* Get word info */
351 		if (!get_word_info (parser,
352 		                    &word_length,
353 		                    &is_allowed,
354 		                    &type)) {
355 			/* Quit loop just in case */
356 			parser->cursor = parser->txt_size;
357 			break;
358 		}
359 
360 		/* Ignore the word if not an allowed word start */
361 		if (!is_allowed) {
362 			/* Ignore this word and keep on looping */
363 			parser->cursor += word_length;
364 			continue;
365 		}
366 
367 		/* Ignore the word if longer than the maximum allowed */
368 		if (word_length >= parser->max_word_length) {
369 			/* Ignore this word and keep on looping */
370 			parser->cursor += word_length;
371 			continue;
372 		}
373 
374 		/* check if word is reserved and ignore it if so */
375 		if (parser->ignore_reserved_words &&
376 		    tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor],
377 		                                          word_length)) {
378 			/* Ignore this word and keep on looping */
379 			parser->cursor += word_length;
380 			continue;
381 		}
382 
383 		/* compute truncated word length if needed (to avoid extremely
384 		 *  long words)*/
385 		truncated_length = (word_length < WORD_BUFFER_LENGTH ?
386 		                    word_length :
387 		                    WORD_BUFFER_LENGTH - 1);
388 
389 		/* Process the word here. If it fails, we can still go
390 		 *  to the next one. Returns newly allocated string
391 		 *  always */
392 		processed_word = process_word_utf8 (parser,
393 		                                    &(parser->txt[parser->cursor]),
394 		                                    truncated_length,
395 		                                    type,
396 		                                    stop_word);
397 		if (!processed_word) {
398 			/* Ignore this word and keep on looping */
399 			parser->cursor += word_length;
400 			continue;
401 		}
402 	}
403 
404 	/* If we got a word here, set output */
405 	if (processed_word) {
406 		/* Set outputs */
407 		*byte_offset_start = parser->cursor;
408 		*byte_offset_end = parser->cursor + word_length;
409 
410 		/* Update cursor */
411 		parser->cursor += word_length;
412 
413 		parser->word_length = strlen (processed_word);
414 		parser->word = processed_word;
415 
416 		return TRUE;
417 	}
418 
419 	/* No more words... */
420 	return FALSE;
421 }
422 
423 TrackerParser *
424 tracker_parser_new (TrackerLanguage *language)
425 {
426 	TrackerParser *parser;
427 
428 	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
429 
430 	parser = g_new0 (TrackerParser, 1);
431 
432 	parser->language = g_object_ref (language);
433 
434 	return parser;
435 }
436 
437 void
438 tracker_parser_free (TrackerParser *parser)
439 {
440 	g_return_if_fail (parser != NULL);
441 
442 	if (parser->language) {
443 		g_object_unref (parser->language);
444 	}
445 
446 	g_free (parser->word_break_flags);
447 
448 	g_free (parser->word);
449 
450 	g_free (parser);
451 }
452 
453 void
454 tracker_parser_reset (TrackerParser *parser,
455                       const gchar   *txt,
456                       gint           txt_size,
457                       guint          max_word_length,
458                       gboolean       enable_stemmer,
459                       gboolean       enable_unaccent,
460                       gboolean       ignore_stop_words,
461                       gboolean       ignore_reserved_words,
462                       gboolean       ignore_numbers)
463 {
464 	g_return_if_fail (parser != NULL);
465 	g_return_if_fail (txt != NULL);
466 
467 	parser->max_word_length = max_word_length;
468 	parser->enable_stemmer = enable_stemmer;
469 	parser->enable_unaccent = enable_unaccent;
470 	parser->ignore_stop_words = ignore_stop_words;
471 	parser->ignore_reserved_words = ignore_reserved_words;
472 	parser->ignore_numbers = ignore_numbers;
473 
474 	/* Note: We're forcing some unicode characters to behave
475 	 * as wordbreakers: e.g, the '.' The main reason for this
476 	 * is to enable FTS searches matching file extension. */
477 	parser->enable_forced_wordbreaks = TRUE;
478 
479 	parser->txt_size = txt_size;
480 	parser->txt = txt;
481 
482 	g_free (parser->word);
483 	parser->word = NULL;
484 
485 	parser->word_position = 0;
486 
487 	parser->cursor = 0;
488 
489 	g_free (parser->word_break_flags);
490 
491 	/* Create array of flags, same size as original text. */
492 	parser->word_break_flags = g_malloc (txt_size);
493 
494 	/* Get wordbreak flags in the whole string */
495 	u8_wordbreaks ((const uint8_t *)txt,
496 	               (size_t) txt_size,
497 	               (char *)parser->word_break_flags);
498 
499 	/* Prepare a custom category which is a combination of the
500 	 * desired ones */
501 	parser->allowed_start = UC_LETTER;
502 	if (!parser->ignore_numbers) {
503 		parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
504 	}
505 }
506 
507 const gchar *
508 tracker_parser_next (TrackerParser *parser,
509                      gint          *position,
510                      gint          *byte_offset_start,
511                      gint          *byte_offset_end,
512                      gboolean      *stop_word,
513                      gint          *word_length)
514 {
515 	const gchar  *str;
516 	gint byte_start = 0, byte_end = 0;
517 
518 	str = NULL;
519 
520 	g_free (parser->word);
521 	parser->word = NULL;
522 
523 	*stop_word = FALSE;
524 
525 	if (parser_next (parser, &byte_start, &byte_end, stop_word)) {
526 		str = parser->word;
527 	}
528 
529 	if (!*stop_word) {
530 		parser->word_position++;
531 	}
532 
533 	*word_length = parser->word_length;
534 	*position = parser->word_position;
535 	*byte_offset_start = byte_start;
536 	*byte_offset_end = byte_end;
537 
538 	return str;
539 }
tracker-0.16.2/src/libtracker-fts/tracker-parser-libunistring.c