tracker-0.16.2/src/libtracker-fts/tracker-parser-libunistring.c

Location Tool Test ID Function Issue
tracker-parser-libunistring.c:87:36 gcc pointer-sign get_word_info pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
tracker-parser-libunistring.c:178:3 gcc pointer-sign parser_unaccent_nfkd_word pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
tracker-parser-libunistring.c:254:29 gcc pointer-sign process_word_utf8 pointer targets in passing argument 5 of 'u8_casefold' differ in signedness
  1 /*
  2  * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
  3  * Copyright (C) 2008,2009,2010 Nokia <ivan.frade@nokia.com>
  4  *
  5  * This library is free software; you can redistribute it and/or
  6  * modify it under the terms of the GNU Lesser General Public
  7  * License as published by the Free Software Foundation; either
  8  * version 2.1 of the License, or (at your option) any later version.
  9  *
 10  * This library is distributed in the hope that it will be useful,
 11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13  * Lesser General Public License for more details.
 14  *
 15  * You should have received a copy of the GNU Lesser General Public
 16  * License along with this library; if not, write to the Free Software
 17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 18  * 02110-1301  USA
 19  */
 20 
 21 #include "config.h"
 22 
 23 #include <stdio.h>
 24 #include <string.h>
 25 
 26 /* libunistring versions prior to 9.1.2 need this hack */
 27 #define _UNUSED_PARAMETER_
 28 #include <unistr.h>
 29 #include <uniwbrk.h>
 30 #include <unictype.h>
 31 #include <unicase.h>
 32 
 33 #include "tracker-parser.h"
 34 #include "tracker-parser-utils.h"
 35 
 36 /* Type of words detected by get_word_info(); process_word_utf8() uses it to choose the processing steps */
 37 typedef enum {
 38 	TRACKER_PARSER_WORD_TYPE_ASCII,          /* only ASCII seen in the word: just lowercased, no normalization */
 39 	TRACKER_PARSER_WORD_TYPE_OTHER_UNAC,     /* non-ASCII, first character not CJK: casefold + NFKD, unaccenting allowed */
 40 	TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC,  /* non-ASCII, first character is CJK: casefold + NFKD, never unaccented */
 41 } TrackerParserWordType;
 42 
 43 /* Max possible length of a UTF-8 encoded string (just a safety limit) */
 44 #define WORD_BUFFER_LENGTH 512
 45 
 46 struct TrackerParser {
 47 	const gchar           *txt;                      /* input text being parsed; pointer stored by tracker_parser_reset(), not copied */
 48 	gint                   txt_size;                 /* size of txt, in bytes */
 49 
 50 	TrackerLanguage       *language;                 /* referenced in tracker_parser_new(), unreferenced in tracker_parser_free() */
 51 	guint                  max_word_length;          /* words whose byte length is >= this value are skipped */
 52 	gboolean               enable_stemmer;           /* stem the processed word through the language object */
 53 	gboolean               enable_unaccent;          /* strip combining diacritical marks from non-ASCII, non-CJK words */
 54 	gboolean               ignore_stop_words;        /* when set, each word is checked against the language stop words */
 55 	gboolean               ignore_reserved_words;    /* skip words reported by tracker_parser_is_reserved_word_utf8() */
 56 	gboolean               ignore_numbers;           /* when set, words may not start with a character of category UC_NUMBER */
 57 	gboolean               enable_forced_wordbreaks; /* always set to TRUE by tracker_parser_reset() */
 58 
 59 	/* Private members */
 60 	gchar                 *word;                     /* last word returned; owned by the parser and freed with g_free() */
 61 	gint                   word_length;              /* strlen() of word */
 62 	guint                  word_position;            /* incremented by tracker_parser_next() whenever no stop word was flagged */
 63 
 64 	/* Cursor, as index of the input array of bytes */
 65 	gsize                  cursor;
 66 	/* libunistring flags array: one flag per input byte, filled by u8_wordbreaks() */
 67 	gchar                 *word_break_flags;
 68 	/* general category of the start character in words (UC_LETTER, plus UC_NUMBER unless ignore_numbers) */
 69 	uc_general_category_t  allowed_start;
 70 };
 71 
 72 static gboolean
 73 get_word_info (TrackerParser         *parser,
 74                gsize                 *p_word_length,
 75                gboolean              *p_is_allowed_word_start,
 76                TrackerParserWordType *p_word_type)
 77 {
 78 	ucs4_t first_unichar;
 79 	gint first_unichar_len;
 80 	gboolean ascii_only;
 81 
 82 	/* Defaults */
 83 	*p_is_allowed_word_start = TRUE;
 84 
 85 	/* Get first character of the word as UCS4 */
 86 	first_unichar_len = u8_strmbtouc (&first_unichar,
 87 	                                  &(parser->txt[parser->cursor]));
pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
(emitted by gcc)
88 if (first_unichar_len <= 0) { 89 /* This should only happen if NIL was passed to u8_strmbtouc, 90 * so better just force stop here */ 91 return FALSE; 92 } else { 93 /* If first character has length 1, it's ASCII-7 */ 94 ascii_only = first_unichar_len == 1 ? TRUE : FALSE; 95 } 96 97 /* Consider word starts with a forced wordbreak */ 98 if (parser->enable_forced_wordbreaks && 99 IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) { 100 *p_word_length = first_unichar_len; 101 } else { 102 gsize i; 103 104 /* Find next word break, and in the same loop checking if only ASCII 105 * characters */ 106 i = parser->cursor + first_unichar_len; 107 while (1) { 108 /* Text bounds reached? */ 109 if (i >= parser->txt_size) 110 break; 111 /* Proper unicode word break detected? */ 112 if (parser->word_break_flags[i]) 113 break; 114 /* Forced word break detected? */ 115 if (parser->enable_forced_wordbreaks && 116 IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i])) 117 break; 118 119 if (ascii_only && 120 !IS_ASCII_UCS4 ((guint32)parser->txt[i])) { 121 ascii_only = FALSE; 122 } 123 124 i++; 125 } 126 127 /* Word end is the first byte after the word, which is either the 128 * start of next word or the end of the string */ 129 *p_word_length = i - parser->cursor; 130 } 131 132 /* We only want the words where the first character 133 * in the word is either a letter, a number or a symbol. 134 * This is needed because the word break algorithm also 135 * considers word breaks after for example commas or other 136 * punctuation marks. 137 * Note that looking at the first character in the string 138 * should be compatible with all Unicode normalization 139 * methods. 
140 */ 141 if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) && 142 !uc_is_general_category (first_unichar, 143 parser->allowed_start)) { 144 *p_is_allowed_word_start = FALSE; 145 return TRUE; 146 } 147 148 /* Decide word type */ 149 if (ascii_only) { 150 *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII; 151 } else if (IS_CJK_UCS4 (first_unichar)) { 152 *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC; 153 } else { 154 *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC; 155 } 156 return TRUE; 157 } 158 159 static gboolean 160 parser_unaccent_nfkd_word (gchar *word, 161 gsize *word_length) 162 { 163 /* The input word in this method MUST be normalized in NFKD form */ 164 gsize i; 165 gsize j; 166 167 g_return_val_if_fail (word, FALSE); 168 g_return_val_if_fail (word_length, FALSE); 169 g_return_val_if_fail (*word_length > 0, FALSE); 170 171 i = 0; 172 j = 0; 173 while (i < *word_length) { 174 ucs4_t unichar; 175 gint utf8_len; 176 177 /* Get next character of the word as UCS4 */ 178 utf8_len = u8_strmbtouc (&unichar, &word[i]);
pointer targets in passing argument 2 of 'u8_strmbtouc' differ in signedness
(emitted by gcc)
179 180 /* Invalid UTF-8 character or end of original string. */ 181 if (utf8_len <= 0) { 182 break; 183 } 184 185 /* If the given unichar is a combining diacritical mark, 186 * just update the original index, not the output one */ 187 if (IS_CDM_UCS4 ((guint32) unichar)) { 188 i += utf8_len; 189 continue; 190 } 191 192 /* If already found a previous combining 193 * diacritical mark, indexes are different so 194 * need to copy characters. As output and input 195 * buffers may overlap, need to use memmove 196 * instead of memcpy */ 197 if (i != j) { 198 memmove (&word[j], &word[i], utf8_len); 199 } 200 201 /* Update both indexes */ 202 i += utf8_len; 203 j += utf8_len; 204 } 205 206 /* Force proper string end */ 207 word[j] = '\0'; 208 209 /* Set new output length */ 210 *word_length = j; 211 212 return TRUE; 213 } 214 215 static gchar * 216 process_word_utf8 (TrackerParser *parser, 217 const gchar *word, 218 gint length, 219 TrackerParserWordType type, 220 gboolean *stop_word) 221 { 222 gchar word_buffer [WORD_BUFFER_LENGTH]; 223 gchar *normalized = NULL; 224 gchar *stemmed = NULL; 225 size_t new_word_length; 226 227 g_return_val_if_fail (parser != NULL, NULL); 228 g_return_val_if_fail (word != NULL, NULL); 229 230 /* If length is set as -1, the input word MUST be NIL-terminated. 231 * Otherwise, this restriction is not needed as the length to process 232 * is given as input argument */ 233 if (length < 0) { 234 length = strlen (word); 235 } 236 237 /* Log original word */ 238 tracker_parser_message_hex ("ORIGINAL word", 239 word, length); 240 241 /* Normalization and case-folding ONLY for non-ASCII */ 242 if (type != TRACKER_PARSER_WORD_TYPE_ASCII) { 243 /* Leave space for last NIL */ 244 new_word_length = WORD_BUFFER_LENGTH - 1; 245 246 /* Casefold and NFKD normalization in output. 247 * NOTE: if the output buffer is not big enough, u8_casefold will 248 * return a newly-allocated buffer. 
*/ 249 normalized = u8_casefold ((const uint8_t *)word, 250 length, 251 uc_locale_language (), 252 UNINORM_NFKD, 253 word_buffer, 254 &new_word_length);
pointer targets in passing argument 5 of 'u8_casefold' differ in signedness
(emitted by gcc)
255 256 /* Case folding + Normalization failed, ignore this word */ 257 g_return_val_if_fail (normalized != NULL, NULL); 258 259 /* If output buffer is not the same as the one passed to 260 * u8_casefold, we know it was newly-allocated, so need 261 * to resize it in 1 byte to add last NIL */ 262 if (normalized != word_buffer) { 263 normalized = g_realloc (normalized, new_word_length + 1); 264 } 265 266 /* Log after Normalization */ 267 tracker_parser_message_hex (" After Casefolding and NFKD normalization", 268 normalized, new_word_length); 269 } else { 270 /* For ASCII-only, just tolower() each character */ 271 gsize i; 272 273 normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer; 274 275 for (i = 0; i < length; i++) { 276 normalized[i] = g_ascii_tolower (word[i]); 277 } 278 279 new_word_length = length; 280 281 /* Log after tolower */ 282 tracker_parser_message_hex (" After Lowercasing", 283 normalized, new_word_length); 284 } 285 286 /* Set output NIL */ 287 normalized[new_word_length] = '\0'; 288 289 /* UNAC stripping needed? (for non-CJK and non-ASCII) */ 290 if (parser->enable_unaccent && 291 type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC && 292 parser_unaccent_nfkd_word (normalized, &new_word_length)) { 293 /* Log after UNAC stripping */ 294 tracker_parser_message_hex (" After UNAC stripping", 295 normalized, new_word_length); 296 } 297 298 /* Check if stop word */ 299 if (parser->ignore_stop_words) { 300 *stop_word = tracker_language_is_stop_word (parser->language, 301 normalized); 302 } 303 304 /* Stemming needed? 
*/ 305 if (parser->enable_stemmer) { 306 stemmed = tracker_language_stem_word (parser->language, 307 normalized, 308 new_word_length); 309 310 /* Log after stemming */ 311 tracker_parser_message_hex (" After stemming", 312 stemmed, strlen (stemmed)); 313 } 314 315 /* If stemmed wanted and succeeded, free previous and return it */ 316 if (stemmed) { 317 if (normalized != word_buffer) { 318 g_free (normalized); 319 } 320 return stemmed; 321 } 322 323 /* It may be the case that no stripping and no stemming was needed, and 324 * that the output buffer in stack was enough for case-folding and 325 * normalization. In this case, need to strdup() the string to return it */ 326 return normalized == word_buffer ? g_strdup (word_buffer) : normalized; 327 } 328 329 static gboolean 330 parser_next (TrackerParser *parser, 331 gint *byte_offset_start, 332 gint *byte_offset_end, 333 gboolean *stop_word) 334 { 335 gsize word_length = 0; 336 gchar *processed_word = NULL; 337 338 *byte_offset_start = 0; 339 *byte_offset_end = 0; 340 341 g_return_val_if_fail (parser, FALSE); 342 343 /* Loop to look for next valid word */ 344 while (!processed_word && 345 parser->cursor < parser->txt_size) { 346 TrackerParserWordType type; 347 gsize truncated_length; 348 gboolean is_allowed; 349 350 /* Get word info */ 351 if (!get_word_info (parser, 352 &word_length, 353 &is_allowed, 354 &type)) { 355 /* Quit loop just in case */ 356 parser->cursor = parser->txt_size; 357 break; 358 } 359 360 /* Ignore the word if not an allowed word start */ 361 if (!is_allowed) { 362 /* Ignore this word and keep on looping */ 363 parser->cursor += word_length; 364 continue; 365 } 366 367 /* Ignore the word if longer than the maximum allowed */ 368 if (word_length >= parser->max_word_length) { 369 /* Ignore this word and keep on looping */ 370 parser->cursor += word_length; 371 continue; 372 } 373 374 /* check if word is reserved and ignore it if so */ 375 if (parser->ignore_reserved_words && 376 
tracker_parser_is_reserved_word_utf8 (&parser->txt[parser->cursor], 377 word_length)) { 378 /* Ignore this word and keep on looping */ 379 parser->cursor += word_length; 380 continue; 381 } 382 383 /* compute truncated word length if needed (to avoid extremely 384 * long words)*/ 385 truncated_length = (word_length < WORD_BUFFER_LENGTH ? 386 word_length : 387 WORD_BUFFER_LENGTH - 1); 388 389 /* Process the word here. If it fails, we can still go 390 * to the next one. Returns newly allocated string 391 * always */ 392 processed_word = process_word_utf8 (parser, 393 &(parser->txt[parser->cursor]), 394 truncated_length, 395 type, 396 stop_word); 397 if (!processed_word) { 398 /* Ignore this word and keep on looping */ 399 parser->cursor += word_length; 400 continue; 401 } 402 } 403 404 /* If we got a word here, set output */ 405 if (processed_word) { 406 /* Set outputs */ 407 *byte_offset_start = parser->cursor; 408 *byte_offset_end = parser->cursor + word_length; 409 410 /* Update cursor */ 411 parser->cursor += word_length; 412 413 parser->word_length = strlen (processed_word); 414 parser->word = processed_word; 415 416 return TRUE; 417 } 418 419 /* No more words... 
*/ 420 return FALSE; 421 } 422 423 TrackerParser * 424 tracker_parser_new (TrackerLanguage *language) 425 { 426 TrackerParser *parser; 427 428 g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL); 429 430 parser = g_new0 (TrackerParser, 1); 431 432 parser->language = g_object_ref (language); 433 434 return parser; 435 } 436 437 void 438 tracker_parser_free (TrackerParser *parser) 439 { 440 g_return_if_fail (parser != NULL); 441 442 if (parser->language) { 443 g_object_unref (parser->language); 444 } 445 446 g_free (parser->word_break_flags); 447 448 g_free (parser->word); 449 450 g_free (parser); 451 } 452 453 void 454 tracker_parser_reset (TrackerParser *parser, 455 const gchar *txt, 456 gint txt_size, 457 guint max_word_length, 458 gboolean enable_stemmer, 459 gboolean enable_unaccent, 460 gboolean ignore_stop_words, 461 gboolean ignore_reserved_words, 462 gboolean ignore_numbers) 463 { 464 g_return_if_fail (parser != NULL); 465 g_return_if_fail (txt != NULL); 466 467 parser->max_word_length = max_word_length; 468 parser->enable_stemmer = enable_stemmer; 469 parser->enable_unaccent = enable_unaccent; 470 parser->ignore_stop_words = ignore_stop_words; 471 parser->ignore_reserved_words = ignore_reserved_words; 472 parser->ignore_numbers = ignore_numbers; 473 474 /* Note: We're forcing some unicode characters to behave 475 * as wordbreakers: e.g, the '.' The main reason for this 476 * is to enable FTS searches matching file extension. */ 477 parser->enable_forced_wordbreaks = TRUE; 478 479 parser->txt_size = txt_size; 480 parser->txt = txt; 481 482 g_free (parser->word); 483 parser->word = NULL; 484 485 parser->word_position = 0; 486 487 parser->cursor = 0; 488 489 g_free (parser->word_break_flags); 490 491 /* Create array of flags, same size as original text. 
*/ 492 parser->word_break_flags = g_malloc (txt_size); 493 494 /* Get wordbreak flags in the whole string */ 495 u8_wordbreaks ((const uint8_t *)txt, 496 (size_t) txt_size, 497 (char *)parser->word_break_flags); 498 499 /* Prepare a custom category which is a combination of the 500 * desired ones */ 501 parser->allowed_start = UC_LETTER; 502 if (!parser->ignore_numbers) { 503 parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER); 504 } 505 } 506 507 const gchar * 508 tracker_parser_next (TrackerParser *parser, 509 gint *position, 510 gint *byte_offset_start, 511 gint *byte_offset_end, 512 gboolean *stop_word, 513 gint *word_length) 514 { 515 const gchar *str; 516 gint byte_start = 0, byte_end = 0; 517 518 str = NULL; 519 520 g_free (parser->word); 521 parser->word = NULL; 522 523 *stop_word = FALSE; 524 525 if (parser_next (parser, &byte_start, &byte_end, stop_word)) { 526 str = parser->word; 527 } 528 529 if (!*stop_word) { 530 parser->word_position++; 531 } 532 533 *word_length = parser->word_length; 534 *position = parser->word_position; 535 *byte_offset_start = byte_start; 536 *byte_offset_end = byte_end; 537 538 return str; 539 }