1 /*
  2  * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
  3  *
  4  * This library is free software; you can redistribute it and/or
  5  * modify it under the terms of the GNU General Public
  6  * License as published by the Free Software Foundation; either
  7  * version 2 of the License, or (at your option) any later version.
  8  *
  9  * This library is distributed in the hope that it will be useful,
 10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 12  * General Public License for more details.
 13  *
 14  * You should have received a copy of the GNU General Public
 15  * License along with this library; if not, write to the
 16  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 17  * Boston, MA  02110-1301, USA.
 18  */
 19 
 20 #include "config.h"
 21 
 22 #include <string.h>
 23 
 24 #include <glib.h>
 25 #include <gio/gio.h>
 26 
 27 #include <libtracker-fts/tracker-parser.h>
 28 
/* Note
 *  Currently, three different types of parsers are defined in libtracker-fts:
 *    - GNU libunistring-based parser, up to 20% faster than the others, and
 *       fully Unicode compliant.
 *    - libicu-based parser, fully Unicode compliant but slower, as it needs
 *       conversions to/from UChars (UTF-16 encoded strings).
 *    - glib/pango parser, not fully Unicode compliant, as it doesn't work
 *       properly with decomposed strings (NFD normalized), doesn't do
 *       Unicode-based word-breaking and doesn't do full-word casefolding.
 *
 * Some of the tests, thus, will be DISABLED for the GLIB/PANGO parser:
 * FULL_UNICODE_TESTS is defined only when HAVE_LIBUNISTRING or HAVE_LIBICU
 * is defined, and the test tables in this file use it to guard the entries
 * that need a fully Unicode compliant parser.
 */
#undef FULL_UNICODE_TESTS
#if defined HAVE_LIBUNISTRING || defined HAVE_LIBICU
#define FULL_UNICODE_TESTS
#endif
 45 
 46 /* -------------- COMMON FOR ALL TESTS ----------------- */
 47 
 48 /* Fixture object type */
 49 typedef struct {
 50 	/* The parser object */
 51 	TrackerParser *parser;
 52 
 53 	/* Default parser configuration to use */
 54 	gint max_word_length;
 55 	gboolean enable_stemmer;
 56 	gboolean enable_unaccent;
 57 	gboolean ignore_stop_words;
 58 	gboolean ignore_reserved_words;
 59 	gboolean ignore_numbers;
 60 } TrackerParserTestFixture;
 61 
 62 /* Common setup for all tests */
 63 static void
 64 test_common_setup (TrackerParserTestFixture *fixture,
 65                    gconstpointer data)
 66 {
 67 	TrackerLanguage  *language;
 68 
 69 	/* Setup language for parser. We make sure that always English is used
 70 	 *  in the unit tests, because we want the English stemming method to
 71 	 *  be used. */
 72 	language = tracker_language_new ("en");
 73 	if (!language) {
 74 		g_critical ("Language setup failed!");
 75 		return;
 76 	}
 77 
 78 	/* Default conf parameters */
 79 	fixture->max_word_length = 50;
 80 	fixture->enable_stemmer = TRUE;
 81 	fixture->enable_unaccent = TRUE;
 82 	fixture->ignore_stop_words = TRUE;
 83 	fixture->ignore_reserved_words = TRUE;
 84 	fixture->ignore_numbers = TRUE;
 85 
 86 	/* Create the parser */
 87 	fixture->parser = tracker_parser_new (language);
 88 	if (!fixture->parser) {
 89 		g_critical ("Parser creation failed!");
 90 		return;
 91 	}
 92 
 93 	g_object_unref (language);
 94 }
 95 
 96 /* Common teardown for all tests */
 97 static void
 98 test_common_teardown (TrackerParserTestFixture *fixture,
 99                       gconstpointer data)
100 {
101 	if (fixture->parser) {
102 		tracker_parser_free (fixture->parser);
103 	}
104 }
105 
106 /* -------------- EXPECTED NUMBER OF WORDS TESTS ----------------- */
107 
/* Test struct for the expected-nwords tests */
typedef struct TestDataExpectedNWords TestDataExpectedNWords;
struct TestDataExpectedNWords {
	/* Input text handed to the parser; NULL marks the end of a table */
	const gchar *str;
	/* Passed to tracker_parser_reset() in place of the fixture's
	 *  ignore_numbers default */
	gboolean ignore_numbers;
	/* Number of words the parser is expected to return for str */
	guint expected_nwords;
};
115 
116 /* Common expected_word test method */
117 static void
118 expected_nwords_check (TrackerParserTestFixture *fixture,
119                        gconstpointer data)
120 {
121 	const TestDataExpectedNWords *testdata = data;
122 	gint position;
123 	gint byte_offset_start;
124 	gint byte_offset_end;
125 	gboolean stop_word;
126 	gint word_length;
127 	guint nwords = 0;
128 
129 	/* Reset the parser with the test string */
130 	tracker_parser_reset (fixture->parser,
131 	                      testdata->str,
132 	                      strlen (testdata->str),
133 	                      fixture->max_word_length,
134 	                      fixture->enable_stemmer,
135 	                      fixture->enable_unaccent,
136 	                      fixture->ignore_stop_words,
137 	                      fixture->ignore_reserved_words,
138 	                      testdata->ignore_numbers);
139 
140 	/* Count number of output words */
141 	while (tracker_parser_next (fixture->parser,
142 				    &position,
143 				    &byte_offset_start,
144 				    &byte_offset_end,
145 				    &stop_word,
146 				    &word_length)) {
147 		nwords++;
148 	}
149 
150 	/* Check if input is same as expected */
151 	g_assert_cmpuint (nwords, == , testdata->expected_nwords);
152 }
153 
154 /* -------------- EXPECTED WORD TESTS ----------------- */
155 
/* Test struct for the expected-word tests */
typedef struct TestDataExpectedWord TestDataExpectedWord;
struct TestDataExpectedWord {
	/* Input text handed to the parser; NULL marks the end of a table */
	const gchar *str;
	/* Word expected as the parser's first output; the check normalizes it
	 *  to NFKD before comparing */
	const gchar *expected;
	/* Passed to tracker_parser_reset() in place of the fixture's
	 *  enable_stemmer default */
	gboolean enable_stemmer;
	/* Passed to tracker_parser_reset() in place of the fixture's
	 *  enable_unaccent default */
	gboolean enable_unaccent;
};
164 
165 /* Common expected_word test method */
166 static void
167 expected_word_check (TrackerParserTestFixture *fixture,
168                      gconstpointer data)
169 {
170 	const TestDataExpectedWord *testdata = data;
171 	const gchar *word;
172 	gchar *expected_nfkd;
173 	gint position;
174 	gint byte_offset_start;
175 	gint byte_offset_end;
176 	gboolean stop_word;
177 	gint word_length;
178 
179 	/* Reset the parser with our string */
180 	tracker_parser_reset (fixture->parser,
181 	                      testdata->str,
182 	                      strlen (testdata->str),
183 	                      fixture->max_word_length,
184 	                      testdata->enable_stemmer,
185 	                      testdata->enable_unaccent,
186 	                      fixture->ignore_stop_words,
187 	                      fixture->ignore_reserved_words,
188 	                      fixture->ignore_numbers);
189 
190 	/* Process next word */
191 	word = tracker_parser_next (fixture->parser,
192 	                            &position,
193 	                            &byte_offset_start,
194 	                            &byte_offset_end,
195 	                            &stop_word,
196 	                            &word_length);
197 
198 	/* Expected word MUST always be in NFKD normalization */
199 	expected_nfkd = g_utf8_normalize (testdata->expected,
200 	                                  -1,
201 	                                  G_NORMALIZE_NFKD);
202 
203 	/* Check if input is same as expected */
204 	g_assert_cmpstr (word, == , expected_nfkd);
205 
206 	g_free (expected_nfkd);
207 }
208 
209 /* -------------- STOP WORD TESTS ----------------- */
210 
/* Test struct for the stop-word tests */
typedef struct TestDataStopWord TestDataStopWord;
struct TestDataStopWord {
	/* Input text handed to the parser; NULL marks the end of a table */
	const gchar *str;
	/* Passed to tracker_parser_reset() in place of the fixture's
	 *  ignore_stop_words default */
	gboolean ignore_stop_words;
	/* Value the parser is expected to report through its stop_word
	 *  output argument for the first word of str */
	gboolean is_expected_stop_word;
};
218 
219 /* Common stop__word test method */
220 static void
221 stop_word_check (TrackerParserTestFixture *fixture,
222                  gconstpointer data)
223 {
224 	const TestDataStopWord *testdata = data;
225 	gint position;
226 	gint byte_offset_start;
227 	gint byte_offset_end;
228 	gboolean stop_word;
229 	gint word_length;
230 
231 	/* Reset the parser with our string */
232 	tracker_parser_reset (fixture->parser,
233 	                      testdata->str,
234 	                      strlen (testdata->str),
235 	                      fixture->max_word_length,
236 	                      fixture->enable_stemmer,
237 	                      fixture->enable_unaccent,
238 	                      testdata->ignore_stop_words,
239 	                      fixture->ignore_reserved_words,
240 	                      fixture->ignore_numbers);
241 
242 	/* Process next word */
243 	tracker_parser_next (fixture->parser,
244 			     &position,
245 			     &byte_offset_start,
246 			     &byte_offset_end,
247 			     &stop_word,
248 			     &word_length);
249 
250 	/* Check if input is same as stop_word */
251 	g_assert_cmpuint (stop_word, == , testdata->is_expected_stop_word);
252 }
253 
254 /* -------------- LIST OF TESTS ----------------- */
255 
/* Normalization-related tests (unaccenting).
 *
 * Columns: input string, expected first word, enable_stemmer,
 * enable_unaccent. The table ends at the entry whose input is NULL. */
static const TestDataExpectedWord test_data_normalization[] = {
	{ "école",                "ecole", FALSE, TRUE  },
	{ "ÉCOLE",                "ecole", FALSE, TRUE  },
	{ "École",                "ecole", FALSE, TRUE  },
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
	/* Same words in decomposed form: "\xCC\x81" is the UTF-8 encoding of
	 *  U+0301 (combining acute accent), placed after the base letter */
	{ "e" "\xCC\x81" "cole",  "ecole", FALSE, TRUE  },
	{ "E" "\xCC\x81" "COLE",  "ecole", FALSE, TRUE  },
	{ "E" "\xCC\x81" "cole",  "ecole", FALSE, TRUE  },
#endif
	{ NULL,                   NULL,    FALSE, FALSE }
};
268 
269 /* Unaccenting-related tests */
270 static const TestDataExpectedWord test_data_unaccent[] = {
271 	{ "Murciélago",   "murcielago", FALSE, TRUE  },
272 	{ "camión",       "camion",     FALSE, TRUE  },
273 	{ "desagüe",      "desague",    FALSE, TRUE  },
274 	{ "Ὰ",            "α",          FALSE, TRUE  }, /* greek capital alpha with U+0300, composed */
275 	{ "ὰ",            "α",          FALSE, TRUE  }, /* greek small alpha with U+0300, composed */
276 	{ "Ὶ",            "ι",          FALSE, TRUE  }, /* greek capital iotta with U+0300, composed */
277 	{ "ὶ",            "ι",          FALSE, TRUE  }, /* greek small iotta with U+0300, composed */
278 	{ "Ὼ",            "ω",          FALSE, TRUE  }, /* greek capital omega with U+0300, composed */
279 	{ "ὼ",            "ω",          FALSE, TRUE  }, /* greek small omega with U+0300, composed */
280 #ifdef FULL_UNICODE_TESTS /* glib/pango does not like NFD strings */
281 	{ "Ὰ",          "α",          FALSE, TRUE  }, /* capital alpha with U+0300, decomposed */
282 	{ "ὰ",          "α",          FALSE, TRUE  }, /* small alpha with U+0300, decomposed */
283 	{ "Ὶ",          "ι",          FALSE, TRUE  }, /* capital iotta with U+0300, decomposed */
284 	{ "ὶ",          "ι",          FALSE, TRUE  }, /* small iotta with U+0300, decomposed */
285 	{ "Ὼ",          "ω",          FALSE, TRUE  }, /* capital omega with U+0300, decomposed */
286 	{ "ὼ",          "ω",          FALSE, TRUE  }, /* small omega with U+0300, decomposed */
287 	{ "aN͡Ga",       "anga",       FALSE, TRUE  }, /* 0x0361 affects to two characters */
288 	{ "aNG͡a",       "anga",       FALSE, TRUE  }, /* 0x0361 affects to two characters */
289 #endif
290 	{ "Murciélago", "murciélago", FALSE, FALSE },
291 	{ "camión",     "camión",     FALSE, FALSE },
292 	{ "desagüe",    "desagüe",    FALSE, FALSE },
293 	{ NULL,         NULL,         FALSE, FALSE }
294 };
295 
/* Stemming-related tests.
 *
 * Columns: input string, expected first word, enable_stemmer,
 * enable_unaccent. The same input is checked with the stemmer enabled and
 * disabled. The table ends at the entry whose input is NULL. */
static const TestDataExpectedWord test_data_stemming[] = {
	{ "ecole", "ecol",  TRUE,  TRUE  },
	{ "ecole", "ecole", FALSE, TRUE  },
	{ NULL,    NULL,    FALSE, FALSE }
};
302 
/* Casefolding-related tests.
 *
 * Columns: input string, expected first word, enable_stemmer,
 * enable_unaccent. Every input is expected to come out as lowercase
 * "gross". The table ends at the entry whose input is NULL. */
static const TestDataExpectedWord test_data_casefolding[] = {
	{ "gross", "gross", FALSE, TRUE  },
	{ "GROSS", "gross", FALSE, TRUE  },
	{ "GrOsS", "gross", FALSE, TRUE  },
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't do full-word casefolding */
	{ "groß",  "gross", FALSE, TRUE  },
#endif
	{ NULL,    NULL,    FALSE, FALSE }
};
313 
/* Number of expected words tests.
 *
 * Columns: input string, ignore_numbers, expected number of words returned
 * by the parser. The table ends at the entry whose input is NULL. */
static const TestDataExpectedNWords test_data_nwords[] = {
#ifdef FULL_UNICODE_TESTS /* glib/pango assumes ' is a word breaker */
	{ "The quick (\"brown\") fox can’t jump 32.3 feet, right?", TRUE,   8 },
	{ "The quick (\"brown\") fox can’t jump 32.3 feet, right?", FALSE, 10 },
#endif
	/* Note: as of 0.9.15, the dot is always a word breaker, even between
	 *  numbers. */
	{ "filename.txt",                                           TRUE,   2 },
	{ ".hidden.txt",                                            TRUE,   2 },
	{ "noextension.",                                           TRUE,   1 },
	{ "ホモ・サピエンス",                                          TRUE,   2 }, /* katakana */
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't work properly with chinese */
	{ "本州最主流的风味",                                          TRUE,   8 }, /* chinese */
#endif
	{ "Американские суда находятся в международных водах.",     TRUE,   6 }, /* russian */
#ifdef FULL_UNICODE_TESTS /* glib/pango doesn't work properly with chinese */
	{ "Bần chỉ là một anh nghèo xác",                            TRUE,   7 }, /* vietnamese */
	{ "ホモ・サピエンス 本州最主流的风味 katakana, chinese, english", TRUE,  13 }, /* mixed */
#endif
	{ NULL,                                                     FALSE,  0 }
};
336 
/* Stop-word tests (for english only).
 *
 * Columns: input string, ignore_stop_words, expected value of the stop-word
 * flag reported by the parser. Note that "hello" is only expected to be
 * flagged when ignore_stop_words is TRUE. The table ends at the entry whose
 * input is NULL. */
static const TestDataStopWord test_data_stop_words[] = {
	{ "hello", TRUE,  TRUE  }, /* hello is stop word */
	{ "hello", FALSE, FALSE },
	{ "world", TRUE,  FALSE }, /* world is not stop word */
	{ "world", FALSE, FALSE },
	{ NULL,    FALSE, FALSE }
};
345 
346 int
347 main (int argc, char **argv)
348 {
349 	gint i;
350 
351 	g_test_init (&argc, &argv, NULL);
352 
353 	/* We want the tests to properly find the stopwords dictionaries, so we
354 	 *  need to set the following envvar with the path where the
355 	 *  dictionaries are. */
356 	g_setenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR",
357 	          TOP_SRCDIR "/data/languages",
358 	          TRUE);
359 
360 	/* Add normalization checks */
361 	for (i = 0; test_data_normalization[i].str != NULL; i++) {
362 		gchar *testpath;
363 
364 		testpath = g_strdup_printf ("/libtracker-fts/parser/normalization_%d", i);
365 		g_test_add (testpath,
366 		            TrackerParserTestFixture,
367 		            &test_data_normalization[i],
368 		            test_common_setup,
369 		            expected_word_check,
370 		            test_common_teardown);
371 		g_free (testpath);
372 	}
373 
374 #ifdef HAVE_UNAC
375 	/* Add unaccent checks */
376 	for (i = 0; test_data_unaccent[i].str != NULL; i++) {
377 		gchar *testpath;
378 
379 		testpath = g_strdup_printf ("/libtracker-fts/parser/unaccent_%d", i);
380 		g_test_add (testpath,
381 		            TrackerParserTestFixture,
382 		            &test_data_unaccent[i],
383 		            test_common_setup,
384 		            expected_word_check,
385 		            test_common_teardown);
386 		g_free (testpath);
387 	}
388 #endif
389 
390 	/* Add casefolding checks */
391 	for (i = 0; test_data_casefolding[i].str != NULL; i++) {
392 		gchar *testpath;
393 
394 		testpath = g_strdup_printf ("/libtracker-fts/parser/casefolding_%d", i);
395 		g_test_add (testpath,
396 		            TrackerParserTestFixture,
397 		            &test_data_casefolding[i],
398 		            test_common_setup,
399 		            expected_word_check,
400 		            test_common_teardown);
401 		g_free (testpath);
402 	}
403 
404 	/* Add stemming checks */
405 	for (i = 0; test_data_stemming[i].str != NULL; i++) {
406 		gchar *testpath;
407 
408 		testpath = g_strdup_printf ("/libtracker-fts/parser/stemming_%d", i);
409 		g_test_add (testpath,
410 		            TrackerParserTestFixture,
411 		            &test_data_stemming[i],
412 		            test_common_setup,
413 		            expected_word_check,
414 		            test_common_teardown);
415 		g_free (testpath);
416 	}
417 
418 	/* Add expected number of words checks */
419 	for (i = 0; test_data_nwords[i].str != NULL; i++) {
420 		gchar *testpath;
421 
422 		testpath = g_strdup_printf ("/libtracker-fts/parser/nwords_%d", i);
423 		g_test_add (testpath,
424 		            TrackerParserTestFixture,
425 		            &test_data_nwords[i],
426 		            test_common_setup,
427 		            expected_nwords_check,
428 		            test_common_teardown);
429 		g_free (testpath);
430 	}
431 
432 	/* Add stop word checks */
433 	for (i = 0; test_data_stop_words[i].str != NULL; i++) {
434 		gchar *testpath;
435 
436 		testpath = g_strdup_printf ("/libtracker-fts/parser/stop_words_%d", i);
437 		g_test_add (testpath,
438 		            TrackerParserTestFixture,
439 		            &test_data_stop_words[i],
440 		            test_common_setup,
441 		            stop_word_check,
442 		            test_common_teardown);
443 		g_free (testpath);
444 	}
445 
446 	return g_test_run ();
447 }
tracker-0.16.2/tests/libtracker-fts/tracker-parser-test.c