No issues found
1 /*
2 * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
18 */
19
20 #include "config.h"
21
22 #include <string.h>
23
24 #include <glib.h>
25 #include <gio/gio.h>
26
27 #include <libtracker-fts/tracker-parser.h>
28
/* Note
 * libtracker-fts currently provides three different parser implementations:
 *  - GNU libunistring-based parser: up to 20% faster than the others, and
 *    fully Unicode compliant.
 *  - libicu-based parser: fully Unicode compliant, but slower because it
 *    needs conversions to/from UChars (UTF-16 encoded strings).
 *  - glib/pango parser: not fully Unicode compliant, as it doesn't work
 *    properly with decomposed (NFD normalized) strings, doesn't do
 *    Unicode-based word-breaking and doesn't do full-word casefolding.
 *
 * Some of the tests are therefore DISABLED for the glib/pango parser:
 * FULL_UNICODE_TESTS is only defined when one of the fully compliant
 * parsers is being built.
 */
#undef FULL_UNICODE_TESTS
#if defined (HAVE_LIBUNISTRING) || defined (HAVE_LIBICU)
#define FULL_UNICODE_TESTS
#endif
45
46 /* -------------- COMMON FOR ALL TESTS ----------------- */
47
48 /* Fixture object type */
49 typedef struct {
50 /* The parser object */
51 TrackerParser *parser;
52
53 /* Default parser configuration to use */
54 gint max_word_length;
55 gboolean enable_stemmer;
56 gboolean enable_unaccent;
57 gboolean ignore_stop_words;
58 gboolean ignore_reserved_words;
59 gboolean ignore_numbers;
60 } TrackerParserTestFixture;
61
62 /* Common setup for all tests */
63 static void
64 test_common_setup (TrackerParserTestFixture *fixture,
65 gconstpointer data)
66 {
67 TrackerLanguage *language;
68
69 /* Setup language for parser. We make sure that always English is used
70 * in the unit tests, because we want the English stemming method to
71 * be used. */
72 language = tracker_language_new ("en");
73 if (!language) {
74 g_critical ("Language setup failed!");
75 return;
76 }
77
78 /* Default conf parameters */
79 fixture->max_word_length = 50;
80 fixture->enable_stemmer = TRUE;
81 fixture->enable_unaccent = TRUE;
82 fixture->ignore_stop_words = TRUE;
83 fixture->ignore_reserved_words = TRUE;
84 fixture->ignore_numbers = TRUE;
85
86 /* Create the parser */
87 fixture->parser = tracker_parser_new (language);
88 if (!fixture->parser) {
89 g_critical ("Parser creation failed!");
90 return;
91 }
92
93 g_object_unref (language);
94 }
95
96 /* Common teardown for all tests */
97 static void
98 test_common_teardown (TrackerParserTestFixture *fixture,
99 gconstpointer data)
100 {
101 if (fixture->parser) {
102 tracker_parser_free (fixture->parser);
103 }
104 }
105
106 /* -------------- EXPECTED NUMBER OF WORDS TESTS ----------------- */
107
108 /* Test struct for the expected-nwords tests */
109 typedef struct TestDataExpectedNWords TestDataExpectedNWords;
110 struct TestDataExpectedNWords {
111 const gchar *str;
112 gboolean ignore_numbers;
113 guint expected_nwords;
114 };
115
116 /* Common expected_word test method */
117 static void
118 expected_nwords_check (TrackerParserTestFixture *fixture,
119 gconstpointer data)
120 {
121 const TestDataExpectedNWords *testdata = data;
122 gint position;
123 gint byte_offset_start;
124 gint byte_offset_end;
125 gboolean stop_word;
126 gint word_length;
127 guint nwords = 0;
128
129 /* Reset the parser with the test string */
130 tracker_parser_reset (fixture->parser,
131 testdata->str,
132 strlen (testdata->str),
133 fixture->max_word_length,
134 fixture->enable_stemmer,
135 fixture->enable_unaccent,
136 fixture->ignore_stop_words,
137 fixture->ignore_reserved_words,
138 testdata->ignore_numbers);
139
140 /* Count number of output words */
141 while (tracker_parser_next (fixture->parser,
142 &position,
143 &byte_offset_start,
144 &byte_offset_end,
145 &stop_word,
146 &word_length)) {
147 nwords++;
148 }
149
150 /* Check if input is same as expected */
151 g_assert_cmpuint (nwords, == , testdata->expected_nwords);
152 }
153
154 /* -------------- EXPECTED WORD TESTS ----------------- */
155
156 /* Test struct for the expected-word tests */
157 typedef struct TestDataExpectedWord TestDataExpectedWord;
158 struct TestDataExpectedWord {
159 const gchar *str;
160 const gchar *expected;
161 gboolean enable_stemmer;
162 gboolean enable_unaccent;
163 };
164
165 /* Common expected_word test method */
166 static void
167 expected_word_check (TrackerParserTestFixture *fixture,
168 gconstpointer data)
169 {
170 const TestDataExpectedWord *testdata = data;
171 const gchar *word;
172 gchar *expected_nfkd;
173 gint position;
174 gint byte_offset_start;
175 gint byte_offset_end;
176 gboolean stop_word;
177 gint word_length;
178
179 /* Reset the parser with our string */
180 tracker_parser_reset (fixture->parser,
181 testdata->str,
182 strlen (testdata->str),
183 fixture->max_word_length,
184 testdata->enable_stemmer,
185 testdata->enable_unaccent,
186 fixture->ignore_stop_words,
187 fixture->ignore_reserved_words,
188 fixture->ignore_numbers);
189
190 /* Process next word */
191 word = tracker_parser_next (fixture->parser,
192 &position,
193 &byte_offset_start,
194 &byte_offset_end,
195 &stop_word,
196 &word_length);
197
198 /* Expected word MUST always be in NFKD normalization */
199 expected_nfkd = g_utf8_normalize (testdata->expected,
200 -1,
201 G_NORMALIZE_NFKD);
202
203 /* Check if input is same as expected */
204 g_assert_cmpstr (word, == , expected_nfkd);
205
206 g_free (expected_nfkd);
207 }
208
209 /* -------------- STOP WORD TESTS ----------------- */
210
211 /* Test struct for the stop-word tests */
212 typedef struct TestDataStopWord TestDataStopWord;
213 struct TestDataStopWord {
214 const gchar *str;
215 gboolean ignore_stop_words;
216 gboolean is_expected_stop_word;
217 };
218
219 /* Common stop__word test method */
220 static void
221 stop_word_check (TrackerParserTestFixture *fixture,
222 gconstpointer data)
223 {
224 const TestDataStopWord *testdata = data;
225 gint position;
226 gint byte_offset_start;
227 gint byte_offset_end;
228 gboolean stop_word;
229 gint word_length;
230
231 /* Reset the parser with our string */
232 tracker_parser_reset (fixture->parser,
233 testdata->str,
234 strlen (testdata->str),
235 fixture->max_word_length,
236 fixture->enable_stemmer,
237 fixture->enable_unaccent,
238 testdata->ignore_stop_words,
239 fixture->ignore_reserved_words,
240 fixture->ignore_numbers);
241
242 /* Process next word */
243 tracker_parser_next (fixture->parser,
244 &position,
245 &byte_offset_start,
246 &byte_offset_end,
247 &stop_word,
248 &word_length);
249
250 /* Check if input is same as stop_word */
251 g_assert_cmpuint (stop_word, == , testdata->is_expected_stop_word);
252 }
253
254 /* -------------- LIST OF TESTS ----------------- */
255
256 /* Normalization-related tests (unaccenting) */
257 static const TestDataExpectedWord test_data_normalization[] = {
258 { "école", "ecole", FALSE, TRUE },
259 { "ÉCOLE", "ecole", FALSE, TRUE },
260 { "École", "ecole", FALSE, TRUE },
261 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't like NFD strings */
262 { "e" "\xCC\x81" "cole", "ecole", FALSE, TRUE },
263 { "E" "\xCC\x81" "COLE", "ecole", FALSE, TRUE },
264 { "E" "\xCC\x81" "cole", "ecole", FALSE, TRUE },
265 #endif
266 { NULL, NULL, FALSE, FALSE }
267 };
268
269 /* Unaccenting-related tests */
270 static const TestDataExpectedWord test_data_unaccent[] = {
271 { "Murciélago", "murcielago", FALSE, TRUE },
272 { "camión", "camion", FALSE, TRUE },
273 { "desagüe", "desague", FALSE, TRUE },
274 { "Ὰ", "α", FALSE, TRUE }, /* greek capital alpha with U+0300, composed */
275 { "ὰ", "α", FALSE, TRUE }, /* greek small alpha with U+0300, composed */
276 { "Ὶ", "ι", FALSE, TRUE }, /* greek capital iotta with U+0300, composed */
277 { "ὶ", "ι", FALSE, TRUE }, /* greek small iotta with U+0300, composed */
278 { "Ὼ", "ω", FALSE, TRUE }, /* greek capital omega with U+0300, composed */
279 { "ὼ", "ω", FALSE, TRUE }, /* greek small omega with U+0300, composed */
280 #ifdef FULL_UNICODE_TESTS /* glib/pango does not like NFD strings */
281 { "Ὰ", "α", FALSE, TRUE }, /* capital alpha with U+0300, decomposed */
282 { "ὰ", "α", FALSE, TRUE }, /* small alpha with U+0300, decomposed */
283 { "Ὶ", "ι", FALSE, TRUE }, /* capital iotta with U+0300, decomposed */
284 { "ὶ", "ι", FALSE, TRUE }, /* small iotta with U+0300, decomposed */
285 { "Ὼ", "ω", FALSE, TRUE }, /* capital omega with U+0300, decomposed */
286 { "ὼ", "ω", FALSE, TRUE }, /* small omega with U+0300, decomposed */
287 { "aN͡Ga", "anga", FALSE, TRUE }, /* 0x0361 affects to two characters */
288 { "aNG͡a", "anga", FALSE, TRUE }, /* 0x0361 affects to two characters */
289 #endif
290 { "Murciélago", "murciélago", FALSE, FALSE },
291 { "camión", "camión", FALSE, FALSE },
292 { "desagüe", "desagüe", FALSE, FALSE },
293 { NULL, NULL, FALSE, FALSE }
294 };
295
296 /* Stemming-related tests */
297 static const TestDataExpectedWord test_data_stemming[] = {
298 { "ecole", "ecol", TRUE, TRUE },
299 { "ecole", "ecole", FALSE, TRUE },
300 { NULL, NULL, FALSE, FALSE }
301 };
302
303 /* Casefolding-related tests */
304 static const TestDataExpectedWord test_data_casefolding[] = {
305 { "gross", "gross", FALSE, TRUE },
306 { "GROSS", "gross", FALSE, TRUE },
307 { "GrOsS", "gross", FALSE, TRUE },
308 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't do full-word casefolding */
309 { "groß", "gross", FALSE, TRUE },
310 #endif
311 { NULL, NULL, FALSE, FALSE }
312 };
313
314 /* Number of expected words tests */
315 static const TestDataExpectedNWords test_data_nwords[] = {
316 #ifdef FULL_UNICODE_TESTS /* glib/pango assumes ' is a word breaker */
317 { "The quick (\"brown\") fox can’t jump 32.3 feet, right?", TRUE, 8 },
318 { "The quick (\"brown\") fox can’t jump 32.3 feet, right?", FALSE, 10 },
319 #endif
320 /* Note: as of 0.9.15, the dot is always a word breaker, even between
321 * numbers. */
322 { "filename.txt", TRUE, 2 },
323 { ".hidden.txt", TRUE, 2 },
324 { "noextension.", TRUE, 1 },
325 { "ホモ・サピエンス", TRUE, 2 }, /* katakana */
326 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't work properly with chinese */
327 { "本州最主流的风味", TRUE, 8 }, /* chinese */
328 #endif
329 { "Американские суда находятся в международных водах.", TRUE, 6 }, /* russian */
330 #ifdef FULL_UNICODE_TESTS /* glib/pango doesn't work properly with chinese */
331 { "Bần chỉ là một anh nghèo xác", TRUE, 7 }, /* vietnamese */
332 { "ホモ・サピエンス 本州最主流的风味 katakana, chinese, english", TRUE, 13 }, /* mixed */
333 #endif
334 { NULL, FALSE, 0 }
335 };
336
337 /* Stop-word tests (for english only) */
338 static const TestDataStopWord test_data_stop_words[] = {
339 { "hello", TRUE, TRUE }, /* hello is stop word */
340 { "hello", FALSE, FALSE },
341 { "world", TRUE, FALSE }, /* world is not stop word */
342 { "world", FALSE, FALSE },
343 { NULL, FALSE, FALSE }
344 };
345
346 int
347 main (int argc, char **argv)
348 {
349 gint i;
350
351 g_test_init (&argc, &argv, NULL);
352
353 /* We want the tests to properly find the stopwords dictionaries, so we
354 * need to set the following envvar with the path where the
355 * dictionaries are. */
356 g_setenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR",
357 TOP_SRCDIR "/data/languages",
358 TRUE);
359
360 /* Add normalization checks */
361 for (i = 0; test_data_normalization[i].str != NULL; i++) {
362 gchar *testpath;
363
364 testpath = g_strdup_printf ("/libtracker-fts/parser/normalization_%d", i);
365 g_test_add (testpath,
366 TrackerParserTestFixture,
367 &test_data_normalization[i],
368 test_common_setup,
369 expected_word_check,
370 test_common_teardown);
371 g_free (testpath);
372 }
373
374 #ifdef HAVE_UNAC
375 /* Add unaccent checks */
376 for (i = 0; test_data_unaccent[i].str != NULL; i++) {
377 gchar *testpath;
378
379 testpath = g_strdup_printf ("/libtracker-fts/parser/unaccent_%d", i);
380 g_test_add (testpath,
381 TrackerParserTestFixture,
382 &test_data_unaccent[i],
383 test_common_setup,
384 expected_word_check,
385 test_common_teardown);
386 g_free (testpath);
387 }
388 #endif
389
390 /* Add casefolding checks */
391 for (i = 0; test_data_casefolding[i].str != NULL; i++) {
392 gchar *testpath;
393
394 testpath = g_strdup_printf ("/libtracker-fts/parser/casefolding_%d", i);
395 g_test_add (testpath,
396 TrackerParserTestFixture,
397 &test_data_casefolding[i],
398 test_common_setup,
399 expected_word_check,
400 test_common_teardown);
401 g_free (testpath);
402 }
403
404 /* Add stemming checks */
405 for (i = 0; test_data_stemming[i].str != NULL; i++) {
406 gchar *testpath;
407
408 testpath = g_strdup_printf ("/libtracker-fts/parser/stemming_%d", i);
409 g_test_add (testpath,
410 TrackerParserTestFixture,
411 &test_data_stemming[i],
412 test_common_setup,
413 expected_word_check,
414 test_common_teardown);
415 g_free (testpath);
416 }
417
418 /* Add expected number of words checks */
419 for (i = 0; test_data_nwords[i].str != NULL; i++) {
420 gchar *testpath;
421
422 testpath = g_strdup_printf ("/libtracker-fts/parser/nwords_%d", i);
423 g_test_add (testpath,
424 TrackerParserTestFixture,
425 &test_data_nwords[i],
426 test_common_setup,
427 expected_nwords_check,
428 test_common_teardown);
429 g_free (testpath);
430 }
431
432 /* Add stop word checks */
433 for (i = 0; test_data_stop_words[i].str != NULL; i++) {
434 gchar *testpath;
435
436 testpath = g_strdup_printf ("/libtracker-fts/parser/stop_words_%d", i);
437 g_test_add (testpath,
438 TrackerParserTestFixture,
439 &test_data_stop_words[i],
440 test_common_setup,
441 stop_word_check,
442 test_common_teardown);
443 g_free (testpath);
444 }
445
446 return g_test_run ();
447 }