No issues found
1 /*
2 * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
3 * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the
17 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21 #include "config.h"
22
23 #include <string.h>
24
25 #include <glib.h>
26
27 #include <libstemmer/libstemmer.h>
28
29 #include "tracker-log.h"
30 #include "tracker-language.h"
31
32 #define GET_PRIV(obj) (G_TYPE_INSTANCE_GET_PRIVATE ((obj), TRACKER_TYPE_LANGUAGE, TrackerLanguagePriv))
33
34 typedef struct _TrackerLanguagePriv TrackerLanguagePriv;
35 typedef struct _Languages Languages;
36
37 struct _TrackerLanguagePriv {
38 GHashTable *stop_words;
39 gboolean enable_stemmer;
40 gchar *language_code;
41
42 #if GLIB_CHECK_VERSION (2,31,0)
43 GMutex stemmer_mutex;
44 #else
45 GMutex *stemmer_mutex;
46 #endif
47 gpointer stemmer;
48 };
49
50 struct _Languages {
51 const gchar *code;
52 const gchar *name;
53 };
54
55 static Languages all_langs[] = {
56 { "da", "Danish" },
57 { "nl", "Dutch" },
58 { "en", "English" },
59 { "fi", "Finnish" },
60 { "fr", "French" },
61 { "de", "German" },
62 { "hu", "Hungarian" },
63 { "it", "Italian" },
64 { "nb", "Norwegian" },
65 { "pt", "Portuguese" },
66 { "ru", "Russian" },
67 { "es", "Spanish" },
68 { "sv", "Swedish" },
69 { NULL, NULL },
70 };
71
/* Identifiers of the GObject properties installed by
 * tracker_language_class_init(). PROP_0 is a placeholder that no
 * installed property uses. */
enum {
	PROP_0 = 0,

	PROP_ENABLE_STEMMER = 1,
	PROP_STOP_WORDS = 2,
	PROP_LANGUAGE_CODE = 3
};
80
81 static void language_finalize (GObject *object);
82 static void language_get_property (GObject *object,
83 guint param_id,
84 GValue *value,
85 GParamSpec *pspec);
86 static void language_set_property (GObject *object,
87 guint param_id,
88 const GValue *value,
89 GParamSpec *pspec);
90
91 G_DEFINE_TYPE (TrackerLanguage, tracker_language, G_TYPE_OBJECT);
92
93 static void
94 tracker_language_class_init (TrackerLanguageClass *klass)
95 {
96 GObjectClass *object_class = G_OBJECT_CLASS (klass);
97
98 object_class->finalize = language_finalize;
99 object_class->get_property = language_get_property;
100 object_class->set_property = language_set_property;
101
102 g_object_class_install_property (object_class,
103 PROP_ENABLE_STEMMER,
104 g_param_spec_boolean ("enable-stemmer",
105 "Enable stemmer",
106 "Enable stemmer",
107 TRUE,
108 G_PARAM_WRITABLE | G_PARAM_CONSTRUCT));
109
110 g_object_class_install_property (object_class,
111 PROP_STOP_WORDS,
112 g_param_spec_boxed ("stop-words",
113 "Stop words",
114 "Stop words",
115 g_hash_table_get_type (),
116 G_PARAM_READABLE));
117
118 g_object_class_install_property (object_class,
119 PROP_LANGUAGE_CODE,
120 g_param_spec_string ("language-code",
121 "Language code",
122 "Language code",
123 "en",
124 G_PARAM_WRITABLE | G_PARAM_CONSTRUCT));
125
126 g_type_class_add_private (object_class, sizeof (TrackerLanguagePriv));
127 }
128
129 static void
130 tracker_language_init (TrackerLanguage *language)
131 {
132 TrackerLanguagePriv *priv;
133 const gchar *stem_language;
134
135 priv = GET_PRIV (language);
136
137 priv->stop_words = g_hash_table_new_full (g_str_hash,
138 g_str_equal,
139 g_free,
140 NULL);
141 #if GLIB_CHECK_VERSION (2,31,0)
142 g_mutex_init (&priv->stemmer_mutex);
143 #else
144 priv->stemmer_mutex = g_mutex_new ();
145 #endif
146
147 stem_language = tracker_language_get_name_by_code (NULL);
148 priv->stemmer = sb_stemmer_new (stem_language, NULL);
149 }
150
151 static void
152 language_finalize (GObject *object)
153 {
154 TrackerLanguagePriv *priv;
155
156 priv = GET_PRIV (object);
157
158 #if GLIB_CHECK_VERSION (2,31,0)
159 if (priv->stemmer) {
160 g_mutex_lock (&priv->stemmer_mutex);
161 sb_stemmer_delete (priv->stemmer);
162 g_mutex_unlock (&priv->stemmer_mutex);
163 }
164 g_mutex_clear (&priv->stemmer_mutex);
165 #else
166 if (priv->stemmer) {
167 g_mutex_lock (priv->stemmer_mutex);
168 sb_stemmer_delete (priv->stemmer);
169 g_mutex_unlock (priv->stemmer_mutex);
170 }
171 g_mutex_free (priv->stemmer_mutex);
172 #endif
173
174 if (priv->stop_words) {
175 g_hash_table_unref (priv->stop_words);
176 }
177
178 g_free (priv->language_code);
179
180 (G_OBJECT_CLASS (tracker_language_parent_class)->finalize) (object);
181 }
182
183 static void
184 language_get_property (GObject *object,
185 guint param_id,
186 GValue *value,
187 GParamSpec *pspec)
188 {
189 TrackerLanguagePriv *priv;
190
191 priv = GET_PRIV (object);
192
193 switch (param_id) {
194 case PROP_ENABLE_STEMMER:
195 g_value_set_boolean (value, priv->enable_stemmer);
196 break;
197 case PROP_STOP_WORDS:
198 g_value_set_boxed (value, priv->stop_words);
199 break;
200 case PROP_LANGUAGE_CODE:
201 g_value_set_string (value, priv->language_code);
202 break;
203
204 default:
205 G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
206 break;
207 };
208 }
209
210 static void
211 language_set_property (GObject *object,
212 guint param_id,
213 const GValue *value,
214 GParamSpec *pspec)
215 {
216 switch (param_id) {
217 case PROP_ENABLE_STEMMER:
218 tracker_language_set_enable_stemmer (TRACKER_LANGUAGE (object),
219 g_value_get_boolean (value));
220 break;
221 case PROP_LANGUAGE_CODE:
222 tracker_language_set_language_code (TRACKER_LANGUAGE (object),
223 g_value_get_string (value));
224 break;
225 default:
226 G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
227 break;
228 };
229 }
230
231 static gchar *
232 language_get_stopword_filename (const gchar *language_code)
233 {
234 gchar *str;
235 gchar *filename;
236 const gchar *testpath;
237
238 str = g_strconcat ("stopwords.", language_code, NULL);
239
240 /* Look if the testpath for stopwords dictionary was set
241 * (used during unit tests) */
242 testpath = g_getenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR");
243 if (!testpath) {
244 filename = g_build_filename (SHAREDIR,
245 "tracker",
246 "languages",
247 str,
248 NULL);
249 } else {
250 filename = g_build_filename (testpath,
251 str,
252 NULL);
253 }
254
255 g_free (str);
256 return filename;
257 }
258
259 static void
260 language_add_stopwords (TrackerLanguage *language,
261 const gchar *filename)
262 {
263 TrackerLanguagePriv *priv;
264 GMappedFile *mapped_file;
265 GError *error = NULL;
266 gchar *content;
267 gchar **words, **p;
268
269 priv = GET_PRIV (language);
270
271 mapped_file = g_mapped_file_new (filename, FALSE, &error);
272 if (error) {
273 g_message ("Tracker couldn't read stopword file:'%s', %s",
274 filename, error->message);
275 g_clear_error (&error);
276 return;
277 }
278
279 content = g_mapped_file_get_contents (mapped_file);
280 words = g_strsplit_set (content, "\n" , -1);
281
282 #if GLIB_CHECK_VERSION(2,22,0)
283 g_mapped_file_unref (mapped_file);
284 #else
285 g_mapped_file_free (mapped_file);
286 #endif
287
288 /* FIXME: Shouldn't clear the hash table first? */
289 for (p = words; *p; p++) {
290 g_hash_table_insert (priv->stop_words,
291 g_strdup (g_strstrip (*p)),
292 GINT_TO_POINTER (1));
293 }
294
295 g_strfreev (words);
296 }
297
298 static void
299 language_set_stopword_list (TrackerLanguage *language,
300 const gchar *language_code)
301 {
302 TrackerLanguagePriv *priv;
303 gchar *stopword_filename;
304 gchar *stem_language_lower;
305 const gchar *stem_language;
306
307 g_return_if_fail (TRACKER_IS_LANGUAGE (language));
308
309 priv = GET_PRIV (language);
310
311 /* Set up stopwords list */
312 /* g_message ("Setting up stopword list for language code:'%s'", language_code); */
313
314 stopword_filename = language_get_stopword_filename (language_code);
315 language_add_stopwords (language, stopword_filename);
316 g_free (stopword_filename);
317
318 if (!language_code || strcmp (language_code, "en") != 0) {
319 stopword_filename = language_get_stopword_filename ("en");
320 language_add_stopwords (language, stopword_filename);
321 g_free (stopword_filename);
322 }
323
324 /* g_message ("Setting up stemmer for language code:'%s'", language_code); */
325
326 stem_language = tracker_language_get_name_by_code (language_code);
327 stem_language_lower = g_ascii_strdown (stem_language, -1);
328
329 #if GLIB_CHECK_VERSION (2,31,0)
330 g_mutex_lock (&priv->stemmer_mutex);
331 #else
332 g_mutex_lock (priv->stemmer_mutex);
333 #endif
334
335 if (priv->stemmer) {
336 sb_stemmer_delete (priv->stemmer);
337 }
338
339 priv->stemmer = sb_stemmer_new (stem_language_lower, NULL);
340 if (!priv->stemmer) {
341 g_message ("No stemmer could be found for language:'%s'",
342 stem_language_lower);
343 }
344
345 #if GLIB_CHECK_VERSION (2,31,0)
346 g_mutex_unlock (&priv->stemmer_mutex);
347 #else
348 g_mutex_unlock (priv->stemmer_mutex);
349 #endif
350
351 g_free (stem_language_lower);
352 }
353
354 /**
355 * tracker_language_new:
356 * @language_code: language code in ISO 639-1 format
357 *
358 * Creates a new #TrackerLanguage instance for the passed language code.
359 *
360 * Returns: a newly created #TrackerLanguage
361 **/
362 TrackerLanguage *
363 tracker_language_new (const gchar *language_code)
364 {
365 TrackerLanguage *language;
366
367 language = g_object_new (TRACKER_TYPE_LANGUAGE,
368 "language-code", language_code,
369 NULL);
370
371 return language;
372 }
373
374 /**
375 * tracker_language_get_enable_stemmer:
376 * @language: a #TrackerLanguage
377 *
378 * Returns whether words stemming is enabled for @language.
379 *
380 * Returns: %TRUE if word stemming is enabled.
381 **/
382 gboolean
383 tracker_language_get_enable_stemmer (TrackerLanguage *language)
384 {
385 TrackerLanguagePriv *priv;
386
387 g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), TRUE);
388
389 priv = GET_PRIV (language);
390
391 return priv->enable_stemmer;
392 }
393
394 /**
395 * tracker_language_get_stop_words:
396 * @language: a #TrackerLanguage
397 *
398 * Returns the stop words for @language. Stop words are really common
399 * words that are not worth to index for the language handled by @language.
400 *
401 * Returns: A #GHashTable with the stop words as the value, this memory
402 * is owned by @language and should not be modified nor freed.
403 **/
404 GHashTable *
405 tracker_language_get_stop_words (TrackerLanguage *language)
406 {
407 TrackerLanguagePriv *priv;
408
409 g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
410
411 priv = GET_PRIV (language);
412
413 return priv->stop_words;
414 }
415
416 /**
417 * tracker_language_is_stop_word:
418 * @language: a #TrackerLanguage
419 * @word: a string containing a word
420 *
421 * Returns %TRUE if the given @word is in the list of stop words of the
422 * given @language.
423 *
424 * Returns: %TRUE if @word is a stop word. %FALSE otherwise.
425 */
426 gboolean
427 tracker_language_is_stop_word (TrackerLanguage *language,
428 const gchar *word)
429 {
430 TrackerLanguagePriv *priv;
431
432 g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), FALSE);
433 g_return_val_if_fail (word, FALSE);
434
435 priv = GET_PRIV (language);
436
437 return g_hash_table_lookup (priv->stop_words, word) != NULL;
438 }
439
440 /**
441 * tracker_language_get_language_code:
442 * @language: a #TrackerLanguage
443 *
444 * Returns the language code in ISO 639-1 handled by @language.
445 *
446 * Returns: the language code.
447 **/
448 const gchar *
449 tracker_language_get_language_code (TrackerLanguage *language)
450 {
451 TrackerLanguagePriv *priv;
452
453 g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
454
455 priv = GET_PRIV (language);
456
457 return priv->language_code;
458 }
459
460 /**
461 * tracker_language_set_enable_stemmer:
462 * @language: a #TrackerLanguage
463 * @value: %TRUE to enable word stemming
464 *
465 * Enables or disables word stemming for @language.
466 **/
467 void
468 tracker_language_set_enable_stemmer (TrackerLanguage *language,
469 gboolean value)
470 {
471 TrackerLanguagePriv *priv;
472
473 g_return_if_fail (TRACKER_IS_LANGUAGE (language));
474
475 priv = GET_PRIV (language);
476
477 priv->enable_stemmer = value;
478
479 g_object_notify (G_OBJECT (language), "enable-stemmer");
480 }
481
482 /**
483 * tracker_language_set_language_code:
484 * @language: a #TrackerLanguage
485 * @language_code: an ISO 639-1 language code
486 *
487 * Sets the @language to @language_code, a %NULL value will reset this
488 * to "en" (English).
489 **/
490 void
491 tracker_language_set_language_code (TrackerLanguage *language,
492 const gchar *language_code)
493 {
494 TrackerLanguagePriv *priv;
495
496 g_return_if_fail (TRACKER_IS_LANGUAGE (language));
497
498 priv = GET_PRIV (language);
499
500 g_free (priv->language_code);
501
502 priv->language_code = g_strdup (language_code);
503
504 if (!priv->language_code) {
505 priv->language_code = g_strdup ("en");
506 }
507
508 language_set_stopword_list (language, priv->language_code);
509
510 g_object_notify (G_OBJECT (language), "language-code");
511 }
512
513 /**
514 * tracker_language_stem_word:
515 * @language: a #TrackerLanguage
516 * @word: string pointing to a word
517 * @word_length: word ascii length
518 *
519 * If the stemmer is enabled, it will return the stem word for @word.
520 * If it's disabled, it will return the passed word.
521 *
522 * Returns: a string with the processed word. This string must be
523 * freed with g_free()
524 **/
525 gchar *
526 tracker_language_stem_word (TrackerLanguage *language,
527 const gchar *word,
528 gint word_length)
529 {
530 TrackerLanguagePriv *priv;
531 const gchar *stem_word;
532
533 g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
534
535 if (word_length < 0) {
536 word_length = strlen (word);
537 }
538
539 priv = GET_PRIV (language);
540
541 if (!priv->enable_stemmer) {
542 return g_strndup (word, word_length);
543 }
544
545 #if GLIB_CHECK_VERSION (2,31,0)
546 g_mutex_lock (&priv->stemmer_mutex);
547 #else
548 g_mutex_lock (priv->stemmer_mutex);
549 #endif
550
551 stem_word = (const gchar*) sb_stemmer_stem (priv->stemmer,
552 (guchar*) word,
553 word_length);
554
555 #if GLIB_CHECK_VERSION (2,31,0)
556 g_mutex_unlock (&priv->stemmer_mutex);
557 #else
558 g_mutex_unlock (priv->stemmer_mutex);
559 #endif
560
561 return g_strdup (stem_word);
562 }
563
564 /**
565 * tracker_language_get_name_by_code:
566 * @language_code: a ISO 639-1 language code.
567 *
568 * Returns a human readable language name for the given
569 * ISO 639-1 code, if supported by #TrackerLanguage
570 *
571 * Returns: the language name.
572 **/
573 const gchar *
574 tracker_language_get_name_by_code (const gchar *language_code)
575 {
576 gint i;
577
578 if (!language_code || language_code[0] == '\0') {
579 return "english";
580 }
581
582 for (i = 0; all_langs[i].code; i++) {
583 if (g_str_has_prefix (language_code, all_langs[i].code)) {
584 return all_langs[i].name;
585 }
586 }
587
588 return "";
589 }