tracker-0.16.2/src/libtracker-common/tracker-language.c

No issues found

  1 /*
  2  * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
  3  * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
  4  *
  5  * This library is free software; you can redistribute it and/or
  6  * modify it under the terms of the GNU Lesser General Public
  7  * License as published by the Free Software Foundation; either
  8  * version 2.1 of the License, or (at your option) any later version.
  9  *
 10  * This library is distributed in the hope that it will be useful,
 11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13  * Lesser General Public License for more details.
 14  *
 15  * You should have received a copy of the GNU Lesser General Public
 16  * License along with this library; if not, write to the
 17  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 18  * Boston, MA  02110-1301, USA.
 19  */
 20 
 21 #include "config.h"
 22 
 23 #include <string.h>
 24 
 25 #include <glib.h>
 26 
 27 #include <libstemmer/libstemmer.h>
 28 
 29 #include "tracker-log.h"
 30 #include "tracker-language.h"
 31 
/* Fetch the TrackerLanguagePriv instance-private data registered for
 * TRACKER_TYPE_LANGUAGE on @obj. */
#define GET_PRIV(obj) (G_TYPE_INSTANCE_GET_PRIVATE ((obj), TRACKER_TYPE_LANGUAGE, TrackerLanguagePriv))
 33 
typedef struct _TrackerLanguagePriv TrackerLanguagePriv;
typedef struct _Languages           Languages;

/* Per-instance private state of a TrackerLanguage. */
struct _TrackerLanguagePriv {
	/* Stop words: keys are heap-allocated strings released with g_free();
	 * values are a non-NULL marker only. */
	GHashTable    *stop_words;
	/* When FALSE, tracker_language_stem_word() returns a plain copy of
	 * its input instead of calling the stemmer. */
	gboolean       enable_stemmer;
	/* Owned copy of the language code; released in language_finalize(). */
	gchar         *language_code;

	/* Serialises access to @stemmer. GLib >= 2.31 embeds the mutex in the
	 * structure; older versions allocate it with g_mutex_new(). */
#if GLIB_CHECK_VERSION (2,31,0)
	GMutex         stemmer_mutex;
#else
	GMutex        *stemmer_mutex;
#endif
	/* Handle returned by sb_stemmer_new(); NULL when no stemmer could be
	 * created for the current language. */
	gpointer       stemmer;
};
 49 
/* One entry of the supported-language table: a language code and the
 * language name it maps to. */
struct _Languages {
	const gchar *code;
	const gchar *name;
};
 54 
/* Table of supported languages, terminated by a { NULL, NULL } sentinel.
 * tracker_language_get_name_by_code() scans it in order and returns the
 * name of the first entry whose code is a prefix of the requested code;
 * language_set_stopword_list() lower-cases that name before handing it
 * to sb_stemmer_new(). */
static Languages all_langs[] = {
	{ "da", "Danish" },
	{ "nl", "Dutch" },
	{ "en", "English" },
	{ "fi", "Finnish" },
	{ "fr", "French" },
	{ "de", "German" },
	{ "hu", "Hungarian" },
	{ "it", "Italian" },
	{ "nb", "Norwegian" },
	{ "pt", "Portuguese" },
	{ "ru", "Russian" },
	{ "es", "Spanish" },
	{ "sv", "Swedish" },
	{ NULL, NULL },
};
 71 
/* GObject property identifiers, used when installing the properties in
 * tracker_language_class_init() and dispatched on in
 * language_get_property() / language_set_property(). */
enum {
	/* Placeholder so that the real property IDs start at 1. */
	PROP_0,

	PROP_ENABLE_STEMMER,  /* "enable-stemmer" (boolean) */
	PROP_STOP_WORDS,      /* "stop-words" (boxed GHashTable) */
	PROP_LANGUAGE_CODE,   /* "language-code" (string) */
};
 80 
/* Forward declarations of the GObject virtual method implementations that
 * tracker_language_class_init() installs on the class. */
static void         language_finalize          (GObject       *object);
static void         language_get_property      (GObject       *object,
                                                guint          param_id,
                                                GValue        *value,
                                                GParamSpec    *pspec);
static void         language_set_property      (GObject       *object,
                                                guint          param_id,
                                                const GValue  *value,
                                                GParamSpec    *pspec);

/* Registers TrackerLanguage as a subclass of GObject; this wires up
 * tracker_language_class_init() and tracker_language_init() below and
 * provides tracker_language_parent_class for chaining up. */
G_DEFINE_TYPE (TrackerLanguage, tracker_language, G_TYPE_OBJECT);
 92 
 93 static void
 94 tracker_language_class_init (TrackerLanguageClass *klass)
 95 {
 96 	GObjectClass *object_class = G_OBJECT_CLASS (klass);
 97 
 98 	object_class->finalize     = language_finalize;
 99 	object_class->get_property = language_get_property;
100 	object_class->set_property = language_set_property;
101 
102 	g_object_class_install_property (object_class,
103 	                                 PROP_ENABLE_STEMMER,
104 	                                 g_param_spec_boolean ("enable-stemmer",
105 	                                                       "Enable stemmer",
106 	                                                       "Enable stemmer",
107 	                                                       TRUE,
108 	                                                       G_PARAM_WRITABLE | G_PARAM_CONSTRUCT));
109 
110 	g_object_class_install_property (object_class,
111 	                                 PROP_STOP_WORDS,
112 	                                 g_param_spec_boxed ("stop-words",
113 	                                                     "Stop words",
114 	                                                     "Stop words",
115 	                                                     g_hash_table_get_type (),
116 	                                                     G_PARAM_READABLE));
117 
118 	g_object_class_install_property (object_class,
119 	                                 PROP_LANGUAGE_CODE,
120 	                                 g_param_spec_string ("language-code",
121 	                                                      "Language code",
122 	                                                      "Language code",
123 	                                                      "en",
124 	                                                      G_PARAM_WRITABLE | G_PARAM_CONSTRUCT));
125 
126 	g_type_class_add_private (object_class, sizeof (TrackerLanguagePriv));
127 }
128 
129 static void
130 tracker_language_init (TrackerLanguage *language)
131 {
132 	TrackerLanguagePriv *priv;
133 	const gchar         *stem_language;
134 
135 	priv = GET_PRIV (language);
136 
137 	priv->stop_words = g_hash_table_new_full (g_str_hash,
138 	                                          g_str_equal,
139 	                                          g_free,
140 	                                          NULL);
141 #if GLIB_CHECK_VERSION (2,31,0)
142 	g_mutex_init (&priv->stemmer_mutex);
143 #else
144 	priv->stemmer_mutex = g_mutex_new ();
145 #endif
146 
147 	stem_language = tracker_language_get_name_by_code (NULL);
148 	priv->stemmer = sb_stemmer_new (stem_language, NULL);
149 }
150 
151 static void
152 language_finalize (GObject *object)
153 {
154 	TrackerLanguagePriv *priv;
155 
156 	priv = GET_PRIV (object);
157 
158 #if GLIB_CHECK_VERSION (2,31,0)
159 	if (priv->stemmer) {
160 		g_mutex_lock (&priv->stemmer_mutex);
161 		sb_stemmer_delete (priv->stemmer);
162 		g_mutex_unlock (&priv->stemmer_mutex);
163 	}
164 	g_mutex_clear (&priv->stemmer_mutex);
165 #else
166 	if (priv->stemmer) {
167 		g_mutex_lock (priv->stemmer_mutex);
168 		sb_stemmer_delete (priv->stemmer);
169 		g_mutex_unlock (priv->stemmer_mutex);
170 	}
171 	g_mutex_free (priv->stemmer_mutex);
172 #endif
173 
174 	if (priv->stop_words) {
175 		g_hash_table_unref (priv->stop_words);
176 	}
177 
178 	g_free (priv->language_code);
179 
180 	(G_OBJECT_CLASS (tracker_language_parent_class)->finalize) (object);
181 }
182 
183 static void
184 language_get_property (GObject    *object,
185                        guint       param_id,
186                        GValue     *value,
187                        GParamSpec *pspec)
188 {
189 	TrackerLanguagePriv *priv;
190 
191 	priv = GET_PRIV (object);
192 
193 	switch (param_id) {
194 	case PROP_ENABLE_STEMMER:
195 		g_value_set_boolean (value, priv->enable_stemmer);
196 		break;
197 	case PROP_STOP_WORDS:
198 		g_value_set_boxed (value, priv->stop_words);
199 		break;
200 	case PROP_LANGUAGE_CODE:
201 		g_value_set_string (value, priv->language_code);
202 		break;
203 
204 	default:
205 		G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
206 		break;
207 	};
208 }
209 
210 static void
211 language_set_property (GObject      *object,
212                        guint         param_id,
213                        const GValue *value,
214                        GParamSpec   *pspec)
215 {
216 	switch (param_id) {
217 	case PROP_ENABLE_STEMMER:
218 		tracker_language_set_enable_stemmer (TRACKER_LANGUAGE (object),
219 		                                     g_value_get_boolean (value));
220 		break;
221 	case PROP_LANGUAGE_CODE:
222 		tracker_language_set_language_code (TRACKER_LANGUAGE (object),
223 		                                    g_value_get_string (value));
224 		break;
225 	default:
226 		G_OBJECT_WARN_INVALID_PROPERTY_ID (object, param_id, pspec);
227 		break;
228 	};
229 }
230 
231 static gchar *
232 language_get_stopword_filename (const gchar *language_code)
233 {
234 	gchar *str;
235 	gchar *filename;
236 	const gchar *testpath;
237 
238 	str = g_strconcat ("stopwords.", language_code, NULL);
239 
240 	/* Look if the testpath for stopwords dictionary was set
241 	 *  (used during unit tests) */
242 	testpath = g_getenv ("TRACKER_LANGUAGE_STOP_WORDS_DIR");
243 	if (!testpath) {
244 		filename = g_build_filename (SHAREDIR,
245 		                             "tracker",
246 		                             "languages",
247 		                             str,
248 		                             NULL);
249 	} else {
250 		filename = g_build_filename (testpath,
251 		                             str,
252 		                             NULL);
253 	}
254 
255 	g_free (str);
256 	return filename;
257 }
258 
259 static void
260 language_add_stopwords (TrackerLanguage *language,
261                         const gchar     *filename)
262 {
263 	TrackerLanguagePriv  *priv;
264 	GMappedFile          *mapped_file;
265 	GError               *error = NULL;
266 	gchar                *content;
267 	gchar               **words, **p;
268 
269 	priv = GET_PRIV (language);
270 
271 	mapped_file = g_mapped_file_new (filename, FALSE, &error);
272 	if (error) {
273 		g_message ("Tracker couldn't read stopword file:'%s', %s",
274 		           filename, error->message);
275 		g_clear_error (&error);
276 		return;
277 	}
278 
279 	content = g_mapped_file_get_contents (mapped_file);
280 	words = g_strsplit_set (content, "\n" , -1);
281 
282 #if GLIB_CHECK_VERSION(2,22,0)
283 	g_mapped_file_unref (mapped_file);
284 #else
285 	g_mapped_file_free (mapped_file);
286 #endif
287 
288 	/* FIXME: Shouldn't clear the hash table first? */
289 	for (p = words; *p; p++) {
290 		g_hash_table_insert (priv->stop_words,
291 		                     g_strdup (g_strstrip (*p)),
292 		                     GINT_TO_POINTER (1));
293 	}
294 
295 	g_strfreev (words);
296 }
297 
298 static void
299 language_set_stopword_list (TrackerLanguage *language,
300                             const gchar     *language_code)
301 {
302 	TrackerLanguagePriv *priv;
303 	gchar               *stopword_filename;
304 	gchar               *stem_language_lower;
305 	const gchar         *stem_language;
306 
307 	g_return_if_fail (TRACKER_IS_LANGUAGE (language));
308 
309 	priv = GET_PRIV (language);
310 
311 	/* Set up stopwords list */
312 	/* g_message ("Setting up stopword list for language code:'%s'", language_code); */
313 
314 	stopword_filename = language_get_stopword_filename (language_code);
315 	language_add_stopwords (language, stopword_filename);
316 	g_free (stopword_filename);
317 
318 	if (!language_code || strcmp (language_code, "en") != 0) {
319 		stopword_filename = language_get_stopword_filename ("en");
320 		language_add_stopwords (language, stopword_filename);
321 		g_free (stopword_filename);
322 	}
323 
324 	/* g_message ("Setting up stemmer for language code:'%s'", language_code); */
325 
326 	stem_language = tracker_language_get_name_by_code (language_code);
327 	stem_language_lower = g_ascii_strdown (stem_language, -1);
328 
329 #if GLIB_CHECK_VERSION (2,31,0)
330 	g_mutex_lock (&priv->stemmer_mutex);
331 #else
332 	g_mutex_lock (priv->stemmer_mutex);
333 #endif
334 
335 	if (priv->stemmer) {
336 		sb_stemmer_delete (priv->stemmer);
337 	}
338 
339 	priv->stemmer = sb_stemmer_new (stem_language_lower, NULL);
340 	if (!priv->stemmer) {
341 		g_message ("No stemmer could be found for language:'%s'",
342 		           stem_language_lower);
343 	}
344 
345 #if GLIB_CHECK_VERSION (2,31,0)
346 	g_mutex_unlock (&priv->stemmer_mutex);
347 #else
348 	g_mutex_unlock (priv->stemmer_mutex);
349 #endif
350 
351 	g_free (stem_language_lower);
352 }
353 
354 /**
355  * tracker_language_new:
356  * @language_code: language code in ISO 639-1 format
357  *
358  * Creates a new #TrackerLanguage instance for the passed language code.
359  *
360  * Returns: a newly created #TrackerLanguage
361  **/
362 TrackerLanguage *
363 tracker_language_new (const gchar *language_code)
364 {
365 	TrackerLanguage *language;
366 
367 	language = g_object_new (TRACKER_TYPE_LANGUAGE,
368 	                         "language-code", language_code,
369 	                         NULL);
370 
371 	return language;
372 }
373 
374 /**
375  * tracker_language_get_enable_stemmer:
376  * @language: a #TrackerLanguage
377  *
378  * Returns whether words stemming is enabled for @language.
379  *
380  * Returns: %TRUE if word stemming is enabled.
381  **/
382 gboolean
383 tracker_language_get_enable_stemmer (TrackerLanguage *language)
384 {
385 	TrackerLanguagePriv *priv;
386 
387 	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), TRUE);
388 
389 	priv = GET_PRIV (language);
390 
391 	return priv->enable_stemmer;
392 }
393 
394 /**
395  * tracker_language_get_stop_words:
396  * @language: a #TrackerLanguage
397  *
398  * Returns the stop words for @language. Stop words are really common
399  * words that are not worth to index for the language handled by @language.
400  *
401  * Returns: A #GHashTable with the stop words as the value, this memory
402  *          is owned by @language and should not be modified nor freed.
403  **/
404 GHashTable *
405 tracker_language_get_stop_words (TrackerLanguage *language)
406 {
407 	TrackerLanguagePriv *priv;
408 
409 	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
410 
411 	priv = GET_PRIV (language);
412 
413 	return priv->stop_words;
414 }
415 
416 /**
417  * tracker_language_is_stop_word:
418  * @language: a #TrackerLanguage
419  * @word: a string containing a word
420  *
421  * Returns %TRUE if the given @word is in the list of stop words of the
422  *  given @language.
423  *
424  * Returns: %TRUE if @word is a stop word. %FALSE otherwise.
425  */
426 gboolean
427 tracker_language_is_stop_word (TrackerLanguage *language,
428                                const gchar     *word)
429 {
430 	TrackerLanguagePriv *priv;
431 
432 	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), FALSE);
433 	g_return_val_if_fail (word, FALSE);
434 
435 	priv = GET_PRIV (language);
436 
437 	return g_hash_table_lookup (priv->stop_words, word) != NULL;
438 }
439 
440 /**
441  * tracker_language_get_language_code:
442  * @language: a #TrackerLanguage
443  *
444  * Returns the language code in ISO 639-1 handled by @language.
445  *
446  * Returns: the language code.
447  **/
448 const gchar *
449 tracker_language_get_language_code (TrackerLanguage *language)
450 {
451 	TrackerLanguagePriv *priv;
452 
453 	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
454 
455 	priv = GET_PRIV (language);
456 
457 	return priv->language_code;
458 }
459 
460 /**
461  * tracker_language_set_enable_stemmer:
462  * @language: a #TrackerLanguage
463  * @value: %TRUE to enable word stemming
464  *
465  * Enables or disables word stemming for @language.
466  **/
467 void
468 tracker_language_set_enable_stemmer (TrackerLanguage *language,
469                                      gboolean         value)
470 {
471 	TrackerLanguagePriv *priv;
472 
473 	g_return_if_fail (TRACKER_IS_LANGUAGE (language));
474 
475 	priv = GET_PRIV (language);
476 
477 	priv->enable_stemmer = value;
478 
479 	g_object_notify (G_OBJECT (language), "enable-stemmer");
480 }
481 
482 /**
483  * tracker_language_set_language_code:
484  * @language: a #TrackerLanguage
485  * @language_code: an ISO 639-1 language code
486  *
487  * Sets the @language to @language_code, a %NULL value will reset this
488  * to "en" (English).
489  **/
490 void
491 tracker_language_set_language_code (TrackerLanguage *language,
492                                     const gchar     *language_code)
493 {
494 	TrackerLanguagePriv *priv;
495 
496 	g_return_if_fail (TRACKER_IS_LANGUAGE (language));
497 
498 	priv = GET_PRIV (language);
499 
500 	g_free (priv->language_code);
501 
502 	priv->language_code = g_strdup (language_code);
503 
504 	if (!priv->language_code) {
505 		priv->language_code = g_strdup ("en");
506 	}
507 
508 	language_set_stopword_list (language, priv->language_code);
509 
510 	g_object_notify (G_OBJECT (language), "language-code");
511 }
512 
513 /**
514  * tracker_language_stem_word:
515  * @language: a #TrackerLanguage
516  * @word: string pointing to a word
517  * @word_length: word ascii length
518  *
519  * If the stemmer is enabled, it will return the stem word for @word.
520  * If it's disabled, it will return the passed word.
521  *
522  * Returns: a string with the processed word. This string must be
523  *          freed with g_free()
524  **/
525 gchar *
526 tracker_language_stem_word (TrackerLanguage *language,
527                             const gchar     *word,
528                             gint             word_length)
529 {
530 	TrackerLanguagePriv *priv;
531 	const gchar         *stem_word;
532 
533 	g_return_val_if_fail (TRACKER_IS_LANGUAGE (language), NULL);
534 
535 	if (word_length < 0) {
536 		word_length = strlen (word);
537 	}
538 
539 	priv = GET_PRIV (language);
540 
541 	if (!priv->enable_stemmer) {
542 		return g_strndup (word, word_length);
543 	}
544 
545 #if GLIB_CHECK_VERSION (2,31,0)
546 	g_mutex_lock (&priv->stemmer_mutex);
547 #else
548 	g_mutex_lock (priv->stemmer_mutex);
549 #endif
550 
551 	stem_word = (const gchar*) sb_stemmer_stem (priv->stemmer,
552 	                                            (guchar*) word,
553 	                                            word_length);
554 
555 #if GLIB_CHECK_VERSION (2,31,0)
556 	g_mutex_unlock (&priv->stemmer_mutex);
557 #else
558 	g_mutex_unlock (priv->stemmer_mutex);
559 #endif
560 
561 	return g_strdup (stem_word);
562 }
563 
564 /**
565  * tracker_language_get_name_by_code:
566  * @language_code: a ISO 639-1 language code.
567  *
568  * Returns a human readable language name for the given
569  * ISO 639-1 code, if supported by #TrackerLanguage
570  *
571  * Returns: the language name.
572  **/
573 const gchar *
574 tracker_language_get_name_by_code (const gchar *language_code)
575 {
576 	gint i;
577 
578 	if (!language_code || language_code[0] == '\0') {
579 		return "english";
580 	}
581 
582 	for (i = 0; all_langs[i].code; i++) {
583 		if (g_str_has_prefix (language_code, all_langs[i].code)) {
584 			return all_langs[i].name;
585 		}
586 	}
587 
588 	return "";
589 }