1 /*
  2  * Copyright (C) 2009, Nokia <ivan.frade@nokia.com>
  3  *
  4  * This library is free software; you can redistribute it and/or
  5  * modify it under the terms of the GNU Lesser General Public
  6  * License as published by the Free Software Foundation; either
  7  * version 2.1 of the License, or (at your option) any later version.
  8  *
  9  * This library is distributed in the hope that it will be useful,
 10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 12  * Lesser General Public License for more details.
 13  *
 14  * You should have received a copy of the GNU Lesser General Public
 15  * License along with this library; if not, write to the
 16  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 17  * Boston, MA  02110-1301, USA.
 18  */
 19 
 20 #include "config.h"
 21 
 22 #define _XOPEN_SOURCE
 23 #define _XOPEN_SOURCE_EXTENDED 1	/* strptime is XPG4v2 */
 24 
 25 #ifndef _GNU_SOURCE
 26 #define _GNU_SOURCE
 27 #endif
 28 
 29 #include <time.h>
 30 #include <string.h>
 31 #include <stdio.h>
 32 
 33 #include <libtracker-common/tracker-utils.h>
 34 #include <libtracker-common/tracker-date-time.h>
 35 
 36 #include "tracker-utils.h"
 37 
 38 #ifndef HAVE_GETLINE
 39 
 40 #include <stddef.h>
 41 #include <stdlib.h>
 42 #include <limits.h>
 43 #include <errno.h>
 44 
 45 #undef getdelim
 46 #undef getline
 47 
 48 #define GROW_BY 80
 49 
 50 #endif /* HAVE_GETLINE */
 51 
 52 #define DATE_FORMAT_ISO8601 "%Y-%m-%dT%H:%M:%S%z"
 53 
 54 /**
 55  * SECTION:tracker-utils
 56  * @title: Data utilities
 57  * @short_description: Functions for coalescing, merging, date
 58  * handling and normalizing
 59  * @stability: Stable
 60  * @include: libtracker-extract/tracker-extract.h
 61  *
 * This API provides general-purpose helper functions which extractors
 * may find useful. These functions are also used quite frequently by
 * the in-house extractors.
 65  **/
 66 
/* English three-letter month abbreviations, January first. parse_month()
 * compares the first three characters of its argument against these and
 * returns the matching index. */
static const char *months[] = {
	"Jan", "Feb", "Mar", "Apr", "May", "Jun",
	"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
};

/* Last decimal digit of each month number (1..12), indexed like months[].
 * tracker_date_guess() uses it as the units digit of the "MM" field. */
static const char imonths[] = {
	'1', '2', '3', '4', '5',
	'6', '7', '8', '9', '0', '1', '2'
};
 76 
 77 
 78 /**
 79  * tracker_coalesce_strip:
 80  * @n_values: the number of @... supplied
 81  * @...: the string pointers to coalesce
 82  *
 83  * This function iterates through a series of string pointers passed
 84  * using @... and returns the first which is not %NULL, not empty
 85  * (i.e. "") and not comprised of one or more spaces (i.e. " ").
 86  *
 87  * The returned value is stripped using g_strstrip(). It is MOST
 88  * important NOT to pass constant string pointers to this function!
 89  *
 90  * Returns: the first string pointer from those provided which
 91  * matches, otherwise %NULL.
 92  *
 93  * Since: 0.10
 94  **/
 95 const gchar *
 96 tracker_coalesce_strip (gint n_values,
 97                         ...)
 98 {
 99 	va_list args;
100 	gint    i;
101 	const gchar *result = NULL;
102 
103 	va_start (args, n_values);
104 
105 	for (i = 0; i < n_values; i++) {
106 		gchar *value;
107 
108 		value = va_arg (args, gchar *);
109 		if (!result && !tracker_is_blank_string (value)) {
110 			result = (const gchar *) g_strstrip (value);
111 			break;
112 		}
113 	}
114 
115 	va_end (args);
116 
117 	return result;
118 }
119 
120 // LCOV_EXCL_START
121 
122 /**
123  * tracker_coalesce:
124  * @n_values: the number of @Varargs supplied
125  * @...: the string pointers to coalesce
126  *
127  * This function iterates through a series of string pointers passed
128  * using @... and returns the first which is not %NULL, not empty
129  * (i.e. "") and not comprised of one or more spaces (i.e. " ").
130  *
131  * The returned value is stripped using g_strstrip(). All other values
132  * supplied are freed. It is MOST important NOT to pass constant
133  * string pointers to this function!
134  *
135  * Returns: the first string pointer from those provided which
136  * matches, otherwise %NULL.
137  *
138  * Since: 0.8
139  *
140  * Deprecated: 0.10: Use tracker_coalesce_strip() instead.
141  *
142  **/
143 gchar *
144 tracker_coalesce (gint n_values,
145                   ...)
146 {
147 	va_list args;
148 	gint    i;
149 	gchar *result = NULL;
150 
151 	va_start (args, n_values);
152 
153 	for (i = 0; i < n_values; i++) {
154 		gchar *value;
155 
156 		value = va_arg (args, gchar *);
157 		if (!result && !tracker_is_blank_string (value)) {
158 			result = g_strstrip (value);
159 		} else {
160 			g_free (value);
161 		}
162 	}
163 
164 	va_end (args);
165 
166 	return result;
167 }
168 // LCOV_EXCL_STOP
169 
170 /**
171  * tracker_merge_const:
172  * @delimiter: the delimiter to use when merging
173  * @n_values: the number of @... supplied
174  * @...: the string pointers to merge
175  *
176  * This function iterates through a series of string pointers passed
177  * using @... and returns a newly allocated string of the merged
178  * strings.
179  *
180  * The @delimiter can be %NULL. If specified, it will be used in
181  * between each merged string in the result.
182  *
183  * Returns: a newly-allocated string holding the result which should
184  * be freed with g_free() when finished with, otherwise %NULL.
185  *
186  * Since: 0.10
187  **/
188 gchar *
189 tracker_merge_const (const gchar *delimiter,
190                      gint         n_values,
191                      ...)
192 {
193 	va_list args;
194 	gint    i;
195 	GString *str = NULL;
196 
197 	va_start (args, n_values);
198 
199 	for (i = 0; i < n_values; i++) {
200 		gchar *value;
201 
202 		value = va_arg (args, gchar *);
203 		if (value) {
204 			if (!str) {
205 				str = g_string_new (value);
206 			} else {
207 				if (delimiter) {
208 					g_string_append (str, delimiter);
209 				}
210 				g_string_append (str, value);
211 			}
212 		}
213 	}
214 
215 	va_end (args);
216 
217 	if (!str) {
218 		return NULL;
219 	}
220 
221 	return g_string_free (str, FALSE);
222 }
223 
224 // LCOV_EXCL_START
225 
226 /**
227  * tracker_merge:
228  * @delimiter: the delimiter to use when merging
229  * @n_values: the number of @... supplied
230  * @...: the string pointers to merge
231  *
232  * This function iterates through a series of string pointers passed
233  * using @... and returns a newly allocated string of the merged
234  * strings. All passed strings are freed (don't pass const values)/
235  *
236  * The @delimiter can be %NULL. If specified, it will be used in
237  * between each merged string in the result.
238  *
239  * Returns: a newly-allocated string holding the result which should
240  * be freed with g_free() when finished with, otherwise %NULL.
241  *
242  * Since: 0.8
243  *
244  * Deprecated: 0.10: Use tracker_merge_const() instead.
245  **/
246 gchar *
247 tracker_merge (const gchar *delimiter,
248                gint         n_values,
249                ...)
250 {
251 	va_list args;
252 	gint    i;
253 	GString *str = NULL;
254 
255 	va_start (args, n_values);
256 
257 	for (i = 0; i < n_values; i++) {
258 		gchar *value;
259 
260 		value = va_arg (args, gchar *);
261 		if (value) {
262 			if (!str) {
263 				str = g_string_new (value);
264 			} else {
265 				if (delimiter) {
266 					g_string_append (str, delimiter);
267 				}
268 				g_string_append (str, value);
269 			}
270 			g_free (value);
271 		}
272 	}
273 
274 	va_end (args);
275 
276 	if (!str) {
277 		return NULL;
278 	}
279 
280 	return g_string_free (str, FALSE);
281 }
282 
283 /**
284  * tracker_text_normalize:
285  * @text: the text to normalize
286  * @max_words: the maximum words of @text to normalize
287  * @n_words: the number of words actually normalized
288  *
289  * This function iterates through @text checking for UTF-8 validity
290  * using g_utf8_get_char_validated(). For each character found, the
291  * %GUnicodeType is checked to make sure it is one fo the following
292  * values:
293  * <itemizedlist>
294  *  <listitem><para>%G_UNICODE_LOWERCASE_LETTER</para></listitem>
295  *  <listitem><para>%G_UNICODE_MODIFIER_LETTER</para></listitem>
296  *  <listitem><para>%G_UNICODE_OTHER_LETTER</para></listitem>
297  *  <listitem><para>%G_UNICODE_TITLECASE_LETTER</para></listitem>
298  *  <listitem><para>%G_UNICODE_UPPERCASE_LETTER</para></listitem>
299  * </itemizedlist>
300  *
301  * All other symbols, punctuation, marks, numbers and separators are
302  * stripped. A regular space (i.e. " ") is used to separate the words
303  * in the returned string.
304  *
305  * The @n_words can be %NULL. If specified, it will be populated with
306  * the number of words that were normalized in the result.
307  *
308  * Returns: a newly-allocated string holding the result which should
309  * be freed with g_free() when finished with, otherwise %NULL.
310  *
311  * Since: 0.8
312  *
313  * Deprecated: 0.10: Use tracker_text_validate_utf8() instead.
314  **/
315 gchar *
316 tracker_text_normalize (const gchar *text,
317                         guint        max_words,
318                         guint       *n_words)
319 {
320 	GString *string;
321 	gboolean in_break = TRUE;
322 	gunichar ch;
323 	gint words = 0;
324 
325 	string = g_string_new (NULL);
326 
327 	while ((ch = g_utf8_get_char_validated (text, -1)) > 0) {
328 		GUnicodeType type;
329 
330 		type = g_unichar_type (ch);
331 
332 		if (type == G_UNICODE_LOWERCASE_LETTER ||
333 		    type == G_UNICODE_MODIFIER_LETTER ||
334 		    type == G_UNICODE_OTHER_LETTER ||
335 		    type == G_UNICODE_TITLECASE_LETTER ||
336 		    type == G_UNICODE_UPPERCASE_LETTER) {
337 			/* Append regular chars */
338 			g_string_append_unichar (string, ch);
339 			in_break = FALSE;
340 		} else if (!in_break) {
341 			/* Non-regular char found, treat as word break */
342 			g_string_append_c (string, ' ');
343 			in_break = TRUE;
344 			words++;
345 
346 			if (words > max_words) {
347 				break;
348 			}
349 		}
350 
351 		text = g_utf8_find_next_char (text, NULL);
352 	}
353 
354 	if (n_words) {
355 		if (!in_break) {
356 			/* Count the last word */
357 			words += 1;
358 		}
359 		*n_words = words;
360 	}
361 
362 	return g_string_free (string, FALSE);
363 }
364 
365 // LCOV_EXCL_STOP
366 
367 /**
368  * tracker_text_validate_utf8:
369  * @text: the text to validate
370  * @text_len: length of @text, or -1 if NUL-terminated
371  * @str: the string where to place the validated UTF-8 characters, or %NULL if
372  *  not needed.
373  * @valid_len: Output number of valid UTF-8 bytes found, or %NULL if not needed
374  *
375  * This function iterates through @text checking for UTF-8 validity
376  * using g_utf8_validate(), appends the first chunk of valid characters
377  * to @str, and gives the number of valid UTF-8 bytes in @valid_len.
378  *
379  * Returns: %TRUE if some bytes were found to be valid, %FALSE otherwise.
380  *
381  * Since: 0.10
382  **/
383 gboolean
384 tracker_text_validate_utf8 (const gchar  *text,
385                             gssize        text_len,
386                             GString     **str,
387                             gsize        *valid_len)
388 {
389 	gsize len_to_validate;
390 
391 	g_return_val_if_fail (text, FALSE);
392 
393 	len_to_validate = text_len >= 0 ? text_len : strlen (text);
394 
395 	if (len_to_validate > 0) {
396 		const gchar *end = text;
397 
398 		/* Validate string, getting the pointer to first non-valid character
399 		 *  (if any) or to the end of the string. */
400 		g_utf8_validate (text, len_to_validate, &end);
401 		if (end > text) {
402 			/* If str output required... */
403 			if (str) {
404 				/* Create string to output if not already as input */
405 				*str = (*str == NULL ?
406 				        g_string_new_len (text, end - text) :
407 				        g_string_append_len (*str, text, end - text));
408 			}
409 
410 			/* If utf8 len output required... */
411 			if (valid_len) {
412 				*valid_len = end - text;
413 			}
414 
415 			return TRUE;
416 		}
417 	}
418 
419 	return FALSE;
420 }
421 
422 /**
423  * tracker_date_format_to_iso8601:
424  * @date_string: the date in a string pointer
425  * @format: the format of the @date_string
426  *
427  * This function uses strptime() to create a time tm structure using
428  * @date_string and @format.
429  *
430  * Returns: a newly-allocated string with the time represented in
431  * ISO8601 date format which should be freed with g_free() when
432  * finished with, otherwise %NULL.
433  *
434  * Since: 0.8
435  **/
436 gchar *
437 tracker_date_format_to_iso8601 (const gchar *date_string,
438                                 const gchar *format)
439 {
440 	gchar *result;
441 	struct tm date_tm = { 0 };
442 
443 	g_return_val_if_fail (date_string != NULL, NULL);
444 	g_return_val_if_fail (format != NULL, NULL);
445 
446 	if (strptime (date_string, format, &date_tm) == 0) {
447 		return NULL;
448 	}
449 
450 	/* If the input format string doesn't parse timezone information with
451 	 * either %z or %Z, strptime() won't set the tm_gmtoff member in the
452 	 * broken-down time, and the value during initialization (0) will be
453 	 * left. This effectively means that every broken-down time obtained
454 	 * with strptime() without parsing timezone information will be based
455 	 * on UTC, instead of being treated as localtime. In order to fix this
456 	 * and set the correct value for the offset w.r.t gmt, we can just
457 	 * use mktime() to fill in the daylight saving flag as well as the
458 	 * gmt offset value. */
459 	if (!strstr (format, "%z") && !strstr (format, "%Z")) {
460 		/* tm_isdst not set by strptime(), we set -1 on it in order to ask
461 		 * mktime to 'normalize' its contents and fill in the gmt offset
462 		 * and daylight saving time information */
463 		date_tm.tm_isdst = -1;
464 
465 		/* Note: no real problem if mktime() fails. In this case, tm_isdst
466 		 * will be -1, and therefore strftime() will not write the timezone
467 		 * information, which is equally right to represent localtime. */
468 		mktime (&date_tm);
469 	}
470 
471 	result = g_malloc (sizeof (char) * 25);
472 	strftime (result, 25, DATE_FORMAT_ISO8601 , &date_tm);
473 	return result;
474 }
475 
476 static gboolean
477 is_int (const gchar *str)
478 {
479 	gint i, len;
480 
481 	if (!str || str[0] == '\0') {
482 		return FALSE;
483 	}
484 
485 	len = strlen (str);
486 
487 	for (i = 0; i < len; i++) {
488 		if (!g_ascii_isdigit (str[i])) {
489 			return FALSE;
490 		}
491 	}
492 
493 	return TRUE ;
494 }
495 
496 static gint
497 parse_month (const gchar *month)
498 {
499 	gint i;
500 
501 	for (i = 0; i < 12; i++) {
502 		if (!strncmp (month, months[i], 3)) {
503 			return i;
504 		}
505 	}
506 
507 	return -1;
508 }
509 
510 /* Determine date format and convert to ISO 8601 format */
511 /* FIXME We should handle all the fractions here (see ISO 8601), as well as YYYY:DDD etc */
512 
513 /**
514  * tracker_date_guess:
515  * @date_string: the date in a string pointer
516  *
517  * This function uses a number of methods to try and guess the date
518  * held in @date_string. The @date_string must be at least 5
519  * characters in length or longer for any guessing to be attempted.
520  * Some of the string formats guessed include:
521  *
522  * <itemizedlist>
523  *  <listitem><para>"YYYY-MM-DD" (Simple format)</para></listitem>
524  *  <listitem><para>"20050315113224-08'00'" (PDF format)</para></listitem>
525  *  <listitem><para>"20050216111533Z" (PDF format)</para></listitem>
526  *  <listitem><para>"Mon Feb  9 10:10:00 2004" (Microsoft Office format)</para></listitem>
527  *  <listitem><para>"2005:04:29 14:56:54" (Exif format)</para></listitem>
528  *  <listitem><para>"YYYY-MM-DDThh:mm:ss.ff+zz:zz</para></listitem>
529  * </itemizedlist>
530  *
531  * Returns: a newly-allocated string with the time represented in
532  * ISO8601 date format which should be freed with g_free() when
533  * finished with, otherwise %NULL.
534  *
535  * Since: 0.8
536  **/
537 gchar *
538 tracker_date_guess (const gchar *date_string)
539 {
540 	gchar buf[30];
541 	gint  len;
542 	GError *error = NULL;
543 
544 	if (!date_string) {
545 		return NULL;
546 	}
547 
548 	len = strlen (date_string);
549 
550 	/* We cannot format a date without at least a four digit
551 	 * year.
552 	 */
553 	if (len < 4) {
554 		return NULL;
555 	}
556 
557 	/* Check for year only dates (EG ID3 music tags might have
558 	 * Audio.ReleaseDate as 4 digit year)
559 	 */
560 	if (len == 4) {
561 		if (is_int (date_string)) {
562 			buf[0] = date_string[0];
563 			buf[1] = date_string[1];
564 			buf[2] = date_string[2];
565 			buf[3] = date_string[3];
566 			buf[4] = '-';
567 			buf[5] = '0';
568 			buf[6] = '1';
569 			buf[7] = '-';
570 			buf[8] = '0';
571 			buf[9] = '1';
572 			buf[10] = 'T';
573 			buf[11] = '0';
574 			buf[12] = '0';
575 			buf[13] = ':';
576 			buf[14] = '0';
577 			buf[15] = '0';
578 			buf[16] = ':';
579 			buf[17] = '0';
580 			buf[18] = '0';
581 			buf[19] = 'Z';
582 			buf[20] = '\0';
583 
584 			tracker_string_to_date (buf, NULL, &error);
585 
586 			if (error != NULL) {
587 				g_error_free (error);
588 				return NULL;
589 			}
590 
591 			return g_strdup (buf);
592 		} else {
593 			return NULL;
594 		}
595 	} else if (len == 10)  {
596 		/* Check for date part only YYYY-MM-DD */
597 		buf[0] = date_string[0];
598 		buf[1] = date_string[1];
599 		buf[2] = date_string[2];
600 		buf[3] = date_string[3];
601 		buf[4] = '-';
602 		buf[5] = date_string[5];
603 		buf[6] = date_string[6];
604 		buf[7] = '-';
605 		buf[8] = date_string[8];
606 		buf[9] = date_string[9];
607 		buf[10] = 'T';
608 		buf[11] = '0';
609 		buf[12] = '0';
610 		buf[13] = ':';
611 		buf[14] = '0';
612 		buf[15] = '0';
613 		buf[16] = ':';
614 		buf[17] = '0';
615 		buf[18] = '0';
616 		buf[19] = '\0';
617 
618 		tracker_string_to_date (buf, NULL, &error);
619 
620 		if (error != NULL) {
621 			g_error_free (error);
622 			return NULL;
623 		}
624 
625 		return g_strdup (buf);
626 	} else if (len == 14) {
627 		/* Check for pdf format EG 20050315113224-08'00' or
628 		 * 20050216111533Z
629 		 */
630 		buf[0] = date_string[0];
631 		buf[1] = date_string[1];
632 		buf[2] = date_string[2];
633 		buf[3] = date_string[3];
634 		buf[4] = '-';
635 		buf[5] = date_string[4];
636 		buf[6] = date_string[5];
637 		buf[7] = '-';
638 		buf[8] = date_string[6];
639 		buf[9] = date_string[7];
640 		buf[10] = 'T';
641 		buf[11] = date_string[8];
642 		buf[12] = date_string[9];
643 		buf[13] = ':';
644 		buf[14] = date_string[10];
645 		buf[15] = date_string[11];
646 		buf[16] = ':';
647 		buf[17] = date_string[12];
648 		buf[18] = date_string[13];
649 		buf[19] = '\0';
650 
651 		tracker_string_to_date (buf, NULL, &error);
652 
653 		if (error != NULL) {
654 			g_error_free (error);
655 			return NULL;
656 		}
657 
658 		return g_strdup (buf);
659 	} else if (len == 15 && date_string[14] == 'Z') {
660 		buf[0] = date_string[0];
661 		buf[1] = date_string[1];
662 		buf[2] = date_string[2];
663 		buf[3] = date_string[3];
664 		buf[4] = '-';
665 		buf[5] = date_string[4];
666 		buf[6] = date_string[5];
667 		buf[7] = '-';
668 		buf[8] = date_string[6];
669 		buf[9] = date_string[7];
670 		buf[10] = 'T';
671 		buf[11] = date_string[8];
672 		buf[12] = date_string[9];
673 		buf[13] = ':';
674 		buf[14] = date_string[10];
675 		buf[15] = date_string[11];
676 		buf[16] = ':';
677 		buf[17] = date_string[12];
678 		buf[18] = date_string[13];
679 		buf[19] = 'Z';
680 		buf[20] = '\0';
681 
682 		tracker_string_to_date (buf, NULL, &error);
683 
684 		if (error != NULL) {
685 			g_error_free (error);
686 			return NULL;
687 		}
688 
689 		return g_strdup (buf);
690 	} else if (len == 21 && (date_string[14] == '-' || date_string[14] == '+' )) {
691 		buf[0] = date_string[0];
692 		buf[1] = date_string[1];
693 		buf[2] = date_string[2];
694 		buf[3] = date_string[3];
695 		buf[4] = '-';
696 		buf[5] = date_string[4];
697 		buf[6] = date_string[5];
698 		buf[7] = '-';
699 		buf[8] = date_string[6];
700 		buf[9] = date_string[7];
701 		buf[10] = 'T';
702 		buf[11] = date_string[8];
703 		buf[12] = date_string[9];
704 		buf[13] = ':';
705 		buf[14] = date_string[10];
706 		buf[15] = date_string[11];
707 		buf[16] = ':';
708 		buf[17] = date_string[12];
709 		buf[18] = date_string[13];
710 		buf[19] = date_string[14];
711 		buf[20] = date_string[15];
712 		buf[21] = date_string[16];
713 		buf[22] =  ':';
714 		buf[23] = date_string[18];
715 		buf[24] = date_string[19];
716 		buf[25] = '\0';
717 
718 		tracker_string_to_date (buf, NULL, &error);
719 
720 		if (error != NULL) {
721 			g_error_free (error);
722 			return NULL;
723 		}
724 
725 		return g_strdup (buf);
726 	} else if ((len == 24) && (date_string[3] == ' ')) {
727 		/* Check for msoffice date format "Mon Feb  9 10:10:00 2004" */
728 		gint  num_month;
729 		gchar mon1;
730 		gchar day1;
731 
732 		num_month = parse_month (date_string + 4);
733 
734 		if (num_month < 0) {
735 			return NULL;
736 		}
737 
738 		mon1 = imonths[num_month];
739 
740 		if (date_string[8] == ' ') {
741 			day1 = '0';
742 		} else {
743 			day1 = date_string[8];
744 		}
745 
746 		buf[0] = date_string[20];
747 		buf[1] = date_string[21];
748 		buf[2] = date_string[22];
749 		buf[3] = date_string[23];
750 		buf[4] = '-';
751 
752 		if (num_month < 10) {
753 			buf[5] = '0';
754 			buf[6] = mon1;
755 		} else {
756 			buf[5] = '1';
757 			buf[6] = mon1;
758 		}
759 
760 		buf[7] = '-';
761 		buf[8] = day1;
762 		buf[9] = date_string[9];
763 		buf[10] = 'T';
764 		buf[11] = date_string[11];
765 		buf[12] = date_string[12];
766 		buf[13] = ':';
767 		buf[14] = date_string[14];
768 		buf[15] = date_string[15];
769 		buf[16] = ':';
770 		buf[17] = date_string[17];
771 		buf[18] = date_string[18];
772 		buf[19] = '\0';
773 
774 		tracker_string_to_date (buf, NULL, &error);
775 
776 		if (error != NULL) {
777 			g_error_free (error);
778 			return NULL;
779 		}
780 
781 		return g_strdup (buf);
782 	} else if ((len == 19) && (date_string[4] == ':') && (date_string[7] == ':')) {
783 		/* Check for Exif date format "2005:04:29 14:56:54" */
784 		buf[0] = date_string[0];
785 		buf[1] = date_string[1];
786 		buf[2] = date_string[2];
787 		buf[3] = date_string[3];
788 		buf[4] = '-';
789 		buf[5] = date_string[5];
790 		buf[6] = date_string[6];
791 		buf[7] = '-';
792 		buf[8] = date_string[8];
793 		buf[9] = date_string[9];
794 		buf[10] = 'T';
795 		buf[11] = date_string[11];
796 		buf[12] = date_string[12];
797 		buf[13] = ':';
798 		buf[14] = date_string[14];
799 		buf[15] = date_string[15];
800 		buf[16] = ':';
801 		buf[17] = date_string[17];
802 		buf[18] = date_string[18];
803 		buf[19] = '\0';
804 
805 		tracker_string_to_date (buf, NULL, &error);
806 
807 		if (error != NULL) {
808 			g_error_free (error);
809 			return NULL;
810 		}
811 
812 		return g_strdup (buf);
813 	} 
814 
815 	tracker_string_to_date (date_string, NULL, &error);
816 
817 	if (error != NULL) {
818 		g_error_free (error);
819 		return NULL;
820 	}
821 
822 	return g_strdup (date_string);
823 }
824 
825 #ifndef HAVE_GETLINE
826 
827 static gint
828 my_igetdelim (gchar  **linebuf,
829               guint   *linebufsz,
830               gint     delimiter,
831               FILE    *file)
832 {
833 	gint ch;
834 	gint idx;
835 
836 	if ((file == NULL || linebuf == NULL || *linebuf == NULL || *linebufsz == 0) &&
837 	    !(*linebuf == NULL && *linebufsz == 0)) {
838 		errno = EINVAL;
839 		return -1;
840 	}
841 
842 	if (*linebuf == NULL && *linebufsz == 0) {
843 		*linebuf = g_malloc (GROW_BY);
844 
845 		if (!*linebuf) {
846 			errno = ENOMEM;
847 			return -1;
848 		}
849 
850 		*linebufsz += GROW_BY;
851 	}
852 
853 	idx = 0;
854 
855 	while ((ch = fgetc (file)) != EOF) {
856 		/* Grow the line buffer as necessary */
857 		while (idx > *linebufsz - 2) {
858 			*linebuf = g_realloc (*linebuf, *linebufsz += GROW_BY);
859 
860 			if (!*linebuf) {
861 				errno = ENOMEM;
862 				return -1;
863 			}
864 		}
865 		(*linebuf)[idx++] = (gchar) ch;
866 
867 		if ((gchar) ch == delimiter) {
868 			break;
869 		}
870 	}
871 
872 	if (idx != 0) {
873 		(*linebuf)[idx] = 0;
874 	} else if ( ch == EOF ) {
875 		return -1;
876 	}
877 
878 	return idx;
879 }
880 
881 #endif /* HAVE_GETLINE */
882 
883 /**
884  * tracker_getline:
885  * @lineptr: Buffer to write into
886  * @n: Max bytes of linebuf
887  * @stream: Filestream to read from
888  *
889  * Reads an entire line from stream, storing the address of the buffer
890  * containing  the  text into *lineptr.  The buffer is null-terminated
891  * and includes the newline character, if one was found.
892  *
893  * Read GNU getline()'s manpage for more information
894  *
895  * Returns: the number of characters read, including the delimiter
896  * character, but not including the terminating %NULL byte. This value
897  * can be used to handle embedded %NULL bytes in the line read. Upon
898  * failure, -1 is returned.
899  *
900  * Since: 0.10
901  **/
902 gssize
903 tracker_getline (gchar **lineptr,
904                  gsize  *n,
905                  FILE *stream)
906 {
907 #ifndef HAVE_GETLINE
908 	return my_igetdelim (lineptr, n, '\n', stream);
909 #else  /* HAVE_GETLINE */
910 	return getline (lineptr, n, stream);
911 #endif /* HAVE_GETLINE */
912 }
913 
914 /**
915  * tracker_keywords_parse:
916  * @store: Array where to store the keywords
917  * @keywords: Keywords line to parse
918  *
919  * Parses a keywords line into store, avoiding duplicates and stripping leading
920  * and trailing spaces from keywords. Allowed delimiters are , and ;
921  *
922  * Since: 0.10
923  **/
924 void
925 tracker_keywords_parse (GPtrArray   *store,
926                         const gchar *keywords)
927 {
928 	gchar *orig, *keywords_d;
929 	char *saveptr, *p;
930 	size_t len;
931 
932 	keywords_d = orig = g_strdup (keywords);
933 	p = keywords_d;
934 	keywords_d = strchr (keywords_d, '"');
935 
936 	if (keywords_d) {
937 		keywords_d++;
938 	} else {
939 		keywords_d = p;
940 	}
941 
942 	len = strlen (keywords_d);
943 	if (keywords_d[len - 1] == '"') {
944 		keywords_d[len - 1] = '\0';
945 	}
946 
947 	for (p = strtok_r (keywords_d, ",;", &saveptr); p;
948 	     p = strtok_r (NULL, ",;", &saveptr)) {
949 		guint i;
950 		gboolean found = FALSE;
951 		gchar *p_do = g_strdup (p);
952 		gchar *p_dup = p_do;
953 		guint len = strlen (p_dup);
954 
955 		if (*p_dup == ' ')
956 			p_dup++;
957 
958 		if (p_dup[len-1] == ' ')
959 			p_dup[len-1] = '\0';
960 
961 		/* ignore keywords containing invalid UTF-8 */
962 		if (!g_utf8_validate (p_dup, -1, NULL)) {
963 			g_free (p_do);
964 			continue;
965 		}
966 
967 		for (i = 0; i < store->len; i++) {
968 			const gchar *earlier = g_ptr_array_index (store, i);
969 			if (g_strcmp0 (earlier, p_dup) == 0) {
970 				found = TRUE;
971 				break;
972 			}
973 		}
974 
975 		if (!found) {
976 			g_ptr_array_add (store, g_strdup (p_dup));
977 		}
978 
979 		g_free (p_do);
980 	}
981 
982 	g_free (orig);
983 }
tracker-0.16.2/src/libtracker-extract/tracker-utils.c