tracker-0.16.2/src/tracker-extract/tracker-extract-oasis.c

No issues found

  1 /*
  2  * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
  3  * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
  4  *
  5  * This library is free software; you can redistribute it and/or
  6  * modify it under the terms of the GNU Lesser General Public
  7  * License as published by the Free Software Foundation; either
  8  * version 2.1 of the License, or (at your option) any later version.
  9  *
 10  * This library is distributed in the hope that it will be useful,
 11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13  * Lesser General Public License for more details.
 14  *
 15  * You should have received a copy of the GNU Lesser General Public
 16  * License along with this library; if not, write to the
 17  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 18  * Boston, MA  02110-1301, USA.
 19  */
 20 
 21 #include <libtracker-common/tracker-os-dependant.h>
 22 
 23 #include <libtracker-extract/tracker-extract.h>
 24 
 25 #include "tracker-main.h"
 26 #include "tracker-gsf.h"
 27 #include "tracker-read.h"
 28 
 29 #include <unistd.h>
 30 
/* Kind of XML element the parser is currently inside. The start-element
 * handlers store one of these in the parse info and the text handlers
 * switch on it to decide what to do with character data. */
typedef enum {
	ODT_TAG_TYPE_UNKNOWN,          /* initial state of both parse infos */
	ODT_TAG_TYPE_TITLE,            /* meta.xml: dc:title */
	ODT_TAG_TYPE_SUBJECT,          /* meta.xml: dc:subject */
	ODT_TAG_TYPE_AUTHOR,           /* meta.xml: dc:creator */
	ODT_TAG_TYPE_KEYWORDS,         /* meta.xml: meta:keyword */
	ODT_TAG_TYPE_COMMENTS,         /* meta.xml: dc:description */
	ODT_TAG_TYPE_STATS,            /* meta.xml: meta:document-statistic (attributes only) */
	ODT_TAG_TYPE_CREATED,          /* meta.xml: meta:creation-date */
	ODT_TAG_TYPE_GENERATOR,        /* meta.xml: meta:generator */
	ODT_TAG_TYPE_WORD_TEXT,        /* content.xml text element of a FILE_TYPE_ODT file */
	ODT_TAG_TYPE_SLIDE_TEXT,       /* content.xml element of a FILE_TYPE_ODP file */
	ODT_TAG_TYPE_SPREADSHEET_TEXT, /* content.xml text element of a FILE_TYPE_ODS file */
	ODT_TAG_TYPE_GRAPHICS_TEXT     /* content.xml text element of a FILE_TYPE_ODG file */
} ODTTagType;
 46 
/* Document flavour, derived from the mime type in
 * tracker_extract_get_metadata(); selects which content.xml elements
 * are treated as text. */
typedef enum {
	FILE_TYPE_INVALID, /* mime type not recognised */
	FILE_TYPE_ODP,     /* application/vnd.oasis.opendocument.presentation */
	FILE_TYPE_ODT,     /* application/vnd.oasis.opendocument.text */
	FILE_TYPE_ODS,     /* application/vnd.oasis.opendocument.spreadsheet */
	FILE_TYPE_ODG      /* application/vnd.oasis.opendocument.graphics */
} ODTFileType;
 54 
/* State shared by the meta.xml parser callbacks (passed as user_data). */
typedef struct {
	TrackerSparqlBuilder *metadata; /* builder that receives the extracted statements */
	ODTTagType current;             /* element the parser is currently inside */
	const gchar *uri;               /* document URI; only used in the duplicate-title warning */
	gboolean title_already_set;     /* TRUE once a nie:title has been emitted */
} ODTMetadataParseInfo;
 61 
/* State shared by the content.xml parser callbacks (passed as user_data). */
typedef struct {
	ODTTagType current;     /* element the parser is currently inside */
	ODTFileType file_type;  /* selects which elements count as text */
	GString *content;       /* plain text collected so far */
	gulong bytes_pending;   /* remaining byte budget; parsing is aborted once it hits 0 */
} ODTContentParseInfo;
 68 
 69 GQuark maximum_size_error_quark = 0;
 70 
/* Forward declarations: GMarkupParser callbacks for meta.xml (the
 * *_metadata handlers) and content.xml (the *_content handlers), plus the
 * content extraction helper. All are defined later in this file. */
static void xml_start_element_handler_metadata (GMarkupParseContext   *context,
                                                const gchar           *element_name,
                                                const gchar          **attribute_names,
                                                const gchar          **attribute_values,
                                                gpointer               user_data,
                                                GError               **error);
static void xml_end_element_handler_metadata   (GMarkupParseContext   *context,
                                                const gchar           *element_name,
                                                gpointer               user_data,
                                                GError               **error);
static void xml_text_handler_metadata          (GMarkupParseContext   *context,
                                                const gchar           *text,
                                                gsize                  text_len,
                                                gpointer               user_data,
                                                GError               **error);
static void xml_start_element_handler_content  (GMarkupParseContext   *context,
                                                const gchar           *element_name,
                                                const gchar          **attribute_names,
                                                const gchar          **attribute_values,
                                                gpointer               user_data,
                                                GError               **error);
static void xml_end_element_handler_content    (GMarkupParseContext   *context,
                                                const gchar           *element_name,
                                                gpointer               user_data,
                                                GError               **error);
static void xml_text_handler_content           (GMarkupParseContext   *context,
                                                const gchar           *text,
                                                gsize                  text_len,
                                                gpointer               user_data,
                                                GError               **error);
static void extract_oasis_content              (const gchar           *uri,
                                                gulong                 total_bytes,
                                                ODTFileType            file_type,
                                                TrackerSparqlBuilder  *metadata);
105 
106 static void
107 extract_oasis_content (const gchar          *uri,
108                        gulong                total_bytes,
109                        ODTFileType           file_type,
110                        TrackerSparqlBuilder *metadata)
111 {
112 	gchar *content = NULL;
113 	ODTContentParseInfo info;
114 	GMarkupParseContext *context;
115 	GError *error = NULL;
116 	GMarkupParser parser = {
117 		xml_start_element_handler_content,
118 		xml_end_element_handler_content,
119 		xml_text_handler_content,
120 		NULL,
121 		NULL
122 	};
123 
124 	/* If no content requested, return */
125 	if (total_bytes == 0) {
126 		return;
127 	}
128 
129 	/* Create parse info */
130 	info.current = ODT_TAG_TYPE_UNKNOWN;
131 	info.file_type = file_type;
132 	info.content = g_string_new ("");
133 	info.bytes_pending = total_bytes;
134 
135 	/* Create parsing context */
136 	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
137 
138 	/* Load the internal XML file from the Zip archive, and parse it
139 	 * using the given context */
140 	tracker_gsf_parse_xml_in_zip (uri, "content.xml", context, &error);
141 
142 	if (!error || g_error_matches (error, maximum_size_error_quark, 0)) {
143 		content = g_string_free (info.content, FALSE);
144 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
145 		tracker_sparql_builder_object_unvalidated (metadata, content);
146 	} else {
147 		g_warning ("Got error parsing XML file: %s\n", error->message);
148 		g_string_free (info.content, TRUE);
149 	}
150 
151 	if (error) {
152 		g_error_free (error);
153 	}
154 
155 	g_free (content);
156 	g_markup_parse_context_free (context);
157 }
158 
159 G_MODULE_EXPORT gboolean
160 tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
161 {
162 	TrackerSparqlBuilder *metadata;
163 	TrackerConfig *config;
164 	ODTMetadataParseInfo info;
165 	ODTFileType file_type;
166 	GFile *file;
167 	gchar *uri;
168 	const gchar *mime_used;
169 	GMarkupParseContext *context;
170 	GMarkupParser parser = {
171 		xml_start_element_handler_metadata,
172 		xml_end_element_handler_metadata,
173 		xml_text_handler_metadata,
174 		NULL,
175 		NULL
176 	};
177 
178 	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
179 		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
180 	}
181 
182 	metadata = tracker_extract_info_get_metadata_builder (extract_info);
183 	mime_used = tracker_extract_info_get_mimetype (extract_info);
184 
185 	file = tracker_extract_info_get_file (extract_info);
186 	uri = g_file_get_uri (file);
187 
188 	/* Setup conf */
189 	config = tracker_main_get_config ();
190 
191 	g_debug ("Extracting OASIS metadata and contents from '%s'", uri);
192 
193 	/* First, parse metadata */
194 
195 	tracker_sparql_builder_predicate (metadata, "a");
196 	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
197 
198 	/* Create parse info */
199 	info.metadata = metadata;
200 	info.current = ODT_TAG_TYPE_UNKNOWN;
201 	info.uri = uri;
202 	info.title_already_set = FALSE;
203 
204 	/* Create parsing context */
205 	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
206 
207 	/* Load the internal XML file from the Zip archive, and parse it
208 	 * using the given context */
209 	tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context, NULL);
210 	g_markup_parse_context_free (context);
211 
212 	if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.text") == 0) {
213 		file_type = FILE_TYPE_ODT;
214 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.presentation") == 0) {
215 		file_type = FILE_TYPE_ODP;
216 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.spreadsheet") == 0) {
217 		file_type = FILE_TYPE_ODS;
218 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.graphics") == 0) {
219 		file_type = FILE_TYPE_ODG;
220 	} else {
221 		g_message ("Mime type was not recognised:'%s'", mime_used);
222 		file_type = FILE_TYPE_INVALID;
223 	}
224 
225 	/* Extract content with the given limitations */
226 	extract_oasis_content (uri,
227 	                       tracker_config_get_max_bytes (config),
228 	                       file_type,
229 	                       metadata);
230 
231 	g_free (uri);
232 
233 	return TRUE;
234 }
235 
236 static void
237 xml_start_element_handler_metadata (GMarkupParseContext  *context,
238                                     const gchar          *element_name,
239                                     const gchar         **attribute_names,
240                                     const gchar         **attribute_values,
241                                     gpointer              user_data,
242                                     GError              **error)
243 {
244 	ODTMetadataParseInfo *data = user_data;
245 
246 	if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
247 		data->current = ODT_TAG_TYPE_TITLE;
248 	} else if (g_ascii_strcasecmp (element_name, "dc:subject") == 0) {
249 		data->current = ODT_TAG_TYPE_SUBJECT;
250 	} else if (g_ascii_strcasecmp (element_name, "dc:creator") == 0) {
251 		data->current = ODT_TAG_TYPE_AUTHOR;
252 	} else if (g_ascii_strcasecmp (element_name, "meta:keyword") == 0) {
253 		data->current = ODT_TAG_TYPE_KEYWORDS;
254 	} else if (g_ascii_strcasecmp (element_name, "dc:description") == 0) {
255 		data->current = ODT_TAG_TYPE_COMMENTS;
256 	} else if (g_ascii_strcasecmp (element_name, "meta:document-statistic") == 0) {
257 		TrackerSparqlBuilder *metadata;
258 		const gchar **a, **v;
259 
260 		metadata = data->metadata;
261 
262 		for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
263 			if (g_ascii_strcasecmp (*a, "meta:word-count") == 0) {
264 				tracker_sparql_builder_predicate (metadata, "nfo:wordCount");
265 				tracker_sparql_builder_object_unvalidated (metadata, *v);
266 			} else if (g_ascii_strcasecmp (*a, "meta:page-count") == 0) {
267 				tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
268 				tracker_sparql_builder_object_unvalidated (metadata, *v);
269 			}
270 		}
271 
272 		data->current = ODT_TAG_TYPE_STATS;
273 	} else if (g_ascii_strcasecmp (element_name, "meta:creation-date") == 0) {
274 		data->current = ODT_TAG_TYPE_CREATED;
275 	} else if (g_ascii_strcasecmp (element_name, "meta:generator") == 0) {
276 		data->current = ODT_TAG_TYPE_GENERATOR;
277 	} else {
278 		data->current = -1;
279 	}
280 }
281 
282 static void
283 xml_end_element_handler_metadata (GMarkupParseContext  *context,
284                                   const gchar          *element_name,
285                                   gpointer              user_data,
286                                   GError              **error)
287 {
288 	((ODTMetadataParseInfo*) user_data)->current = -1;
289 }
290 
291 static void
292 xml_text_handler_metadata (GMarkupParseContext  *context,
293                            const gchar          *text,
294                            gsize                 text_len,
295                            gpointer              user_data,
296                            GError              **error)
297 {
298 	ODTMetadataParseInfo *data;
299 	TrackerSparqlBuilder *metadata;
300 	gchar *date;
301 
302 	data = user_data;
303 	metadata = data->metadata;
304 
305 	if (text_len == 0) {
306 		/* ignore empty values */
307 		return;
308 	}
309 
310 	switch (data->current) {
311 	case ODT_TAG_TYPE_TITLE:
312 		if (data->title_already_set) {
313 			g_warning ("Avoiding additional title (%s) in OASIS document '%s'",
314 			           text, data->uri);
315 		} else {
316 			data->title_already_set = TRUE;
317 			tracker_sparql_builder_predicate (metadata, "nie:title");
318 			tracker_sparql_builder_object_unvalidated (metadata, text);
319 		}
320 		break;
321 
322 	case ODT_TAG_TYPE_SUBJECT:
323 		tracker_sparql_builder_predicate (metadata, "nie:subject");
324 		tracker_sparql_builder_object_unvalidated (metadata, text);
325 		break;
326 
327 	case ODT_TAG_TYPE_AUTHOR:
328 		tracker_sparql_builder_predicate (metadata, "nco:publisher");
329 
330 		tracker_sparql_builder_object_blank_open (metadata);
331 		tracker_sparql_builder_predicate (metadata, "a");
332 		tracker_sparql_builder_object (metadata, "nco:Contact");
333 
334 		tracker_sparql_builder_predicate (metadata, "nco:fullname");
335 		tracker_sparql_builder_object_unvalidated (metadata, text);
336 		tracker_sparql_builder_object_blank_close (metadata);
337 		break;
338 
339 	case ODT_TAG_TYPE_KEYWORDS: {
340 		gchar *keywords;
341 		gchar *lasts, *keyw;
342 
343 		keywords = g_strdup (text);
344 
345 		for (keyw = strtok_r (keywords, ",; ", &lasts);
346 		     keyw;
347 		     keyw = strtok_r (NULL, ",; ", &lasts)) {
348 			tracker_sparql_builder_predicate (metadata, "nie:keyword");
349 			tracker_sparql_builder_object_unvalidated (metadata, keyw);
350 		}
351 
352 		g_free (keywords);
353 
354 		break;
355 	}
356 
357 	case ODT_TAG_TYPE_COMMENTS:
358 		tracker_sparql_builder_predicate (metadata, "nie:comment");
359 		tracker_sparql_builder_object_unvalidated (metadata, text);
360 		break;
361 
362 	case ODT_TAG_TYPE_CREATED:
363 		date = tracker_date_guess (text);
364 		if (date) {
365 			tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
366 			tracker_sparql_builder_object_unvalidated (metadata, date);
367 			g_free (date);
368 		}
369 		break;
370 
371 	case ODT_TAG_TYPE_GENERATOR:
372 		tracker_sparql_builder_predicate (metadata, "nie:generator");
373 		tracker_sparql_builder_object_unvalidated (metadata, text);
374 		break;
375 
376 	default:
377 	case ODT_TAG_TYPE_STATS:
378 		break;
379 	}
380 }
381 
382 static void
383 xml_start_element_handler_content (GMarkupParseContext  *context,
384                                    const gchar          *element_name,
385                                    const gchar         **attribute_names,
386                                    const gchar         **attribute_values,
387                                    gpointer              user_data,
388                                    GError              **error)
389 {
390 	ODTContentParseInfo *data = user_data;
391 
392 	switch (data->file_type) {
393 	case FILE_TYPE_ODT:
394 		if ((g_ascii_strcasecmp (element_name, "text:p") == 0) ||
395 		    (g_ascii_strcasecmp (element_name, "text:h") == 0) ||
396 		    (g_ascii_strcasecmp (element_name, "text:a") == 0) ||
397 		    (g_ascii_strcasecmp (element_name, "text:span") == 0) ||
398 		    (g_ascii_strcasecmp (element_name, "table:table-cell")) == 0) {
399 			data->current = ODT_TAG_TYPE_WORD_TEXT;
400 		} else {
401 			data->current = -1;
402 		}
403 		break;
404 
405 	case FILE_TYPE_ODP:
406 		data->current = ODT_TAG_TYPE_SLIDE_TEXT;
407 		break;
408 
409 	case FILE_TYPE_ODS:
410 		if (g_ascii_strncasecmp (element_name, "text", 4) == 0) {
411 			data->current = ODT_TAG_TYPE_SPREADSHEET_TEXT;
412 		} else {
413 			data->current = -1;
414 		}
415 		break;
416 
417 	case FILE_TYPE_ODG:
418 		if (g_ascii_strncasecmp (element_name, "text", 4) == 0) {
419 			data->current = ODT_TAG_TYPE_GRAPHICS_TEXT;
420 		} else {
421 			data->current = -1;
422 		}
423 		break;
424 
425 	case FILE_TYPE_INVALID:
426 		g_message ("Open Office Document type: %d invalid", data->file_type);
427 		break;
428 	}
429 }
430 
431 static void
432 xml_end_element_handler_content (GMarkupParseContext  *context,
433                                  const gchar          *element_name,
434                                  gpointer              user_data,
435                                  GError              **error)
436 {
437 	ODTContentParseInfo *data = user_data;
438 
439 	data->current = -1;
440 }
441 
442 static void
443 xml_text_handler_content (GMarkupParseContext  *context,
444                           const gchar          *text,
445                           gsize                 text_len,
446                           gpointer              user_data,
447                           GError              **error)
448 {
449 	ODTContentParseInfo *data = user_data;
450 	gsize written_bytes = 0;
451 
452 	switch (data->current) {
453 	case ODT_TAG_TYPE_WORD_TEXT:
454 	case ODT_TAG_TYPE_SLIDE_TEXT:
455 	case ODT_TAG_TYPE_SPREADSHEET_TEXT:
456 	case ODT_TAG_TYPE_GRAPHICS_TEXT:
457 		if (data->bytes_pending == 0) {
458 			g_set_error_literal (error,
459 			                     maximum_size_error_quark, 0,
460 			                     "Maximum text limit reached");
461 			break;
462 		}
463 
464 		/* Look for valid UTF-8 text */
465 		if (tracker_text_validate_utf8 (text,
466 		                                MIN (text_len, data->bytes_pending),
467 		                                &data->content,
468 		                                &written_bytes)) {
469 			if (data->content->str[data->content->len - 1] != ' ') {
470 				/* If some bytes found to be valid, append an extra whitespace
471 				 * as separator */
472 				g_string_append_c (data->content, ' ');
473 			}
474 		}
475 
476 		data->bytes_pending -= written_bytes;
477 		break;
478 
479 	default:
480 		break;
481 	}
482 }