Tool	Failure ID	Location	Function	Message	Data
clang-analyzer	no-output-found	tracker-extract-msoffice-xml.c		Message(text='Unable to locate XML output from invoke-clang-analyzer')	None
  1 /*
  2  * Copyright (C) 2008-2010 Nokia <ivan.frade@nokia.com>
  3  *
  4  * This library is free software; you can redistribute it and/or
  5  * modify it under the terms of the GNU Lesser General Public
  6  * License as published by the Free Software Foundation; either
  7  * version 2.1 of the License, or (at your option) any later version.
  8  *
  9  * This library is distributed in the hope that it will be useful,
 10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 12  * Lesser General Public License for more details.
 13  *
 14  * You should have received a copy of the GNU Lesser General Public
 15  * License along with this library; if not, write to the
 16  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 17  * Boston, MA  02110-1301, USA.
 18  */
 19 
 20 #include "config.h"
 21 
 22 #include <string.h>
 23 
 24 #include <glib.h>
 25 
 26 #include <gsf/gsf.h>
 27 #include <gsf/gsf-doc-meta-data.h>
 28 #include <gsf/gsf-infile.h>
 29 #include <gsf/gsf-infile-msole.h>
 30 #include <gsf/gsf-input-stdio.h>
 31 #include <gsf/gsf-msole-utils.h>
 32 #include <gsf/gsf-utils.h>
 33 #include <gsf/gsf-infile-zip.h>
 34 
 35 #include <libtracker-common/tracker-utils.h>
 36 #include <libtracker-common/tracker-os-dependant.h>
 37 
 38 #include <libtracker-extract/tracker-extract.h>
 39 
 40 #include "tracker-main.h"
 41 #include "tracker-gsf.h"
 42 
 43 typedef enum {
 44 	MS_OFFICE_XML_TAG_INVALID,
 45 	MS_OFFICE_XML_TAG_TITLE,
 46 	MS_OFFICE_XML_TAG_SUBJECT,
 47 	MS_OFFICE_XML_TAG_AUTHOR,
 48 	MS_OFFICE_XML_TAG_MODIFIED,
 49 	MS_OFFICE_XML_TAG_COMMENTS,
 50 	MS_OFFICE_XML_TAG_CREATED,
 51 	MS_OFFICE_XML_TAG_GENERATOR,
 52 	MS_OFFICE_XML_TAG_NUM_OF_PAGES,
 53 	MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS,
 54 	MS_OFFICE_XML_TAG_NUM_OF_WORDS,
 55 	MS_OFFICE_XML_TAG_NUM_OF_LINES,
 56 	MS_OFFICE_XML_TAG_APPLICATION,
 57 	MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS,
 58 	MS_OFFICE_XML_TAG_SLIDE_TEXT,
 59 	MS_OFFICE_XML_TAG_WORD_TEXT,
 60 	MS_OFFICE_XML_TAG_XLS_SHARED_TEXT,
 61 	MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA,
 62 	MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA
 63 } MsOfficeXMLTagType;
 64 
 65 typedef enum {
 66 	FILE_TYPE_INVALID,
 67 	FILE_TYPE_PPTX,
 68 	FILE_TYPE_PPSX,
 69 	FILE_TYPE_DOCX,
 70 	FILE_TYPE_XLSX
 71 } MsOfficeXMLFileType;
 72 
 73 typedef struct {
 74 	/* Common constant stuff */
 75 	const gchar *uri;
 76 	MsOfficeXMLFileType file_type;
 77 
 78 	/* Tag type, reused by Content and Metadata parsers */
 79 	MsOfficeXMLTagType tag_type;
 80 
 81 	/* Metadata-parsing specific things */
 82 	TrackerSparqlBuilder *metadata;
 83 	gboolean title_already_set;
 84 	gboolean generator_already_set;
 85 
 86 	/* Content-parsing specific things */
 87 	GString *content;
 88 	gulong bytes_pending;
 89 	gboolean style_element_present;
 90 	gboolean preserve_attribute_present;
 91 	GTimer *timer;
 92 	GList *parts;
 93 } MsOfficeXMLParserInfo;
 94 
 95 static void msoffice_xml_content_parse_start       (GMarkupParseContext  *context,
 96                                                     const gchar          *element_name,
 97                                                     const gchar         **attribute_names,
 98                                                     const gchar         **attribute_values,
 99                                                     gpointer              user_data,
100                                                     GError              **error);
101 static void msoffice_xml_content_parse_stop        (GMarkupParseContext  *context,
102                                                     const gchar          *element_name,
103                                                     gpointer              user_data,
104                                                     GError              **error);
105 static void msoffice_xml_content_parse             (GMarkupParseContext  *context,
106                                                     const gchar          *text,
107                                                     gsize                 text_len,
108                                                     gpointer              user_data,
109                                                     GError              **error);
110 
111 static void msoffice_xml_metadata_parse_start      (GMarkupParseContext  *context,
112                                                     const gchar           *element_name,
113                                                     const gchar          **attribute_names,
114                                                     const gchar          **attribute_values,
115                                                     gpointer               user_data,
116                                                     GError               **error);
117 static void msoffice_xml_metadata_parse_stop       (GMarkupParseContext  *context,
118                                                     const gchar          *element_name,
119                                                     gpointer              user_data,
120                                                     GError              **error);
121 static void msoffice_xml_metadata_parse            (GMarkupParseContext  *context,
122                                                     const gchar          *text,
123                                                     gsize                 text_len,
124                                                     gpointer              user_data,
125                                                     GError              **error);
126 
127 static void msoffice_xml_content_types_parse_start (GMarkupParseContext  *context,
128                                                     const gchar          *element_name,
129                                                     const gchar         **attribute_names,
130                                                     const gchar         **attribute_values,
131                                                     gpointer              user_data,
132                                                     GError              **error);
133 
134 static const GMarkupParser metadata_parser = {
135 	msoffice_xml_metadata_parse_start,
136 	msoffice_xml_metadata_parse_stop,
137 	msoffice_xml_metadata_parse,
138 	NULL,
139 	NULL
140 };
141 
142 static const GMarkupParser content_parser = {
143 	msoffice_xml_content_parse_start,
144 	msoffice_xml_content_parse_stop,
145 	msoffice_xml_content_parse,
146 	NULL,
147 	NULL
148 };
149 
150 static const GMarkupParser content_types_parser = {
151 	msoffice_xml_content_types_parse_start,
152 	NULL,
153 	NULL,
154 	NULL,
155 	NULL
156 };
157 
158 static GQuark maximum_size_error_quark = 0;
159 
160 /* ------------------------- CONTENT files parsing -----------------------------------*/
161 
162 static void
163 msoffice_xml_content_parse_start (GMarkupParseContext  *context,
164                                   const gchar          *element_name,
165                                   const gchar         **attribute_names,
166                                   const gchar         **attribute_values,
167                                   gpointer              user_data,
168                                   GError              **error)
169 {
170 	MsOfficeXMLParserInfo *info = user_data;
171 	const gchar **a;
172 	const gchar **v;
173 
174 	switch (info->file_type) {
175 	case FILE_TYPE_DOCX:
176 		if (g_ascii_strcasecmp (element_name, "w:pStyle") == 0) {
177 			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
178 				if (g_ascii_strcasecmp (*a, "w:val") != 0) {
179 					continue;
180 				}
181 
182 				if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) {
183 					info->style_element_present = TRUE;
184 				} else if (g_ascii_strncasecmp (*v, "TOC", 3) == 0) {
185 					info->style_element_present = TRUE;
186 				} else if (g_ascii_strncasecmp (*v, "Section", 7) == 0) {
187 					info->style_element_present = TRUE;
188 				} else if (g_ascii_strncasecmp (*v, "Title", 5) == 0) {
189 					info->style_element_present = TRUE;
190 				} else if (g_ascii_strncasecmp (*v, "Subtitle", 8) == 0) {
191 					info->style_element_present = TRUE;
192 				}
193 			}
194 		} else if (g_ascii_strcasecmp (element_name, "w:rStyle") == 0) {
195 			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
196 				if (g_ascii_strcasecmp (*a, "w:val") != 0) {
197 					continue;
198 				}
199 
200 				if (g_ascii_strncasecmp (*v, "SubtleEmphasis", 14) == 0) {
201 					info->style_element_present = TRUE;
202 				} else if (g_ascii_strncasecmp (*v, "SubtleReference", 15) == 0) {
203 					info->style_element_present = TRUE;
204 				}
205 			}
206 		} else if (g_ascii_strcasecmp (element_name, "w:sz") == 0) {
207 			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
208 				if (g_ascii_strcasecmp (*a, "w:val") != 0) {
209 					continue;
210 				}
211 
212 				if (atoi (*v) >= 38) {
213 					info->style_element_present = TRUE;
214 				}
215 			}
216 		} else if (g_ascii_strcasecmp (element_name, "w:smartTag") == 0) {
217 			info->style_element_present = TRUE;
218 		} else if (g_ascii_strcasecmp (element_name, "w:sdtContent") == 0) {
219 			info->style_element_present = TRUE;
220 		} else if (g_ascii_strcasecmp (element_name, "w:hyperlink") == 0) {
221 			info->style_element_present = TRUE;
222 		} else if (g_ascii_strcasecmp (element_name, "w:t") == 0) {
223 			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
224 				if (g_ascii_strcasecmp (*a, "xml:space") != 0) {
225 					continue;
226 				}
227 
228 				if (g_ascii_strncasecmp (*v, "preserve", 8) == 0) {
229 					info->preserve_attribute_present = TRUE;
230 				}
231 			}
232 
233 			info->tag_type = MS_OFFICE_XML_TAG_WORD_TEXT;
234 		}
235 		break;
236 
237 	case FILE_TYPE_XLSX:
238 		if (g_ascii_strcasecmp (element_name, "sheet") == 0) {
239 			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
240 				if (g_ascii_strcasecmp (*a, "name") == 0) {
241 					info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
242 				}
243 			}
244 
245 		} else if (g_ascii_strcasecmp (element_name, "t") == 0) {
246 			info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
247 		}
248 		break;
249 
250 	case FILE_TYPE_PPTX:
251 	case FILE_TYPE_PPSX:
252 		info->tag_type = MS_OFFICE_XML_TAG_SLIDE_TEXT;
253 		break;
254 
255 	case FILE_TYPE_INVALID:
256 		g_message ("Microsoft document type:%d invalid", info->file_type);
257 		break;
258 	}
259 }
260 
261 static void
262 msoffice_xml_content_parse_stop (GMarkupParseContext  *context,
263                                  const gchar          *element_name,
264                                  gpointer              user_data,
265                                  GError              **error)
266 {
267 	MsOfficeXMLParserInfo *info = user_data;
268 
269 	if (g_ascii_strcasecmp (element_name, "w:p") == 0) {
270 		info->style_element_present = FALSE;
271 		info->preserve_attribute_present = FALSE;
272 	}
273 
274 	/* Reset tag */
275 	info->tag_type = MS_OFFICE_XML_TAG_INVALID;
276 }
277 
278 static void
279 msoffice_xml_content_parse (GMarkupParseContext  *context,
280                             const gchar          *text,
281                             gsize                 text_len,
282                             gpointer              user_data,
283                             GError              **error)
284 {
285 	MsOfficeXMLParserInfo *info = user_data;
286 	gsize written_bytes = 0;
287 
288 	/* If reached max bytes to extract, just return */
289 	if (info->bytes_pending == 0) {
290 		g_set_error_literal (error,
291 		                     maximum_size_error_quark,
292 		                     0,
293 		                     "Maximum text limit reached");
294 		return;
295 	}
296 
297 	/* Create content string if not already done before */
298 	if (G_UNLIKELY (info->content == NULL)) {
299 		info->content =	g_string_new ("");
300 	}
301 
302 	switch (info->tag_type) {
303 	case MS_OFFICE_XML_TAG_WORD_TEXT:
304 		tracker_text_validate_utf8 (text,
305 		                            MIN (text_len, info->bytes_pending),
306 		                            &info->content,
307 		                            &written_bytes);
308 		g_string_append_c (info->content, ' ');
309 		info->bytes_pending -= written_bytes;
310 		break;
311 
312 	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
313 		tracker_text_validate_utf8 (text,
314 		                            MIN (text_len, info->bytes_pending),
315 		                            &info->content,
316 		                            &written_bytes);
317 		g_string_append_c (info->content, ' ');
318 		info->bytes_pending -= written_bytes;
319 		break;
320 
321 	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
322 		if (atoi (text) == 0)  {
323 			tracker_text_validate_utf8 (text,
324 			                            MIN (text_len, info->bytes_pending),
325 			                            &info->content,
326 			                            &written_bytes);
327 			g_string_append_c (info->content, ' ');
328 			info->bytes_pending -= written_bytes;
329 		}
330 		break;
331 
332 	/* Ignore tags that may not happen inside the text subdocument */
333 	case MS_OFFICE_XML_TAG_TITLE:
334 	case MS_OFFICE_XML_TAG_SUBJECT:
335 	case MS_OFFICE_XML_TAG_AUTHOR:
336 	case MS_OFFICE_XML_TAG_COMMENTS:
337 	case MS_OFFICE_XML_TAG_CREATED:
338 	case MS_OFFICE_XML_TAG_GENERATOR:
339 	case MS_OFFICE_XML_TAG_APPLICATION:
340 	case MS_OFFICE_XML_TAG_MODIFIED:
341 	case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
342 	case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
343 	case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
344 	case MS_OFFICE_XML_TAG_NUM_OF_LINES:
345 	case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
346 	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
347 	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
348 	case MS_OFFICE_XML_TAG_INVALID:
349 		break;
350 	}
351 }
352 
353 /* ------------------------- METADATA files parsing -----------------------------------*/
354 
355 static void
356 msoffice_xml_metadata_parse_start (GMarkupParseContext  *context,
357                                    const gchar           *element_name,
358                                    const gchar          **attribute_names,
359                                    const gchar          **attribute_values,
360                                    gpointer               user_data,
361                                    GError               **error)
362 {
363 	MsOfficeXMLParserInfo *info = user_data;
364 
365 	/* Setup the proper tag type */
366 	if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
367 		info->tag_type = MS_OFFICE_XML_TAG_TITLE;
368 	} else if (g_ascii_strcasecmp (element_name, "dc:subject") == 0) {
369 		info->tag_type = MS_OFFICE_XML_TAG_SUBJECT;
370 	} else if (g_ascii_strcasecmp (element_name, "dc:creator") == 0) {
371 		info->tag_type = MS_OFFICE_XML_TAG_AUTHOR;
372 	} else if (g_ascii_strcasecmp (element_name, "dc:description") == 0) {
373 		info->tag_type = MS_OFFICE_XML_TAG_COMMENTS;
374 	} else if (g_ascii_strcasecmp (element_name, "dcterms:created") == 0) {
375 		info->tag_type = MS_OFFICE_XML_TAG_CREATED;
376 	} else if (g_ascii_strcasecmp (element_name, "meta:generator") == 0) {
377 		info->tag_type = MS_OFFICE_XML_TAG_GENERATOR;
378 	} else if (g_ascii_strcasecmp (element_name, "dcterms:modified") == 0) {
379 		info->tag_type = MS_OFFICE_XML_TAG_MODIFIED;
380 	} else if (g_ascii_strcasecmp (element_name, "Pages") == 0) {
381 		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PAGES;
382 	} else if (g_ascii_strcasecmp (element_name, "Slides") == 0) {
383 		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PAGES;
384 	} else if (g_ascii_strcasecmp (element_name, "Paragraphs") == 0) {
385 		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS;
386 	} else if (g_ascii_strcasecmp (element_name, "Characters") == 0) {
387 		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS;
388 	} else if (g_ascii_strcasecmp (element_name, "Words") == 0) {
389 		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_WORDS;
390 	} else if (g_ascii_strcasecmp (element_name, "Lines") == 0) {
391 		info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_LINES;
392 	} else if (g_ascii_strcasecmp (element_name, "Application") == 0) {
393 		info->tag_type = MS_OFFICE_XML_TAG_APPLICATION;
394 	} else {
395 		info->tag_type = MS_OFFICE_XML_TAG_INVALID;
396 	}
397 }
398 
399 static void
400 msoffice_xml_metadata_parse_stop (GMarkupParseContext  *context,
401                                   const gchar          *element_name,
402                                   gpointer              user_data,
403                                   GError              **error)
404 {
405 	/* Reset tag */
406 	((MsOfficeXMLParserInfo *)user_data)->tag_type = MS_OFFICE_XML_TAG_INVALID;
407 }
408 
409 static void
410 msoffice_xml_metadata_parse (GMarkupParseContext  *context,
411                              const gchar          *text,
412                              gsize                 text_len,
413                              gpointer              user_data,
414                              GError              **error)
415 {
416 	MsOfficeXMLParserInfo *info = user_data;
417 
418 	switch (info->tag_type) {
419 	/* Ignore tags that may not happen inside the core subdocument */
420 	case MS_OFFICE_XML_TAG_WORD_TEXT:
421 	case MS_OFFICE_XML_TAG_SLIDE_TEXT:
422 	case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
423 		break;
424 
425 	case MS_OFFICE_XML_TAG_TITLE:
426 		if (info->title_already_set) {
427 			g_warning ("Avoiding additional title (%s) in MsOffice XML document '%s'",
428 			           text, info->uri);
429 		} else {
430 			info->title_already_set = TRUE;
431 			tracker_sparql_builder_predicate (info->metadata, "nie:title");
432 			tracker_sparql_builder_object_unvalidated (info->metadata, text);
433 		}
434 		break;
435 
436 	case MS_OFFICE_XML_TAG_SUBJECT:
437 		tracker_sparql_builder_predicate (info->metadata, "nie:subject");
438 		tracker_sparql_builder_object_unvalidated (info->metadata, text);
439 		break;
440 
441 	case MS_OFFICE_XML_TAG_AUTHOR:
442 		tracker_sparql_builder_predicate (info->metadata, "nco:publisher");
443 
444 		tracker_sparql_builder_object_blank_open (info->metadata);
445 		tracker_sparql_builder_predicate (info->metadata, "a");
446 		tracker_sparql_builder_object (info->metadata, "nco:Contact");
447 
448 		tracker_sparql_builder_predicate (info->metadata, "nco:fullname");
449 		tracker_sparql_builder_object_unvalidated (info->metadata, text);
450 		tracker_sparql_builder_object_blank_close (info->metadata);
451 		break;
452 
453 	case MS_OFFICE_XML_TAG_COMMENTS:
454 		tracker_sparql_builder_predicate (info->metadata, "nie:comment");
455 		tracker_sparql_builder_object_unvalidated (info->metadata, text);
456 		break;
457 
458 	case MS_OFFICE_XML_TAG_CREATED: {
459 		gchar *date;
460 
461 		date = tracker_date_guess (text);
462 		tracker_sparql_builder_predicate (info->metadata, "nie:contentCreated");
463 		tracker_sparql_builder_object_unvalidated (info->metadata, date);
464 		g_free (date);
465 		break;
466 	}
467 
468 	case MS_OFFICE_XML_TAG_GENERATOR:
469 		if (info->generator_already_set) {
470 			g_warning ("Avoiding additional generator (%s) in MsOffice XML document '%s'",
471 			           text, info->uri);
472 		} else {
473 			info->generator_already_set = TRUE;
474 			tracker_sparql_builder_predicate (info->metadata, "nie:generator");
475 			tracker_sparql_builder_object_unvalidated (info->metadata, text);
476 		}
477 		break;
478 
479 	case MS_OFFICE_XML_TAG_APPLICATION:
480 		/* FIXME: Same code as MS_OFFICE_XML_TAG_GENERATOR should be
481 		 * used, but nie:generator has max cardinality of 1
482 		 * and this would cause errors.
483 		 */
484 		break;
485 
486 	case MS_OFFICE_XML_TAG_MODIFIED: {
487 		gchar *date;
488 
489                 date = tracker_date_guess (text);
490 		tracker_sparql_builder_predicate (info->metadata, "nie:contentLastModified");
491 		tracker_sparql_builder_object_unvalidated (info->metadata, date);
492                 g_free (date);
493 		break;
494 	}
495 
496 	case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
497 		tracker_sparql_builder_predicate (info->metadata, "nfo:pageCount");
498 		tracker_sparql_builder_object_unvalidated (info->metadata, text);
499 		break;
500 
501 	case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
502 		tracker_sparql_builder_predicate (info->metadata, "nfo:characterCount");
503 		tracker_sparql_builder_object_unvalidated (info->metadata, text);
504 		break;
505 
506 	case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
507 		tracker_sparql_builder_predicate (info->metadata, "nfo:wordCount");
508 		tracker_sparql_builder_object_unvalidated (info->metadata, text);
509 		break;
510 
511 	case MS_OFFICE_XML_TAG_NUM_OF_LINES:
512 		tracker_sparql_builder_predicate (info->metadata, "nfo:lineCount");
513 		tracker_sparql_builder_object_unvalidated (info->metadata, text);
514 		break;
515 
516 	case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
517 		/* TODO: There is no ontology for this. */
518 		break;
519 
520 	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
521 	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
522 		/* Nothing as we are using it in defining type of data */
523 		break;
524 
525 	case MS_OFFICE_XML_TAG_INVALID:
526 		/* Here we cant use log otheriwse it will print for other non useful files */
527 		break;
528 	}
529 }
530 
531 /* ------------------------- CONTENT-TYPES file parsing -----------------------------------*/
532 
533 static gboolean
534 xml_read (MsOfficeXMLParserInfo *parser_info,
535           const gchar           *xml_filename,
536           MsOfficeXMLTagType     type)
537 {
538 	GMarkupParseContext *context;
539 
540 	switch (type) {
541 	case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA: {
542 		/* Reset these flags before going on */
543 		parser_info->tag_type = MS_OFFICE_XML_TAG_INVALID;
544 
545 		context = g_markup_parse_context_new (&metadata_parser,
546 		                                      0,
547 		                                      parser_info,
548 		                                      NULL);
549 		break;
550 	}
551 	case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA: {
552 		/* Reset these flags before going on */
553 		parser_info->tag_type = MS_OFFICE_XML_TAG_INVALID;
554 		parser_info->style_element_present = FALSE;
555 		parser_info->preserve_attribute_present = FALSE;
556 
557 		context = g_markup_parse_context_new (&content_parser,
558 		                                      0,
559 		                                      parser_info,
560 		                                      NULL);
561 		break;
562 	}
563 	default:
564 		context = NULL;
565 		break;
566 	}
567 
568 	if (context) {
569 		GError *error = NULL;
570 
571 		/* Load the internal XML file from the Zip archive, and parse it
572 		 * using the given context */
573 		tracker_gsf_parse_xml_in_zip (parser_info->uri,
574 		                              xml_filename,
575 		                              context,
576 		                              &error);
577 		g_markup_parse_context_free (context);
578 
579 		if (error) {
580 			g_debug ("Parsing internal '%s' gave error: '%s'",
581 			         xml_filename,
582 			         error->message);
583 			g_error_free (error);
584 		}
585 	}
586 
587 	return TRUE;
588 }
589 
590 static gint
591 compare_slide_name (gconstpointer a,
592                     gconstpointer b)
593 {
594 	gchar *col_a, *col_b;
595 	gint result;
596 
597 	col_a = g_utf8_collate_key_for_filename (a, -1);
598 	col_b = g_utf8_collate_key_for_filename (b, -1);
599 	result = strcmp (col_a, col_b);
600 
601 	g_free (col_a);
602 	g_free (col_b);
603 
604 	return result;
605 }
606 
607 static void
608 msoffice_xml_content_types_parse_start (GMarkupParseContext  *context,
609                                         const gchar          *element_name,
610                                         const gchar         **attribute_names,
611                                         const gchar         **attribute_values,
612                                         gpointer              user_data,
613                                         GError              **error)
614 {
615 	MsOfficeXMLParserInfo *info = user_data;
616 	const gchar *part_name = NULL;
617 	const gchar *content_type = NULL;
618 	gint i;
619 
620 	if (g_ascii_strcasecmp (element_name, "Override") != 0) {
621 		return;
622 	}
623 
624 	/* Look for part name and content type */
625 	for (i = 0; attribute_names[i]; i++) {
626 		if (g_ascii_strcasecmp (attribute_names[i], "PartName") == 0) {
627 			part_name = attribute_values[i];
628 		} else if (g_ascii_strcasecmp (attribute_names[i], "ContentType") == 0) {
629 			content_type = attribute_values[i];
630 		}
631 	}
632 
633 	/* Both part_name and content_type MUST be NON-NULL */
634 	if (!part_name || !content_type) {
635 		g_message ("Invalid file (part_name:%s, content_type:%s)",
636 		           part_name ? part_name : "none",
637 		           content_type ? content_type : "none");
638 		return;
639 	}
640 
641 	/* Metadata part? */
642 	if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-package.core-properties+xml") == 0) ||
643 	    (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.extended-properties+xml") == 0)) {
644 		xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA);
645 		return;
646 	}
647 
648 	/* If the file type is unknown, skip trying to extract content */
649 	if (info->file_type == FILE_TYPE_INVALID) {
650 		g_message ("Invalid file type, not extracting content from '%s'",
651 		           part_name + 1);
652 		return;
653 	}
654 
655 	/* Content part? */
656 	if ((info->file_type == FILE_TYPE_DOCX &&
657 	     g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") == 0) ||
658 	    ((info->file_type == FILE_TYPE_PPTX || info->file_type == FILE_TYPE_PPSX) &&
659 	     (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.presentationml.slide+xml") == 0 ||
660 	      g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml") == 0)) ||
661 	    (info->file_type == FILE_TYPE_XLSX &&
662 	     (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml") == 0 ||
663 	      g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml") == 0))) {
664 		if (info->file_type == FILE_TYPE_PPTX) {
665 			info->parts = g_list_insert_sorted (info->parts, g_strdup (part_name + 1),
666 			                                    compare_slide_name);
667 		} else {
668 			info->parts = g_list_append (info->parts, g_strdup (part_name + 1));
669 		}
670 	}
671 }
672 
673 /* ------------------------- Main methods -----------------------------------*/
674 
675 static MsOfficeXMLFileType
676 msoffice_xml_get_file_type (const gchar *uri)
677 {
678 	GFile *file;
679 	GFileInfo *file_info;
680 	const gchar *mime_used;
681 	MsOfficeXMLFileType file_type;
682 
683 	/* Get GFile from uri... */
684 	file = g_file_new_for_uri (uri);
685 	if (!file) {
686 		g_warning ("Could not create GFile for URI:'%s'", uri);
687 		return FILE_TYPE_INVALID;
688 	}
689 
690 	/* Get GFileInfo from GFile... (synchronous) */
691 	file_info = g_file_query_info (file,
692 	                               G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
693 	                               G_FILE_QUERY_INFO_NONE,
694 	                               NULL,
695 	                               NULL);
696 	g_object_unref (file);
697 	if (!file_info) {
698 		g_warning ("Could not get GFileInfo for URI:'%s'", uri);
699 		return FILE_TYPE_INVALID;
700 	}
701 
702 	/* Get Content Type from GFileInfo. The constant string will be valid
703 	 * as long as the file info reference is valid */
704 	mime_used = g_file_info_get_content_type (file_info);
705 	if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.wordprocessingml.document") == 0) {
706 		/* MsOffice Word document */
707 		file_type = FILE_TYPE_DOCX;
708 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.presentation") == 0) {
709 		/* MsOffice Powerpoint document */
710 		file_type = FILE_TYPE_PPTX;
711 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.slideshow") == 0) {
712 		/* MsOffice Powerpoint (slideshow) document */
713 		file_type = FILE_TYPE_PPSX;
714 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") == 0) {
715 		/* MsOffice Excel document */
716 		file_type = FILE_TYPE_XLSX;
717 	} else {
718 		g_message ("Mime type was not recognised:'%s'", mime_used);
719 		file_type = FILE_TYPE_INVALID;
720 	}
721 
722 	g_object_unref (file_info);
723 
724 	return file_type;
725 }
726 
727 static void
728 extract_content (MsOfficeXMLParserInfo *info)
729 {
730 	GList *parts;
731 
732 	if (!info->parts) {
733 		return;
734 	}
735 
736 	for (parts = info->parts; parts; parts = parts->next) {
737 		const gchar *part_name;
738 
739 		part_name = parts->data;
740 		/* If reached max bytes to extract, don't event start parsing the file... just return */
741 		if (info->bytes_pending == 0) {
742 			g_debug ("Skipping '%s' as already reached max bytes to extract",
743 			         part_name);
744 			break;
745 		} else if (g_timer_elapsed (info->timer, NULL) > 5) {
746 			g_debug ("Skipping '%s' as already reached max time to extract",
747 			         part_name);
748 			break;
749 		} else {
750 			xml_read (info, part_name, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
751 		}
752 	}
753 }
754 
755 G_MODULE_EXPORT gboolean
756 tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
757 {
758 	MsOfficeXMLParserInfo info = { 0 };
759 	MsOfficeXMLFileType file_type;
760 	TrackerSparqlBuilder *metadata;
761 	TrackerConfig *config;
762 	GMarkupParseContext *context = NULL;
763 	GError *error = NULL;
764 	GFile *file;
765 	gchar *uri;
766 
767 	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
768 		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
769 	}
770 
771 	metadata = tracker_extract_info_get_metadata_builder (extract_info);
772 	file = tracker_extract_info_get_file (extract_info);
773 	uri = g_file_get_uri (file);
774 
775 	/* Get current Content Type */
776 	file_type = msoffice_xml_get_file_type (uri);
777 
778 	/* Setup conf */
779 	config = tracker_main_get_config ();
780 
781 	g_debug ("Extracting MsOffice XML format...");
782 
783 	tracker_sparql_builder_predicate (metadata, "a");
784 	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
785 
786 	/* Setup Parser info */
787 	info.metadata = metadata;
788 	info.file_type = file_type;
789 	info.tag_type = MS_OFFICE_XML_TAG_INVALID;
790 	info.style_element_present = FALSE;
791 	info.preserve_attribute_present = FALSE;
792 	info.uri = uri;
793 	info.content = NULL;
794 	info.title_already_set = FALSE;
795 	info.generator_already_set = FALSE;
796 	info.bytes_pending = tracker_config_get_max_bytes (config);
797 
798 	/* Create content-type parser context */
799 	context = g_markup_parse_context_new (&content_types_parser,
800 	                                      0,
801 	                                      &info,
802 	                                      NULL);
803 
804 	info.timer = g_timer_new ();
805 	/* Load the internal XML file from the Zip archive, and parse it
806 	 * using the given context */
807 	tracker_gsf_parse_xml_in_zip (uri,
808 	                              "[Content_Types].xml",
809 	                              context,
810 	                              &error);
811 	if (error) {
812 		g_debug ("Parsing the content-types file gave an error: '%s'",
813 		         error->message);
814 		g_error_free (error);
815 	}
816 
817 	extract_content (&info);
818 
819 	/* If we got any content, add it */
820 	if (info.content) {
821 		gchar *content;
822 
823 		content = g_string_free (info.content, FALSE);
824 		info.content = NULL;
825 
826 		if (content) {
827 			tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
828 			tracker_sparql_builder_object_unvalidated (metadata, content);
829 			g_free (content);
830 		}
831 	}
832 
833 	if (info.parts) {
834 		g_list_foreach (info.parts, (GFunc) g_free, NULL);
835 		g_list_free (info.parts);
836 	}
837 
838 	g_timer_destroy (info.timer);
839 	g_markup_parse_context_free (context);
840 	g_free (uri);
841 
842 	return TRUE;
843 }
tracker-0.16.2/src/tracker-extract/tracker-extract-msoffice-xml.c

Incomplete coverage