No issues found
Tool | Failure ID | Location | Function | Message | Data |
---|---|---|---|---|---|
clang-analyzer | no-output-found | tracker-extract-msoffice-xml.c | Message(text='Unable to locate XML output from invoke-clang-analyzer') | None |
1 /*
2 * Copyright (C) 2008-2010 Nokia <ivan.frade@nokia.com>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
18 */
19
20 #include "config.h"
21
22 #include <string.h>
23
24 #include <glib.h>
25
26 #include <gsf/gsf.h>
27 #include <gsf/gsf-doc-meta-data.h>
28 #include <gsf/gsf-infile.h>
29 #include <gsf/gsf-infile-msole.h>
30 #include <gsf/gsf-input-stdio.h>
31 #include <gsf/gsf-msole-utils.h>
32 #include <gsf/gsf-utils.h>
33 #include <gsf/gsf-infile-zip.h>
34
35 #include <libtracker-common/tracker-utils.h>
36 #include <libtracker-common/tracker-os-dependant.h>
37
38 #include <libtracker-extract/tracker-extract.h>
39
40 #include "tracker-main.h"
41 #include "tracker-gsf.h"
42
/*
 * Kind of XML element whose character data is currently being handled.
 * A single field of this type is shared by the metadata parser and the
 * content parser; both reset it to MS_OFFICE_XML_TAG_INVALID when an
 * element is closed.
 */
typedef enum {
	/* No element of interest is currently open */
	MS_OFFICE_XML_TAG_INVALID = 0,

	/* Metadata elements */
	MS_OFFICE_XML_TAG_TITLE,             /* dc:title */
	MS_OFFICE_XML_TAG_SUBJECT,           /* dc:subject */
	MS_OFFICE_XML_TAG_AUTHOR,            /* dc:creator */
	MS_OFFICE_XML_TAG_MODIFIED,          /* dcterms:modified */
	MS_OFFICE_XML_TAG_COMMENTS,          /* dc:description */
	MS_OFFICE_XML_TAG_CREATED,           /* dcterms:created */
	MS_OFFICE_XML_TAG_GENERATOR,         /* meta:generator */
	MS_OFFICE_XML_TAG_NUM_OF_PAGES,      /* Pages or Slides */
	MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS, /* Characters */
	MS_OFFICE_XML_TAG_NUM_OF_WORDS,      /* Words */
	MS_OFFICE_XML_TAG_NUM_OF_LINES,      /* Lines */
	MS_OFFICE_XML_TAG_APPLICATION,       /* Application */
	MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS, /* Paragraphs */

	/* Text-bearing elements found in the content parts */
	MS_OFFICE_XML_TAG_SLIDE_TEXT,
	MS_OFFICE_XML_TAG_WORD_TEXT,
	MS_OFFICE_XML_TAG_XLS_SHARED_TEXT,

	/* Not element tags: they select which parser xml_read() runs */
	MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA,
	MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA
} MsOfficeXMLTagType;
64
/*
 * Flavour of MS Office XML document being extracted, as derived from the
 * file's MIME type.
 */
typedef enum {
	FILE_TYPE_INVALID = 0, /* MIME type unknown or not recognised */
	FILE_TYPE_PPTX,        /* presentationml.presentation */
	FILE_TYPE_PPSX,        /* presentationml.slideshow */
	FILE_TYPE_DOCX,        /* wordprocessingml.document */
	FILE_TYPE_XLSX         /* spreadsheetml.sheet */
} MsOfficeXMLFileType;
72
73 typedef struct {
74 /* Common constant stuff */
75 const gchar *uri;
76 MsOfficeXMLFileType file_type;
77
78 /* Tag type, reused by Content and Metadata parsers */
79 MsOfficeXMLTagType tag_type;
80
81 /* Metadata-parsing specific things */
82 TrackerSparqlBuilder *metadata;
83 gboolean title_already_set;
84 gboolean generator_already_set;
85
86 /* Content-parsing specific things */
87 GString *content;
88 gulong bytes_pending;
89 gboolean style_element_present;
90 gboolean preserve_attribute_present;
91 GTimer *timer;
92 GList *parts;
93 } MsOfficeXMLParserInfo;
94
95 static void msoffice_xml_content_parse_start (GMarkupParseContext *context,
96 const gchar *element_name,
97 const gchar **attribute_names,
98 const gchar **attribute_values,
99 gpointer user_data,
100 GError **error);
101 static void msoffice_xml_content_parse_stop (GMarkupParseContext *context,
102 const gchar *element_name,
103 gpointer user_data,
104 GError **error);
105 static void msoffice_xml_content_parse (GMarkupParseContext *context,
106 const gchar *text,
107 gsize text_len,
108 gpointer user_data,
109 GError **error);
110
111 static void msoffice_xml_metadata_parse_start (GMarkupParseContext *context,
112 const gchar *element_name,
113 const gchar **attribute_names,
114 const gchar **attribute_values,
115 gpointer user_data,
116 GError **error);
117 static void msoffice_xml_metadata_parse_stop (GMarkupParseContext *context,
118 const gchar *element_name,
119 gpointer user_data,
120 GError **error);
121 static void msoffice_xml_metadata_parse (GMarkupParseContext *context,
122 const gchar *text,
123 gsize text_len,
124 gpointer user_data,
125 GError **error);
126
127 static void msoffice_xml_content_types_parse_start (GMarkupParseContext *context,
128 const gchar *element_name,
129 const gchar **attribute_names,
130 const gchar **attribute_values,
131 gpointer user_data,
132 GError **error);
133
134 static const GMarkupParser metadata_parser = {
135 msoffice_xml_metadata_parse_start,
136 msoffice_xml_metadata_parse_stop,
137 msoffice_xml_metadata_parse,
138 NULL,
139 NULL
140 };
141
142 static const GMarkupParser content_parser = {
143 msoffice_xml_content_parse_start,
144 msoffice_xml_content_parse_stop,
145 msoffice_xml_content_parse,
146 NULL,
147 NULL
148 };
149
150 static const GMarkupParser content_types_parser = {
151 msoffice_xml_content_types_parse_start,
152 NULL,
153 NULL,
154 NULL,
155 NULL
156 };
157
158 static GQuark maximum_size_error_quark = 0;
159
160 /* ------------------------- CONTENT files parsing -----------------------------------*/
161
162 static void
163 msoffice_xml_content_parse_start (GMarkupParseContext *context,
164 const gchar *element_name,
165 const gchar **attribute_names,
166 const gchar **attribute_values,
167 gpointer user_data,
168 GError **error)
169 {
170 MsOfficeXMLParserInfo *info = user_data;
171 const gchar **a;
172 const gchar **v;
173
174 switch (info->file_type) {
175 case FILE_TYPE_DOCX:
176 if (g_ascii_strcasecmp (element_name, "w:pStyle") == 0) {
177 for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
178 if (g_ascii_strcasecmp (*a, "w:val") != 0) {
179 continue;
180 }
181
182 if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) {
183 info->style_element_present = TRUE;
184 } else if (g_ascii_strncasecmp (*v, "TOC", 3) == 0) {
185 info->style_element_present = TRUE;
186 } else if (g_ascii_strncasecmp (*v, "Section", 7) == 0) {
187 info->style_element_present = TRUE;
188 } else if (g_ascii_strncasecmp (*v, "Title", 5) == 0) {
189 info->style_element_present = TRUE;
190 } else if (g_ascii_strncasecmp (*v, "Subtitle", 8) == 0) {
191 info->style_element_present = TRUE;
192 }
193 }
194 } else if (g_ascii_strcasecmp (element_name, "w:rStyle") == 0) {
195 for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
196 if (g_ascii_strcasecmp (*a, "w:val") != 0) {
197 continue;
198 }
199
200 if (g_ascii_strncasecmp (*v, "SubtleEmphasis", 14) == 0) {
201 info->style_element_present = TRUE;
202 } else if (g_ascii_strncasecmp (*v, "SubtleReference", 15) == 0) {
203 info->style_element_present = TRUE;
204 }
205 }
206 } else if (g_ascii_strcasecmp (element_name, "w:sz") == 0) {
207 for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
208 if (g_ascii_strcasecmp (*a, "w:val") != 0) {
209 continue;
210 }
211
212 if (atoi (*v) >= 38) {
213 info->style_element_present = TRUE;
214 }
215 }
216 } else if (g_ascii_strcasecmp (element_name, "w:smartTag") == 0) {
217 info->style_element_present = TRUE;
218 } else if (g_ascii_strcasecmp (element_name, "w:sdtContent") == 0) {
219 info->style_element_present = TRUE;
220 } else if (g_ascii_strcasecmp (element_name, "w:hyperlink") == 0) {
221 info->style_element_present = TRUE;
222 } else if (g_ascii_strcasecmp (element_name, "w:t") == 0) {
223 for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
224 if (g_ascii_strcasecmp (*a, "xml:space") != 0) {
225 continue;
226 }
227
228 if (g_ascii_strncasecmp (*v, "preserve", 8) == 0) {
229 info->preserve_attribute_present = TRUE;
230 }
231 }
232
233 info->tag_type = MS_OFFICE_XML_TAG_WORD_TEXT;
234 }
235 break;
236
237 case FILE_TYPE_XLSX:
238 if (g_ascii_strcasecmp (element_name, "sheet") == 0) {
239 for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
240 if (g_ascii_strcasecmp (*a, "name") == 0) {
241 info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
242 }
243 }
244
245 } else if (g_ascii_strcasecmp (element_name, "t") == 0) {
246 info->tag_type = MS_OFFICE_XML_TAG_XLS_SHARED_TEXT;
247 }
248 break;
249
250 case FILE_TYPE_PPTX:
251 case FILE_TYPE_PPSX:
252 info->tag_type = MS_OFFICE_XML_TAG_SLIDE_TEXT;
253 break;
254
255 case FILE_TYPE_INVALID:
256 g_message ("Microsoft document type:%d invalid", info->file_type);
257 break;
258 }
259 }
260
261 static void
262 msoffice_xml_content_parse_stop (GMarkupParseContext *context,
263 const gchar *element_name,
264 gpointer user_data,
265 GError **error)
266 {
267 MsOfficeXMLParserInfo *info = user_data;
268
269 if (g_ascii_strcasecmp (element_name, "w:p") == 0) {
270 info->style_element_present = FALSE;
271 info->preserve_attribute_present = FALSE;
272 }
273
274 /* Reset tag */
275 info->tag_type = MS_OFFICE_XML_TAG_INVALID;
276 }
277
278 static void
279 msoffice_xml_content_parse (GMarkupParseContext *context,
280 const gchar *text,
281 gsize text_len,
282 gpointer user_data,
283 GError **error)
284 {
285 MsOfficeXMLParserInfo *info = user_data;
286 gsize written_bytes = 0;
287
288 /* If reached max bytes to extract, just return */
289 if (info->bytes_pending == 0) {
290 g_set_error_literal (error,
291 maximum_size_error_quark,
292 0,
293 "Maximum text limit reached");
294 return;
295 }
296
297 /* Create content string if not already done before */
298 if (G_UNLIKELY (info->content == NULL)) {
299 info->content = g_string_new ("");
300 }
301
302 switch (info->tag_type) {
303 case MS_OFFICE_XML_TAG_WORD_TEXT:
304 tracker_text_validate_utf8 (text,
305 MIN (text_len, info->bytes_pending),
306 &info->content,
307 &written_bytes);
308 g_string_append_c (info->content, ' ');
309 info->bytes_pending -= written_bytes;
310 break;
311
312 case MS_OFFICE_XML_TAG_SLIDE_TEXT:
313 tracker_text_validate_utf8 (text,
314 MIN (text_len, info->bytes_pending),
315 &info->content,
316 &written_bytes);
317 g_string_append_c (info->content, ' ');
318 info->bytes_pending -= written_bytes;
319 break;
320
321 case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
322 if (atoi (text) == 0) {
323 tracker_text_validate_utf8 (text,
324 MIN (text_len, info->bytes_pending),
325 &info->content,
326 &written_bytes);
327 g_string_append_c (info->content, ' ');
328 info->bytes_pending -= written_bytes;
329 }
330 break;
331
332 /* Ignore tags that may not happen inside the text subdocument */
333 case MS_OFFICE_XML_TAG_TITLE:
334 case MS_OFFICE_XML_TAG_SUBJECT:
335 case MS_OFFICE_XML_TAG_AUTHOR:
336 case MS_OFFICE_XML_TAG_COMMENTS:
337 case MS_OFFICE_XML_TAG_CREATED:
338 case MS_OFFICE_XML_TAG_GENERATOR:
339 case MS_OFFICE_XML_TAG_APPLICATION:
340 case MS_OFFICE_XML_TAG_MODIFIED:
341 case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
342 case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
343 case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
344 case MS_OFFICE_XML_TAG_NUM_OF_LINES:
345 case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
346 case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
347 case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
348 case MS_OFFICE_XML_TAG_INVALID:
349 break;
350 }
351 }
352
353 /* ------------------------- METADATA files parsing -----------------------------------*/
354
355 static void
356 msoffice_xml_metadata_parse_start (GMarkupParseContext *context,
357 const gchar *element_name,
358 const gchar **attribute_names,
359 const gchar **attribute_values,
360 gpointer user_data,
361 GError **error)
362 {
363 MsOfficeXMLParserInfo *info = user_data;
364
365 /* Setup the proper tag type */
366 if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
367 info->tag_type = MS_OFFICE_XML_TAG_TITLE;
368 } else if (g_ascii_strcasecmp (element_name, "dc:subject") == 0) {
369 info->tag_type = MS_OFFICE_XML_TAG_SUBJECT;
370 } else if (g_ascii_strcasecmp (element_name, "dc:creator") == 0) {
371 info->tag_type = MS_OFFICE_XML_TAG_AUTHOR;
372 } else if (g_ascii_strcasecmp (element_name, "dc:description") == 0) {
373 info->tag_type = MS_OFFICE_XML_TAG_COMMENTS;
374 } else if (g_ascii_strcasecmp (element_name, "dcterms:created") == 0) {
375 info->tag_type = MS_OFFICE_XML_TAG_CREATED;
376 } else if (g_ascii_strcasecmp (element_name, "meta:generator") == 0) {
377 info->tag_type = MS_OFFICE_XML_TAG_GENERATOR;
378 } else if (g_ascii_strcasecmp (element_name, "dcterms:modified") == 0) {
379 info->tag_type = MS_OFFICE_XML_TAG_MODIFIED;
380 } else if (g_ascii_strcasecmp (element_name, "Pages") == 0) {
381 info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PAGES;
382 } else if (g_ascii_strcasecmp (element_name, "Slides") == 0) {
383 info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PAGES;
384 } else if (g_ascii_strcasecmp (element_name, "Paragraphs") == 0) {
385 info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS;
386 } else if (g_ascii_strcasecmp (element_name, "Characters") == 0) {
387 info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS;
388 } else if (g_ascii_strcasecmp (element_name, "Words") == 0) {
389 info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_WORDS;
390 } else if (g_ascii_strcasecmp (element_name, "Lines") == 0) {
391 info->tag_type = MS_OFFICE_XML_TAG_NUM_OF_LINES;
392 } else if (g_ascii_strcasecmp (element_name, "Application") == 0) {
393 info->tag_type = MS_OFFICE_XML_TAG_APPLICATION;
394 } else {
395 info->tag_type = MS_OFFICE_XML_TAG_INVALID;
396 }
397 }
398
399 static void
400 msoffice_xml_metadata_parse_stop (GMarkupParseContext *context,
401 const gchar *element_name,
402 gpointer user_data,
403 GError **error)
404 {
405 /* Reset tag */
406 ((MsOfficeXMLParserInfo *)user_data)->tag_type = MS_OFFICE_XML_TAG_INVALID;
407 }
408
409 static void
410 msoffice_xml_metadata_parse (GMarkupParseContext *context,
411 const gchar *text,
412 gsize text_len,
413 gpointer user_data,
414 GError **error)
415 {
416 MsOfficeXMLParserInfo *info = user_data;
417
418 switch (info->tag_type) {
419 /* Ignore tags that may not happen inside the core subdocument */
420 case MS_OFFICE_XML_TAG_WORD_TEXT:
421 case MS_OFFICE_XML_TAG_SLIDE_TEXT:
422 case MS_OFFICE_XML_TAG_XLS_SHARED_TEXT:
423 break;
424
425 case MS_OFFICE_XML_TAG_TITLE:
426 if (info->title_already_set) {
427 g_warning ("Avoiding additional title (%s) in MsOffice XML document '%s'",
428 text, info->uri);
429 } else {
430 info->title_already_set = TRUE;
431 tracker_sparql_builder_predicate (info->metadata, "nie:title");
432 tracker_sparql_builder_object_unvalidated (info->metadata, text);
433 }
434 break;
435
436 case MS_OFFICE_XML_TAG_SUBJECT:
437 tracker_sparql_builder_predicate (info->metadata, "nie:subject");
438 tracker_sparql_builder_object_unvalidated (info->metadata, text);
439 break;
440
441 case MS_OFFICE_XML_TAG_AUTHOR:
442 tracker_sparql_builder_predicate (info->metadata, "nco:publisher");
443
444 tracker_sparql_builder_object_blank_open (info->metadata);
445 tracker_sparql_builder_predicate (info->metadata, "a");
446 tracker_sparql_builder_object (info->metadata, "nco:Contact");
447
448 tracker_sparql_builder_predicate (info->metadata, "nco:fullname");
449 tracker_sparql_builder_object_unvalidated (info->metadata, text);
450 tracker_sparql_builder_object_blank_close (info->metadata);
451 break;
452
453 case MS_OFFICE_XML_TAG_COMMENTS:
454 tracker_sparql_builder_predicate (info->metadata, "nie:comment");
455 tracker_sparql_builder_object_unvalidated (info->metadata, text);
456 break;
457
458 case MS_OFFICE_XML_TAG_CREATED: {
459 gchar *date;
460
461 date = tracker_date_guess (text);
462 tracker_sparql_builder_predicate (info->metadata, "nie:contentCreated");
463 tracker_sparql_builder_object_unvalidated (info->metadata, date);
464 g_free (date);
465 break;
466 }
467
468 case MS_OFFICE_XML_TAG_GENERATOR:
469 if (info->generator_already_set) {
470 g_warning ("Avoiding additional generator (%s) in MsOffice XML document '%s'",
471 text, info->uri);
472 } else {
473 info->generator_already_set = TRUE;
474 tracker_sparql_builder_predicate (info->metadata, "nie:generator");
475 tracker_sparql_builder_object_unvalidated (info->metadata, text);
476 }
477 break;
478
479 case MS_OFFICE_XML_TAG_APPLICATION:
480 /* FIXME: Same code as MS_OFFICE_XML_TAG_GENERATOR should be
481 * used, but nie:generator has max cardinality of 1
482 * and this would cause errors.
483 */
484 break;
485
486 case MS_OFFICE_XML_TAG_MODIFIED: {
487 gchar *date;
488
489 date = tracker_date_guess (text);
490 tracker_sparql_builder_predicate (info->metadata, "nie:contentLastModified");
491 tracker_sparql_builder_object_unvalidated (info->metadata, date);
492 g_free (date);
493 break;
494 }
495
496 case MS_OFFICE_XML_TAG_NUM_OF_PAGES:
497 tracker_sparql_builder_predicate (info->metadata, "nfo:pageCount");
498 tracker_sparql_builder_object_unvalidated (info->metadata, text);
499 break;
500
501 case MS_OFFICE_XML_TAG_NUM_OF_CHARACTERS:
502 tracker_sparql_builder_predicate (info->metadata, "nfo:characterCount");
503 tracker_sparql_builder_object_unvalidated (info->metadata, text);
504 break;
505
506 case MS_OFFICE_XML_TAG_NUM_OF_WORDS:
507 tracker_sparql_builder_predicate (info->metadata, "nfo:wordCount");
508 tracker_sparql_builder_object_unvalidated (info->metadata, text);
509 break;
510
511 case MS_OFFICE_XML_TAG_NUM_OF_LINES:
512 tracker_sparql_builder_predicate (info->metadata, "nfo:lineCount");
513 tracker_sparql_builder_object_unvalidated (info->metadata, text);
514 break;
515
516 case MS_OFFICE_XML_TAG_NUM_OF_PARAGRAPHS:
517 /* TODO: There is no ontology for this. */
518 break;
519
520 case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA:
521 case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA:
522 /* Nothing as we are using it in defining type of data */
523 break;
524
525 case MS_OFFICE_XML_TAG_INVALID:
526 /* Here we cant use log otheriwse it will print for other non useful files */
527 break;
528 }
529 }
530
531 /* ------------------------- CONTENT-TYPES file parsing -----------------------------------*/
532
533 static gboolean
534 xml_read (MsOfficeXMLParserInfo *parser_info,
535 const gchar *xml_filename,
536 MsOfficeXMLTagType type)
537 {
538 GMarkupParseContext *context;
539
540 switch (type) {
541 case MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA: {
542 /* Reset these flags before going on */
543 parser_info->tag_type = MS_OFFICE_XML_TAG_INVALID;
544
545 context = g_markup_parse_context_new (&metadata_parser,
546 0,
547 parser_info,
548 NULL);
549 break;
550 }
551 case MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA: {
552 /* Reset these flags before going on */
553 parser_info->tag_type = MS_OFFICE_XML_TAG_INVALID;
554 parser_info->style_element_present = FALSE;
555 parser_info->preserve_attribute_present = FALSE;
556
557 context = g_markup_parse_context_new (&content_parser,
558 0,
559 parser_info,
560 NULL);
561 break;
562 }
563 default:
564 context = NULL;
565 break;
566 }
567
568 if (context) {
569 GError *error = NULL;
570
571 /* Load the internal XML file from the Zip archive, and parse it
572 * using the given context */
573 tracker_gsf_parse_xml_in_zip (parser_info->uri,
574 xml_filename,
575 context,
576 &error);
577 g_markup_parse_context_free (context);
578
579 if (error) {
580 g_debug ("Parsing internal '%s' gave error: '%s'",
581 xml_filename,
582 error->message);
583 g_error_free (error);
584 }
585 }
586
587 return TRUE;
588 }
589
590 static gint
591 compare_slide_name (gconstpointer a,
592 gconstpointer b)
593 {
594 gchar *col_a, *col_b;
595 gint result;
596
597 col_a = g_utf8_collate_key_for_filename (a, -1);
598 col_b = g_utf8_collate_key_for_filename (b, -1);
599 result = strcmp (col_a, col_b);
600
601 g_free (col_a);
602 g_free (col_b);
603
604 return result;
605 }
606
607 static void
608 msoffice_xml_content_types_parse_start (GMarkupParseContext *context,
609 const gchar *element_name,
610 const gchar **attribute_names,
611 const gchar **attribute_values,
612 gpointer user_data,
613 GError **error)
614 {
615 MsOfficeXMLParserInfo *info = user_data;
616 const gchar *part_name = NULL;
617 const gchar *content_type = NULL;
618 gint i;
619
620 if (g_ascii_strcasecmp (element_name, "Override") != 0) {
621 return;
622 }
623
624 /* Look for part name and content type */
625 for (i = 0; attribute_names[i]; i++) {
626 if (g_ascii_strcasecmp (attribute_names[i], "PartName") == 0) {
627 part_name = attribute_values[i];
628 } else if (g_ascii_strcasecmp (attribute_names[i], "ContentType") == 0) {
629 content_type = attribute_values[i];
630 }
631 }
632
633 /* Both part_name and content_type MUST be NON-NULL */
634 if (!part_name || !content_type) {
635 g_message ("Invalid file (part_name:%s, content_type:%s)",
636 part_name ? part_name : "none",
637 content_type ? content_type : "none");
638 return;
639 }
640
641 /* Metadata part? */
642 if ((g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-package.core-properties+xml") == 0) ||
643 (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.extended-properties+xml") == 0)) {
644 xml_read (info, part_name + 1, MS_OFFICE_XML_TAG_DOCUMENT_CORE_DATA);
645 return;
646 }
647
648 /* If the file type is unknown, skip trying to extract content */
649 if (info->file_type == FILE_TYPE_INVALID) {
650 g_message ("Invalid file type, not extracting content from '%s'",
651 part_name + 1);
652 return;
653 }
654
655 /* Content part? */
656 if ((info->file_type == FILE_TYPE_DOCX &&
657 g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml") == 0) ||
658 ((info->file_type == FILE_TYPE_PPTX || info->file_type == FILE_TYPE_PPSX) &&
659 (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.presentationml.slide+xml") == 0 ||
660 g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.drawingml.diagramData+xml") == 0)) ||
661 (info->file_type == FILE_TYPE_XLSX &&
662 (g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml") == 0 ||
663 g_ascii_strcasecmp (content_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml") == 0))) {
664 if (info->file_type == FILE_TYPE_PPTX) {
665 info->parts = g_list_insert_sorted (info->parts, g_strdup (part_name + 1),
666 compare_slide_name);
667 } else {
668 info->parts = g_list_append (info->parts, g_strdup (part_name + 1));
669 }
670 }
671 }
672
673 /* ------------------------- Main methods -----------------------------------*/
674
675 static MsOfficeXMLFileType
676 msoffice_xml_get_file_type (const gchar *uri)
677 {
678 GFile *file;
679 GFileInfo *file_info;
680 const gchar *mime_used;
681 MsOfficeXMLFileType file_type;
682
683 /* Get GFile from uri... */
684 file = g_file_new_for_uri (uri);
685 if (!file) {
686 g_warning ("Could not create GFile for URI:'%s'", uri);
687 return FILE_TYPE_INVALID;
688 }
689
690 /* Get GFileInfo from GFile... (synchronous) */
691 file_info = g_file_query_info (file,
692 G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
693 G_FILE_QUERY_INFO_NONE,
694 NULL,
695 NULL);
696 g_object_unref (file);
697 if (!file_info) {
698 g_warning ("Could not get GFileInfo for URI:'%s'", uri);
699 return FILE_TYPE_INVALID;
700 }
701
702 /* Get Content Type from GFileInfo. The constant string will be valid
703 * as long as the file info reference is valid */
704 mime_used = g_file_info_get_content_type (file_info);
705 if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.wordprocessingml.document") == 0) {
706 /* MsOffice Word document */
707 file_type = FILE_TYPE_DOCX;
708 } else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.presentation") == 0) {
709 /* MsOffice Powerpoint document */
710 file_type = FILE_TYPE_PPTX;
711 } else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.presentationml.slideshow") == 0) {
712 /* MsOffice Powerpoint (slideshow) document */
713 file_type = FILE_TYPE_PPSX;
714 } else if (g_ascii_strcasecmp (mime_used, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") == 0) {
715 /* MsOffice Excel document */
716 file_type = FILE_TYPE_XLSX;
717 } else {
718 g_message ("Mime type was not recognised:'%s'", mime_used);
719 file_type = FILE_TYPE_INVALID;
720 }
721
722 g_object_unref (file_info);
723
724 return file_type;
725 }
726
727 static void
728 extract_content (MsOfficeXMLParserInfo *info)
729 {
730 GList *parts;
731
732 if (!info->parts) {
733 return;
734 }
735
736 for (parts = info->parts; parts; parts = parts->next) {
737 const gchar *part_name;
738
739 part_name = parts->data;
740 /* If reached max bytes to extract, don't event start parsing the file... just return */
741 if (info->bytes_pending == 0) {
742 g_debug ("Skipping '%s' as already reached max bytes to extract",
743 part_name);
744 break;
745 } else if (g_timer_elapsed (info->timer, NULL) > 5) {
746 g_debug ("Skipping '%s' as already reached max time to extract",
747 part_name);
748 break;
749 } else {
750 xml_read (info, part_name, MS_OFFICE_XML_TAG_DOCUMENT_TEXT_DATA);
751 }
752 }
753 }
754
755 G_MODULE_EXPORT gboolean
756 tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
757 {
758 MsOfficeXMLParserInfo info = { 0 };
759 MsOfficeXMLFileType file_type;
760 TrackerSparqlBuilder *metadata;
761 TrackerConfig *config;
762 GMarkupParseContext *context = NULL;
763 GError *error = NULL;
764 GFile *file;
765 gchar *uri;
766
767 if (G_UNLIKELY (maximum_size_error_quark == 0)) {
768 maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
769 }
770
771 metadata = tracker_extract_info_get_metadata_builder (extract_info);
772 file = tracker_extract_info_get_file (extract_info);
773 uri = g_file_get_uri (file);
774
775 /* Get current Content Type */
776 file_type = msoffice_xml_get_file_type (uri);
777
778 /* Setup conf */
779 config = tracker_main_get_config ();
780
781 g_debug ("Extracting MsOffice XML format...");
782
783 tracker_sparql_builder_predicate (metadata, "a");
784 tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
785
786 /* Setup Parser info */
787 info.metadata = metadata;
788 info.file_type = file_type;
789 info.tag_type = MS_OFFICE_XML_TAG_INVALID;
790 info.style_element_present = FALSE;
791 info.preserve_attribute_present = FALSE;
792 info.uri = uri;
793 info.content = NULL;
794 info.title_already_set = FALSE;
795 info.generator_already_set = FALSE;
796 info.bytes_pending = tracker_config_get_max_bytes (config);
797
798 /* Create content-type parser context */
799 context = g_markup_parse_context_new (&content_types_parser,
800 0,
801 &info,
802 NULL);
803
804 info.timer = g_timer_new ();
805 /* Load the internal XML file from the Zip archive, and parse it
806 * using the given context */
807 tracker_gsf_parse_xml_in_zip (uri,
808 "[Content_Types].xml",
809 context,
810 &error);
811 if (error) {
812 g_debug ("Parsing the content-types file gave an error: '%s'",
813 error->message);
814 g_error_free (error);
815 }
816
817 extract_content (&info);
818
819 /* If we got any content, add it */
820 if (info.content) {
821 gchar *content;
822
823 content = g_string_free (info.content, FALSE);
824 info.content = NULL;
825
826 if (content) {
827 tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
828 tracker_sparql_builder_object_unvalidated (metadata, content);
829 g_free (content);
830 }
831 }
832
833 if (info.parts) {
834 g_list_foreach (info.parts, (GFunc) g_free, NULL);
835 g_list_free (info.parts);
836 }
837
838 g_timer_destroy (info.timer);
839 g_markup_parse_context_free (context);
840 g_free (uri);
841
842 return TRUE;
843 }