No issues found
1 /*
2 * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
3 * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the
17 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21 #include <libtracker-common/tracker-os-dependant.h>
22
23 #include <libtracker-extract/tracker-extract.h>
24
25 #include "tracker-main.h"
26 #include "tracker-gsf.h"
27 #include "tracker-read.h"
28
29 #include <unistd.h>
30
/* Identifies the kind of XML element the parser is currently inside.
 *
 * The values up to ODT_TAG_TYPE_GENERATOR are assigned while parsing
 * meta.xml; the *_TEXT values are assigned while parsing content.xml,
 * one per supported document flavour. */
typedef enum {
	ODT_TAG_TYPE_UNKNOWN = 0,       /* initial state of both parse infos */
	ODT_TAG_TYPE_TITLE,             /* <dc:title> */
	ODT_TAG_TYPE_SUBJECT,           /* <dc:subject> */
	ODT_TAG_TYPE_AUTHOR,            /* <dc:creator> */
	ODT_TAG_TYPE_KEYWORDS,          /* <meta:keyword> */
	ODT_TAG_TYPE_COMMENTS,          /* <dc:description> */
	ODT_TAG_TYPE_STATS,             /* <meta:document-statistic> */
	ODT_TAG_TYPE_CREATED,           /* <meta:creation-date> */
	ODT_TAG_TYPE_GENERATOR,         /* <meta:generator> */
	ODT_TAG_TYPE_WORD_TEXT,         /* text element of a FILE_TYPE_ODT document */
	ODT_TAG_TYPE_SLIDE_TEXT,        /* any element of a FILE_TYPE_ODP document */
	ODT_TAG_TYPE_SPREADSHEET_TEXT,  /* "text..." element of a FILE_TYPE_ODS document */
	ODT_TAG_TYPE_GRAPHICS_TEXT      /* "text..." element of a FILE_TYPE_ODG document */
} ODTTagType;
46
/* Document flavour, selected from the MIME type in
 * tracker_extract_get_metadata() and used by the content.xml start-element
 * handler to decide which elements carry text. */
typedef enum {
	FILE_TYPE_INVALID = 0,  /* MIME type was not recognised */
	FILE_TYPE_ODP,          /* application/vnd.oasis.opendocument.presentation */
	FILE_TYPE_ODT,          /* application/vnd.oasis.opendocument.text */
	FILE_TYPE_ODS,          /* application/vnd.oasis.opendocument.spreadsheet */
	FILE_TYPE_ODG           /* application/vnd.oasis.opendocument.graphics */
} ODTFileType;
54
55 typedef struct {
56 TrackerSparqlBuilder *metadata;
57 ODTTagType current;
58 const gchar *uri;
59 gboolean title_already_set;
60 } ODTMetadataParseInfo;
61
62 typedef struct {
63 ODTTagType current;
64 ODTFileType file_type;
65 GString *content;
66 gulong bytes_pending;
67 } ODTContentParseInfo;
68
69 GQuark maximum_size_error_quark = 0;
70
71 static void xml_start_element_handler_metadata (GMarkupParseContext *context,
72 const gchar *element_name,
73 const gchar **attribute_names,
74 const gchar **attribute_values,
75 gpointer user_data,
76 GError **error);
77 static void xml_end_element_handler_metadata (GMarkupParseContext *context,
78 const gchar *element_name,
79 gpointer user_data,
80 GError **error);
81 static void xml_text_handler_metadata (GMarkupParseContext *context,
82 const gchar *text,
83 gsize text_len,
84 gpointer user_data,
85 GError **error);
86 static void xml_start_element_handler_content (GMarkupParseContext *context,
87 const gchar *element_name,
88 const gchar **attribute_names,
89 const gchar **attribute_values,
90 gpointer user_data,
91 GError **error);
92 static void xml_end_element_handler_content (GMarkupParseContext *context,
93 const gchar *element_name,
94 gpointer user_data,
95 GError **error);
96 static void xml_text_handler_content (GMarkupParseContext *context,
97 const gchar *text,
98 gsize text_len,
99 gpointer user_data,
100 GError **error);
101 static void extract_oasis_content (const gchar *uri,
102 gulong total_bytes,
103 ODTFileType file_type,
104 TrackerSparqlBuilder *metadata);
105
106 static void
107 extract_oasis_content (const gchar *uri,
108 gulong total_bytes,
109 ODTFileType file_type,
110 TrackerSparqlBuilder *metadata)
111 {
112 gchar *content = NULL;
113 ODTContentParseInfo info;
114 GMarkupParseContext *context;
115 GError *error = NULL;
116 GMarkupParser parser = {
117 xml_start_element_handler_content,
118 xml_end_element_handler_content,
119 xml_text_handler_content,
120 NULL,
121 NULL
122 };
123
124 /* If no content requested, return */
125 if (total_bytes == 0) {
126 return;
127 }
128
129 /* Create parse info */
130 info.current = ODT_TAG_TYPE_UNKNOWN;
131 info.file_type = file_type;
132 info.content = g_string_new ("");
133 info.bytes_pending = total_bytes;
134
135 /* Create parsing context */
136 context = g_markup_parse_context_new (&parser, 0, &info, NULL);
137
138 /* Load the internal XML file from the Zip archive, and parse it
139 * using the given context */
140 tracker_gsf_parse_xml_in_zip (uri, "content.xml", context, &error);
141
142 if (!error || g_error_matches (error, maximum_size_error_quark, 0)) {
143 content = g_string_free (info.content, FALSE);
144 tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
145 tracker_sparql_builder_object_unvalidated (metadata, content);
146 } else {
147 g_warning ("Got error parsing XML file: %s\n", error->message);
148 g_string_free (info.content, TRUE);
149 }
150
151 if (error) {
152 g_error_free (error);
153 }
154
155 g_free (content);
156 g_markup_parse_context_free (context);
157 }
158
159 G_MODULE_EXPORT gboolean
160 tracker_extract_get_metadata (TrackerExtractInfo *extract_info)
161 {
162 TrackerSparqlBuilder *metadata;
163 TrackerConfig *config;
164 ODTMetadataParseInfo info;
165 ODTFileType file_type;
166 GFile *file;
167 gchar *uri;
168 const gchar *mime_used;
169 GMarkupParseContext *context;
170 GMarkupParser parser = {
171 xml_start_element_handler_metadata,
172 xml_end_element_handler_metadata,
173 xml_text_handler_metadata,
174 NULL,
175 NULL
176 };
177
178 if (G_UNLIKELY (maximum_size_error_quark == 0)) {
179 maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
180 }
181
182 metadata = tracker_extract_info_get_metadata_builder (extract_info);
183 mime_used = tracker_extract_info_get_mimetype (extract_info);
184
185 file = tracker_extract_info_get_file (extract_info);
186 uri = g_file_get_uri (file);
187
188 /* Setup conf */
189 config = tracker_main_get_config ();
190
191 g_debug ("Extracting OASIS metadata and contents from '%s'", uri);
192
193 /* First, parse metadata */
194
195 tracker_sparql_builder_predicate (metadata, "a");
196 tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
197
198 /* Create parse info */
199 info.metadata = metadata;
200 info.current = ODT_TAG_TYPE_UNKNOWN;
201 info.uri = uri;
202 info.title_already_set = FALSE;
203
204 /* Create parsing context */
205 context = g_markup_parse_context_new (&parser, 0, &info, NULL);
206
207 /* Load the internal XML file from the Zip archive, and parse it
208 * using the given context */
209 tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context, NULL);
210 g_markup_parse_context_free (context);
211
212 if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.text") == 0) {
213 file_type = FILE_TYPE_ODT;
214 } else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.presentation") == 0) {
215 file_type = FILE_TYPE_ODP;
216 } else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.spreadsheet") == 0) {
217 file_type = FILE_TYPE_ODS;
218 } else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.graphics") == 0) {
219 file_type = FILE_TYPE_ODG;
220 } else {
221 g_message ("Mime type was not recognised:'%s'", mime_used);
222 file_type = FILE_TYPE_INVALID;
223 }
224
225 /* Extract content with the given limitations */
226 extract_oasis_content (uri,
227 tracker_config_get_max_bytes (config),
228 file_type,
229 metadata);
230
231 g_free (uri);
232
233 return TRUE;
234 }
235
236 static void
237 xml_start_element_handler_metadata (GMarkupParseContext *context,
238 const gchar *element_name,
239 const gchar **attribute_names,
240 const gchar **attribute_values,
241 gpointer user_data,
242 GError **error)
243 {
244 ODTMetadataParseInfo *data = user_data;
245
246 if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
247 data->current = ODT_TAG_TYPE_TITLE;
248 } else if (g_ascii_strcasecmp (element_name, "dc:subject") == 0) {
249 data->current = ODT_TAG_TYPE_SUBJECT;
250 } else if (g_ascii_strcasecmp (element_name, "dc:creator") == 0) {
251 data->current = ODT_TAG_TYPE_AUTHOR;
252 } else if (g_ascii_strcasecmp (element_name, "meta:keyword") == 0) {
253 data->current = ODT_TAG_TYPE_KEYWORDS;
254 } else if (g_ascii_strcasecmp (element_name, "dc:description") == 0) {
255 data->current = ODT_TAG_TYPE_COMMENTS;
256 } else if (g_ascii_strcasecmp (element_name, "meta:document-statistic") == 0) {
257 TrackerSparqlBuilder *metadata;
258 const gchar **a, **v;
259
260 metadata = data->metadata;
261
262 for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
263 if (g_ascii_strcasecmp (*a, "meta:word-count") == 0) {
264 tracker_sparql_builder_predicate (metadata, "nfo:wordCount");
265 tracker_sparql_builder_object_unvalidated (metadata, *v);
266 } else if (g_ascii_strcasecmp (*a, "meta:page-count") == 0) {
267 tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
268 tracker_sparql_builder_object_unvalidated (metadata, *v);
269 }
270 }
271
272 data->current = ODT_TAG_TYPE_STATS;
273 } else if (g_ascii_strcasecmp (element_name, "meta:creation-date") == 0) {
274 data->current = ODT_TAG_TYPE_CREATED;
275 } else if (g_ascii_strcasecmp (element_name, "meta:generator") == 0) {
276 data->current = ODT_TAG_TYPE_GENERATOR;
277 } else {
278 data->current = -1;
279 }
280 }
281
282 static void
283 xml_end_element_handler_metadata (GMarkupParseContext *context,
284 const gchar *element_name,
285 gpointer user_data,
286 GError **error)
287 {
288 ((ODTMetadataParseInfo*) user_data)->current = -1;
289 }
290
291 static void
292 xml_text_handler_metadata (GMarkupParseContext *context,
293 const gchar *text,
294 gsize text_len,
295 gpointer user_data,
296 GError **error)
297 {
298 ODTMetadataParseInfo *data;
299 TrackerSparqlBuilder *metadata;
300 gchar *date;
301
302 data = user_data;
303 metadata = data->metadata;
304
305 if (text_len == 0) {
306 /* ignore empty values */
307 return;
308 }
309
310 switch (data->current) {
311 case ODT_TAG_TYPE_TITLE:
312 if (data->title_already_set) {
313 g_warning ("Avoiding additional title (%s) in OASIS document '%s'",
314 text, data->uri);
315 } else {
316 data->title_already_set = TRUE;
317 tracker_sparql_builder_predicate (metadata, "nie:title");
318 tracker_sparql_builder_object_unvalidated (metadata, text);
319 }
320 break;
321
322 case ODT_TAG_TYPE_SUBJECT:
323 tracker_sparql_builder_predicate (metadata, "nie:subject");
324 tracker_sparql_builder_object_unvalidated (metadata, text);
325 break;
326
327 case ODT_TAG_TYPE_AUTHOR:
328 tracker_sparql_builder_predicate (metadata, "nco:publisher");
329
330 tracker_sparql_builder_object_blank_open (metadata);
331 tracker_sparql_builder_predicate (metadata, "a");
332 tracker_sparql_builder_object (metadata, "nco:Contact");
333
334 tracker_sparql_builder_predicate (metadata, "nco:fullname");
335 tracker_sparql_builder_object_unvalidated (metadata, text);
336 tracker_sparql_builder_object_blank_close (metadata);
337 break;
338
339 case ODT_TAG_TYPE_KEYWORDS: {
340 gchar *keywords;
341 gchar *lasts, *keyw;
342
343 keywords = g_strdup (text);
344
345 for (keyw = strtok_r (keywords, ",; ", &lasts);
346 keyw;
347 keyw = strtok_r (NULL, ",; ", &lasts)) {
348 tracker_sparql_builder_predicate (metadata, "nie:keyword");
349 tracker_sparql_builder_object_unvalidated (metadata, keyw);
350 }
351
352 g_free (keywords);
353
354 break;
355 }
356
357 case ODT_TAG_TYPE_COMMENTS:
358 tracker_sparql_builder_predicate (metadata, "nie:comment");
359 tracker_sparql_builder_object_unvalidated (metadata, text);
360 break;
361
362 case ODT_TAG_TYPE_CREATED:
363 date = tracker_date_guess (text);
364 if (date) {
365 tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
366 tracker_sparql_builder_object_unvalidated (metadata, date);
367 g_free (date);
368 }
369 break;
370
371 case ODT_TAG_TYPE_GENERATOR:
372 tracker_sparql_builder_predicate (metadata, "nie:generator");
373 tracker_sparql_builder_object_unvalidated (metadata, text);
374 break;
375
376 default:
377 case ODT_TAG_TYPE_STATS:
378 break;
379 }
380 }
381
382 static void
383 xml_start_element_handler_content (GMarkupParseContext *context,
384 const gchar *element_name,
385 const gchar **attribute_names,
386 const gchar **attribute_values,
387 gpointer user_data,
388 GError **error)
389 {
390 ODTContentParseInfo *data = user_data;
391
392 switch (data->file_type) {
393 case FILE_TYPE_ODT:
394 if ((g_ascii_strcasecmp (element_name, "text:p") == 0) ||
395 (g_ascii_strcasecmp (element_name, "text:h") == 0) ||
396 (g_ascii_strcasecmp (element_name, "text:a") == 0) ||
397 (g_ascii_strcasecmp (element_name, "text:span") == 0) ||
398 (g_ascii_strcasecmp (element_name, "table:table-cell")) == 0) {
399 data->current = ODT_TAG_TYPE_WORD_TEXT;
400 } else {
401 data->current = -1;
402 }
403 break;
404
405 case FILE_TYPE_ODP:
406 data->current = ODT_TAG_TYPE_SLIDE_TEXT;
407 break;
408
409 case FILE_TYPE_ODS:
410 if (g_ascii_strncasecmp (element_name, "text", 4) == 0) {
411 data->current = ODT_TAG_TYPE_SPREADSHEET_TEXT;
412 } else {
413 data->current = -1;
414 }
415 break;
416
417 case FILE_TYPE_ODG:
418 if (g_ascii_strncasecmp (element_name, "text", 4) == 0) {
419 data->current = ODT_TAG_TYPE_GRAPHICS_TEXT;
420 } else {
421 data->current = -1;
422 }
423 break;
424
425 case FILE_TYPE_INVALID:
426 g_message ("Open Office Document type: %d invalid", data->file_type);
427 break;
428 }
429 }
430
431 static void
432 xml_end_element_handler_content (GMarkupParseContext *context,
433 const gchar *element_name,
434 gpointer user_data,
435 GError **error)
436 {
437 ODTContentParseInfo *data = user_data;
438
439 data->current = -1;
440 }
441
442 static void
443 xml_text_handler_content (GMarkupParseContext *context,
444 const gchar *text,
445 gsize text_len,
446 gpointer user_data,
447 GError **error)
448 {
449 ODTContentParseInfo *data = user_data;
450 gsize written_bytes = 0;
451
452 switch (data->current) {
453 case ODT_TAG_TYPE_WORD_TEXT:
454 case ODT_TAG_TYPE_SLIDE_TEXT:
455 case ODT_TAG_TYPE_SPREADSHEET_TEXT:
456 case ODT_TAG_TYPE_GRAPHICS_TEXT:
457 if (data->bytes_pending == 0) {
458 g_set_error_literal (error,
459 maximum_size_error_quark, 0,
460 "Maximum text limit reached");
461 break;
462 }
463
464 /* Look for valid UTF-8 text */
465 if (tracker_text_validate_utf8 (text,
466 MIN (text_len, data->bytes_pending),
467 &data->content,
468 &written_bytes)) {
469 if (data->content->str[data->content->len - 1] != ' ') {
470 /* If some bytes found to be valid, append an extra whitespace
471 * as separator */
472 g_string_append_c (data->content, ' ');
473 }
474 }
475
476 data->bytes_pending -= written_bytes;
477 break;
478
479 default:
480 break;
481 }
482 }