No issues found
1 /*
2 * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
3 * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the
17 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21 #include <libtracker-extract/tracker-extract.h>
22
23 #include "tracker-main.h"
24 #include "tracker-gsf.h"
25 #include "tracker-read.h"
26
27 #include <unistd.h>
28
/* Which OPF metadata element, if any, the text handler should consume next.
 * UNKNOWN is the zero value, so a zero-initialised OPFData starts out with
 * no element selected.
 */
typedef enum {
	OPF_TAG_TYPE_UNKNOWN = 0, /* no element of interest is open */
	OPF_TAG_TYPE_TITLE,       /* <dc:title> */
	OPF_TAG_TYPE_AUTHOR,      /* <dc:creator opf:role="aut"> */
	OPF_TAG_TYPE_CREATED      /* <dc:date opf:event="original-publication"> */
} OPFTagType;
35
36 typedef struct {
37 TrackerSparqlBuilder *preupdate;
38 TrackerSparqlBuilder *metadata;
39 OPFTagType element;
40 GList *pages;
41 guint in_metadata : 1;
42 guint in_manifest : 1;
43 } OPFData;
44
45 typedef struct {
46 GString *contents;
47 gsize limit;
48 } OPFContentData;
49
50 /* Methods to parse the container.xml file
51 * pointing to the real metadata/content
52 */
53 static void
54 container_xml_start_element_handler (GMarkupParseContext *context,
55 const gchar *element_name,
56 const gchar **attribute_names,
57 const gchar **attribute_values,
58 gpointer user_data,
59 GError **error)
60 {
61 gchar **path_out = user_data;
62 gint i;
63
64 if (g_strcmp0 (element_name, "rootfile") != 0) {
65 return;
66 }
67
68 for (i = 0; attribute_names[i] != NULL; i++) {
69 if (g_strcmp0 (attribute_names[i], "full-path") == 0) {
70 if (!*path_out) {
71 *path_out = g_strdup (attribute_values[i]);
72 }
73 break;
74 }
75 }
76 }
77
78 /* Methods to parse the OPF document metadata/layout */
79 static void
80 opf_xml_start_element_handler (GMarkupParseContext *context,
81 const gchar *element_name,
82 const gchar **attribute_names,
83 const gchar **attribute_values,
84 gpointer user_data,
85 GError **error)
86 {
87 OPFData *data = user_data;
88 gint i;
89
90 if (g_strcmp0 (element_name, "metadata") == 0) {
91 data->in_metadata = TRUE;
92 } else if (g_strcmp0 (element_name, "manifest") == 0) {
93 data->in_manifest = TRUE;
94 } else if (data->in_metadata) {
95 /* epub metadata */
96 if (g_strcmp0 (element_name, "dc:title") == 0) {
97 data->element = OPF_TAG_TYPE_TITLE;
98 } else if (g_strcmp0 (element_name, "dc:creator") == 0) {
99 for (i = 0; attribute_names[i] != NULL; i++) {
100 if (g_strcmp0 (attribute_names[i], "opf:role") == 0 &&
101 g_strcmp0 (attribute_values[i], "aut") == 0) {
102 data->element = OPF_TAG_TYPE_AUTHOR;
103 break;
104 }
105 }
106 } else if (g_strcmp0 (element_name, "dc:date") == 0) {
107 for (i = 0; attribute_names[i] != NULL; i++) {
108 if (g_strcmp0 (attribute_names[i], "opf:event") == 0 &&
109 g_strcmp0 (attribute_values[i], "original-publication") == 0) {
110 data->element = OPF_TAG_TYPE_CREATED;
111 break;
112 }
113 }
114 }
115 } else if (data->in_manifest &&
116 g_strcmp0 (element_name, "item") == 0) {
117 const gchar *rel_path = NULL;
118 gboolean is_xhtml = FALSE;
119
120 /* Keep list of xhtml documents for plain text extraction */
121 for (i = 0; attribute_names[i] != NULL; i++) {
122 if (g_strcmp0 (attribute_names[i], "href") == 0) {
123 rel_path = attribute_values[i];
124 } else if (g_strcmp0 (attribute_names[i], "media-type") == 0 &&
125 g_strcmp0 (attribute_values[i], "application/xhtml+xml") == 0) {
126 is_xhtml = TRUE;
127 }
128 }
129
130 if (is_xhtml && rel_path) {
131 data->pages = g_list_append (data->pages, g_strdup (rel_path));
132 }
133 }
134 }
135
136 static void
137 opf_xml_end_element_handler (GMarkupParseContext *context,
138 const gchar *element_name,
139 gpointer user_data,
140 GError **error)
141 {
142 OPFData *data = user_data;
143
144 if (g_strcmp0 (element_name, "metadata") == 0) {
145 data->in_metadata = FALSE;
146 } else if (g_strcmp0 (element_name, "manifest") == 0) {
147 data->in_manifest = FALSE;
148 } else {
149 data->element = OPF_TAG_TYPE_UNKNOWN;
150 }
151 }
152
153 static void
154 opf_xml_text_handler (GMarkupParseContext *context,
155 const gchar *text,
156 gsize text_len,
157 gpointer user_data,
158 GError **error)
159 {
160 OPFData *data = user_data;
161 gchar *date;
162
163 switch (data->element) {
164 case OPF_TAG_TYPE_AUTHOR:
165 tracker_sparql_builder_predicate (data->metadata, "nco:publisher");
166
167 tracker_sparql_builder_object_blank_open (data->metadata);
168 tracker_sparql_builder_predicate (data->metadata, "a");
169 tracker_sparql_builder_object (data->metadata, "nco:Contact");
170
171 tracker_sparql_builder_predicate (data->metadata, "nco:fullname");
172 tracker_sparql_builder_object_unvalidated (data->metadata, text);
173 tracker_sparql_builder_object_blank_close (data->metadata);
174 break;
175 case OPF_TAG_TYPE_TITLE:
176 tracker_sparql_builder_predicate (data->metadata, "nie:title");
177 tracker_sparql_builder_object_unvalidated (data->metadata, text);
178 break;
179 case OPF_TAG_TYPE_CREATED:
180 date = tracker_date_guess (text);
181 tracker_sparql_builder_predicate (data->metadata, "nie:contentCreated");
182 tracker_sparql_builder_object_unvalidated (data->metadata, date);
183 g_free (date);
184 break;
185 case OPF_TAG_TYPE_UNKNOWN:
186 default:
187 break;
188 }
189 }
190
191 /* Methods to extract XHTML text content */
192 static void
193 content_xml_text_handler (GMarkupParseContext *context,
194 const gchar *text,
195 gsize text_len,
196 gpointer user_data,
197 GError **error)
198 {
199 OPFContentData *content_data = user_data;
200 gsize written_bytes = 0;
201
202 if (text_len <= 0) {
203 return;
204 }
205
206 if (tracker_text_validate_utf8 (text,
207 MIN (text_len, content_data->limit),
208 &content_data->contents,
209 &written_bytes)) {
210 if (content_data->contents->str[content_data->contents->len - 1] != ' ') {
211 g_string_append_c (content_data->contents, ' ');
212 }
213 }
214
215 content_data->limit -= written_bytes;
216 }
217
218 static gchar *
219 extract_opf_path (const gchar *uri)
220 {
221 GMarkupParseContext *context;
222 gchar *path = NULL;
223 GError *error = NULL;
224 GMarkupParser parser = {
225 container_xml_start_element_handler,
226 NULL, NULL, NULL, NULL
227 };
228
229 /* Create parsing context */
230 context = g_markup_parse_context_new (&parser, 0, &path, NULL);
231
232 /* Load the internal container file from the Zip archive,
233 * and parse it to extract the .opf file to get metadata from
234 */
235 tracker_gsf_parse_xml_in_zip (uri, "META-INF/container.xml", context, &error);
236 g_markup_parse_context_free (context);
237
238 if (error || !path) {
239 g_warning ("Could not get EPUB container.xml file: %s\n",
240 (error) ? error->message : "No error provided");
241 g_error_free (error);
242 return NULL;
243 }
244
245 return path;
246 }
247
248 static gchar *
249 extract_opf_contents (const gchar *uri,
250 const gchar *content_prefix,
251 GList *content_files)
252 {
253 OPFContentData content_data = { 0 };
254 GMarkupParseContext *context;
255 TrackerConfig *config;
256 GError *error = NULL;
257 GList *l;
258 GMarkupParser xml_parser = {
259 NULL, NULL,
260 content_xml_text_handler,
261 NULL, NULL
262 };
263
264 config = tracker_main_get_config ();
265 context = g_markup_parse_context_new (&xml_parser, 0, &content_data, NULL);
266
267 content_data.contents = g_string_new ("");
268 content_data.limit = (gsize) tracker_config_get_max_bytes (config);
269
270 g_debug ("Extracting up to %" G_GSIZE_FORMAT " bytes of content", content_data.limit);
271
272 for (l = content_files; l; l = l->next) {
273 gchar *path;
274
275 /* Page file is relative to OPF file location */
276 path = g_build_filename (content_prefix, l->data, NULL);
277 tracker_gsf_parse_xml_in_zip (uri, path, context, &error);
278 g_free (path);
279
280 if (error) {
281 g_warning ("Error extracting EPUB contents: %s\n",
282 error->message);
283 break;
284 }
285
286 if (content_data.limit <= 0) {
287 /* Reached plain text extraction limit */
288 break;
289 }
290 }
291
292 g_markup_parse_context_free (context);
293
294 return g_string_free (content_data.contents, FALSE);
295 }
296
297 static gboolean
298 extract_opf (const gchar *uri,
299 const gchar *opf_path,
300 TrackerSparqlBuilder *preupdate,
301 TrackerSparqlBuilder *metadata)
302 {
303 GMarkupParseContext *context;
304 OPFData data = { 0 };
305 GError *error = NULL;
306 gchar *dirname, *contents;
307 GMarkupParser opf_parser = {
308 opf_xml_start_element_handler,
309 opf_xml_end_element_handler,
310 opf_xml_text_handler,
311 NULL, NULL
312 };
313
314 g_debug ("Extracting OPF file contents from EPUB '%s'", uri);
315
316 tracker_sparql_builder_predicate (metadata, "a");
317 tracker_sparql_builder_object (metadata, "nfo:TextDocument");
318
319 data.metadata = metadata;
320 data.preupdate = preupdate;
321
322 /* Create parsing context */
323 context = g_markup_parse_context_new (&opf_parser, 0, &data, NULL);
324
325 /* Load the internal container file from the Zip archive,
326 * and parse it to extract the .opf file to get metadata from
327 */
328 tracker_gsf_parse_xml_in_zip (uri, opf_path, context, &error);
329 g_markup_parse_context_free (context);
330
331 if (error) {
332 g_warning ("Could not get EPUB '%s' file: %s\n", opf_path,
333 (error) ? error->message : "No error provided");
334 g_error_free (error);
335 return FALSE;
336 }
337
338 dirname = g_path_get_dirname (opf_path);
339 contents = extract_opf_contents (uri, dirname, data.pages);
340 g_free (dirname);
341
342 if (contents && *contents) {
343 tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
344 tracker_sparql_builder_object_unvalidated (metadata, contents);
345 }
346
347 g_list_foreach (data.pages, (GFunc) g_free, NULL);
348 g_list_free (data.pages);
349 g_free (contents);
350
351 return TRUE;
352 }
353
354 G_MODULE_EXPORT gboolean
355 tracker_extract_get_metadata (TrackerExtractInfo *info)
356 {
357 gchar *opf_path, *uri;
358 GFile *file;
359
360 file = tracker_extract_info_get_file (info);
361 uri = g_file_get_uri (file);
362
363 opf_path = extract_opf_path (uri);
364
365 if (!opf_path) {
366 g_free (uri);
367 return FALSE;
368 }
369
370 extract_opf (uri, opf_path,
371 tracker_extract_info_get_preupdate_builder (info),
372 tracker_extract_info_get_metadata_builder (info));
373 g_free (opf_path);
374 g_free (uri);
375
376 return TRUE;
377 }