tracker-0.16.2/src/tracker-extract/tracker-extract-epub.c

No issues found

  1 /*
  2  * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
  3  * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
  4  *
  5  * This library is free software; you can redistribute it and/or
  6  * modify it under the terms of the GNU Lesser General Public
  7  * License as published by the Free Software Foundation; either
  8  * version 2.1 of the License, or (at your option) any later version.
  9  *
 10  * This library is distributed in the hope that it will be useful,
 11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13  * Lesser General Public License for more details.
 14  *
 15  * You should have received a copy of the GNU Lesser General Public
 16  * License along with this library; if not, write to the
 17  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 18  * Boston, MA  02110-1301, USA.
 19  */
 20 
 21 #include <libtracker-extract/tracker-extract.h>
 22 
 23 #include "tracker-main.h"
 24 #include "tracker-gsf.h"
 25 #include "tracker-read.h"
 26 
 27 #include <unistd.h>
 28 
 29 typedef enum {
 30 	OPF_TAG_TYPE_UNKNOWN,
 31 	OPF_TAG_TYPE_TITLE,
 32 	OPF_TAG_TYPE_AUTHOR,
 33 	OPF_TAG_TYPE_CREATED
 34 } OPFTagType;
 35 
 36 typedef struct {
 37 	TrackerSparqlBuilder *preupdate;
 38 	TrackerSparqlBuilder *metadata;
 39 	OPFTagType element;
 40 	GList *pages;
 41 	guint in_metadata : 1;
 42 	guint in_manifest : 1;
 43 } OPFData;
 44 
 45 typedef struct {
 46 	GString *contents;
 47 	gsize limit;
 48 } OPFContentData;
 49 
 50 /* Methods to parse the container.xml file
 51  * pointing to the real metadata/content
 52  */
 53 static void
 54 container_xml_start_element_handler (GMarkupParseContext  *context,
 55                                      const gchar          *element_name,
 56                                      const gchar         **attribute_names,
 57                                      const gchar         **attribute_values,
 58                                      gpointer              user_data,
 59                                      GError              **error)
 60 {
 61 	gchar **path_out = user_data;
 62 	gint i;
 63 
 64 	if (g_strcmp0 (element_name, "rootfile") != 0) {
 65 		return;
 66 	}
 67 
 68 	for (i = 0; attribute_names[i] != NULL; i++) {
 69 		if (g_strcmp0 (attribute_names[i], "full-path") == 0) {
 70 			if (!*path_out) {
 71 				*path_out = g_strdup (attribute_values[i]);
 72 			}
 73 			break;
 74 		}
 75 	}
 76 }
 77 
 78 /* Methods to parse the OPF document metadata/layout */
 79 static void
 80 opf_xml_start_element_handler (GMarkupParseContext  *context,
 81                                const gchar          *element_name,
 82                                const gchar         **attribute_names,
 83                                const gchar         **attribute_values,
 84                                gpointer              user_data,
 85                                GError              **error)
 86 {
 87 	OPFData *data = user_data;
 88 	gint i;
 89 
 90 	if (g_strcmp0 (element_name, "metadata") == 0) {
 91 		data->in_metadata = TRUE;
 92 	} else if (g_strcmp0 (element_name, "manifest") == 0) {
 93 		data->in_manifest = TRUE;
 94 	} else if (data->in_metadata) {
 95 		/* epub metadata */
 96 		if (g_strcmp0 (element_name, "dc:title") == 0) {
 97 			data->element = OPF_TAG_TYPE_TITLE;
 98 		} else if (g_strcmp0 (element_name, "dc:creator") == 0) {
 99 			for (i = 0; attribute_names[i] != NULL; i++) {
100 				if (g_strcmp0 (attribute_names[i], "opf:role") == 0 &&
101 				    g_strcmp0 (attribute_values[i], "aut") == 0) {
102 					data->element = OPF_TAG_TYPE_AUTHOR;
103 					break;
104 				}
105 			}
106 		} else if (g_strcmp0 (element_name, "dc:date") == 0) {
107 			for (i = 0; attribute_names[i] != NULL; i++) {
108 				if (g_strcmp0 (attribute_names[i], "opf:event") == 0 &&
109 				    g_strcmp0 (attribute_values[i], "original-publication") == 0) {
110 					data->element = OPF_TAG_TYPE_CREATED;
111 					break;
112 				}
113 			}
114 		}
115 	} else if (data->in_manifest &&
116 		   g_strcmp0 (element_name, "item") == 0) {
117 		const gchar *rel_path = NULL;
118 		gboolean is_xhtml = FALSE;
119 
120 		/* Keep list of xhtml documents for plain text extraction */
121 		for (i = 0; attribute_names[i] != NULL; i++) {
122 			if (g_strcmp0 (attribute_names[i], "href") == 0) {
123 				rel_path = attribute_values[i];
124 			} else if (g_strcmp0 (attribute_names[i], "media-type") == 0 &&
125 			           g_strcmp0 (attribute_values[i], "application/xhtml+xml") == 0) {
126 				is_xhtml = TRUE;
127 			}
128 		}
129 
130 		if (is_xhtml && rel_path) {
131 			data->pages = g_list_append (data->pages, g_strdup (rel_path));
132 		}
133 	}
134 }
135 
136 static void
137 opf_xml_end_element_handler (GMarkupParseContext  *context,
138                              const gchar          *element_name,
139                              gpointer              user_data,
140                              GError              **error)
141 {
142 	OPFData *data = user_data;
143 
144 	if (g_strcmp0 (element_name, "metadata") == 0) {
145 		data->in_metadata = FALSE;
146 	} else if (g_strcmp0 (element_name, "manifest") == 0) {
147 		data->in_manifest = FALSE;
148 	} else {
149 		data->element = OPF_TAG_TYPE_UNKNOWN;
150 	}
151 }
152 
153 static void
154 opf_xml_text_handler (GMarkupParseContext   *context,
155                       const gchar           *text,
156                       gsize                  text_len,
157                       gpointer               user_data,
158                       GError               **error)
159 {
160 	OPFData *data = user_data;
161 	gchar *date;
162 
163 	switch (data->element) {
164 	case OPF_TAG_TYPE_AUTHOR:
165 		tracker_sparql_builder_predicate (data->metadata, "nco:publisher");
166 
167 		tracker_sparql_builder_object_blank_open (data->metadata);
168 		tracker_sparql_builder_predicate (data->metadata, "a");
169 		tracker_sparql_builder_object (data->metadata, "nco:Contact");
170 
171 		tracker_sparql_builder_predicate (data->metadata, "nco:fullname");
172 		tracker_sparql_builder_object_unvalidated (data->metadata, text);
173 		tracker_sparql_builder_object_blank_close (data->metadata);
174 		break;
175 	case OPF_TAG_TYPE_TITLE:
176 		tracker_sparql_builder_predicate (data->metadata, "nie:title");
177 		tracker_sparql_builder_object_unvalidated (data->metadata, text);
178 		break;
179 	case OPF_TAG_TYPE_CREATED:
180 		date = tracker_date_guess (text);
181 		tracker_sparql_builder_predicate (data->metadata, "nie:contentCreated");
182 		tracker_sparql_builder_object_unvalidated (data->metadata, date);
183 		g_free (date);
184 		break;
185 	case OPF_TAG_TYPE_UNKNOWN:
186 	default:
187 		break;
188 	}
189 }
190 
191 /* Methods to extract XHTML text content */
192 static void
193 content_xml_text_handler (GMarkupParseContext   *context,
194                           const gchar           *text,
195                           gsize                  text_len,
196                           gpointer               user_data,
197                           GError               **error)
198 {
199 	OPFContentData *content_data = user_data;
200 	gsize written_bytes = 0;
201 
202 	if (text_len <= 0) {
203 		return;
204 	}
205 
206 	if (tracker_text_validate_utf8 (text,
207 	                                MIN (text_len, content_data->limit),
208 	                                &content_data->contents,
209 	                                &written_bytes)) {
210 		if (content_data->contents->str[content_data->contents->len - 1] != ' ') {
211 			g_string_append_c (content_data->contents, ' ');
212 		}
213 	}
214 
215 	content_data->limit -= written_bytes;
216 }
217 
218 static gchar *
219 extract_opf_path (const gchar *uri)
220 {
221 	GMarkupParseContext *context;
222 	gchar *path = NULL;
223 	GError *error = NULL;
224 	GMarkupParser parser = {
225 		container_xml_start_element_handler,
226 		NULL, NULL, NULL, NULL
227 	};
228 
229 	/* Create parsing context */
230 	context = g_markup_parse_context_new (&parser, 0, &path, NULL);
231 
232 	/* Load the internal container file from the Zip archive,
233 	 * and parse it to extract the .opf file to get metadata from
234 	 */
235 	tracker_gsf_parse_xml_in_zip (uri, "META-INF/container.xml", context, &error);
236 	g_markup_parse_context_free (context);
237 
238 	if (error || !path) {
239 		g_warning ("Could not get EPUB container.xml file: %s\n",
240 		           (error) ? error->message : "No error provided");
241 		g_error_free (error);
242 		return NULL;
243 	}
244 
245 	return path;
246 }
247 
248 static gchar *
249 extract_opf_contents (const gchar *uri,
250                       const gchar *content_prefix,
251                       GList       *content_files)
252 {
253 	OPFContentData content_data = { 0 };
254 	GMarkupParseContext *context;
255 	TrackerConfig *config;
256 	GError *error = NULL;
257 	GList *l;
258 	GMarkupParser xml_parser = {
259 		NULL, NULL,
260 		content_xml_text_handler,
261 		NULL, NULL
262 	};
263 
264 	config = tracker_main_get_config ();
265 	context = g_markup_parse_context_new (&xml_parser, 0, &content_data, NULL);
266 
267 	content_data.contents = g_string_new ("");
268 	content_data.limit = (gsize) tracker_config_get_max_bytes (config);
269 
270 	g_debug ("Extracting up to %" G_GSIZE_FORMAT " bytes of content", content_data.limit);
271 
272 	for (l = content_files; l; l = l->next) {
273 		gchar *path;
274 
275 		/* Page file is relative to OPF file location */
276 		path = g_build_filename (content_prefix, l->data, NULL);
277 		tracker_gsf_parse_xml_in_zip (uri, path, context, &error);
278 		g_free (path);
279 
280 		if (error) {
281 			g_warning ("Error extracting EPUB contents: %s\n",
282 			           error->message);
283 			break;
284 		}
285 
286 		if (content_data.limit <= 0) {
287 			/* Reached plain text extraction limit */
288 			break;
289 		}
290 	}
291 
292 	g_markup_parse_context_free (context);
293 
294 	return g_string_free (content_data.contents, FALSE);
295 }
296 
297 static gboolean
298 extract_opf (const gchar          *uri,
299              const gchar          *opf_path,
300              TrackerSparqlBuilder *preupdate,
301              TrackerSparqlBuilder *metadata)
302 {
303 	GMarkupParseContext *context;
304 	OPFData data = { 0 };
305 	GError *error = NULL;
306 	gchar *dirname, *contents;
307 	GMarkupParser opf_parser = {
308 		opf_xml_start_element_handler,
309 		opf_xml_end_element_handler,
310 		opf_xml_text_handler,
311 		NULL, NULL
312 	};
313 
314 	g_debug ("Extracting OPF file contents from EPUB '%s'", uri);
315 
316 	tracker_sparql_builder_predicate (metadata, "a");
317 	tracker_sparql_builder_object (metadata, "nfo:TextDocument");
318 
319 	data.metadata = metadata;
320 	data.preupdate = preupdate;
321 
322 	/* Create parsing context */
323 	context = g_markup_parse_context_new (&opf_parser, 0, &data, NULL);
324 
325 	/* Load the internal container file from the Zip archive,
326 	 * and parse it to extract the .opf file to get metadata from
327 	 */
328 	tracker_gsf_parse_xml_in_zip (uri, opf_path, context, &error);
329 	g_markup_parse_context_free (context);
330 
331 	if (error) {
332 		g_warning ("Could not get EPUB '%s' file: %s\n", opf_path,
333 		           (error) ? error->message : "No error provided");
334 		g_error_free (error);
335 		return FALSE;
336 	}
337 
338 	dirname = g_path_get_dirname (opf_path);
339 	contents = extract_opf_contents (uri, dirname, data.pages);
340 	g_free (dirname);
341 
342 	if (contents && *contents) {
343 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
344 		tracker_sparql_builder_object_unvalidated (metadata, contents);
345 	}
346 
347 	g_list_foreach (data.pages, (GFunc) g_free, NULL);
348 	g_list_free (data.pages);
349 	g_free (contents);
350 
351 	return TRUE;
352 }
353 
354 G_MODULE_EXPORT gboolean
355 tracker_extract_get_metadata (TrackerExtractInfo *info)
356 {
357 	gchar *opf_path, *uri;
358 	GFile *file;
359 
360 	file = tracker_extract_info_get_file (info);
361 	uri = g_file_get_uri (file);
362 
363 	opf_path = extract_opf_path (uri);
364 
365 	if (!opf_path) {
366 		g_free (uri);
367 		return FALSE;
368 	}
369 
370 	extract_opf (uri, opf_path,
371 	             tracker_extract_info_get_preupdate_builder (info),
372 	             tracker_extract_info_get_metadata_builder (info));
373 	g_free (opf_path);
374 	g_free (uri);
375 
376 	return TRUE;
377 }