Location | Tool | Test ID | Function | Issue |
---|---|---|---|---|
tracker-extract-pdf.c:397:38 | gcc | maybe-uninitialized | extract_content_parent_process.isra.1 | 'bytes_remaining' may be used uninitialized in this function |
tracker-extract-pdf.c:399:39 | clang-analyzer | | | The right operand of '<' is a garbage value |
1 /*
2 * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
3 * Copyright (C) 2008-2011, Nokia <ivan.frade@nokia.com>
4 * Copyright (C) 2010, Amit Aggarwal <amitcs06@gmail.com>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 */
21
22 #include "config.h"
23
24 #ifndef _GNU_SOURCE
25 #define _GNU_SOURCE
26 #endif
27
28 #include <sys/mman.h>
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 #include <sys/select.h>
32 #include <sys/wait.h>
33 #include <errno.h>
34 #include <fcntl.h>
35 #include <string.h>
36 #include <unistd.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39
40 #include <glib.h>
41 #include <glib/gstdio.h>
42 #include <glib/poppler.h>
43
44 #include <gio/gunixoutputstream.h>
45 #include <gio/gunixinputstream.h>
46
47 #include <libtracker-common/tracker-date-time.h>
48 #include <libtracker-common/tracker-utils.h>
49 #include <libtracker-common/tracker-file-utils.h>
50
51 #include <libtracker-extract/tracker-extract.h>
52
53 #include "tracker-main.h"
54
/* Upper bound, in seconds, on how long the forked content-extraction
 * child may run before the parent kills it. */
#define EXTRACTION_PROCESS_TIMEOUT 10

/* Capacity, in bytes, of the buffer used for each read of extracted
 * content. */
#define BUFFER_SIZE 65535
61
62 typedef struct {
63 gchar *title;
64 gchar *subject;
65 gchar *creation_date;
66 gchar *author;
67 gchar *date;
68 gchar *keywords;
69 } PDFData;
70
71 static void
72 read_toc (PopplerIndexIter *index,
73 GString **toc)
74 {
75 if (!index) {
76 return;
77 }
78
79 if (!*toc) {
80 *toc = g_string_new ("");
81 }
82
83 do {
84 PopplerAction *action;
85 PopplerIndexIter *iter;
86
87 action = poppler_index_iter_get_action (index);
88
89 if (!action) {
90 continue;
91 }
92
93 switch (action->type) {
94 case POPPLER_ACTION_GOTO_DEST: {
95 PopplerActionGotoDest *ag = (PopplerActionGotoDest *)action;
96 PopplerDest *agd = ag->dest;
97
98 if (!tracker_is_empty_string (ag->title)) {
99 g_string_append_printf (*toc, "%s ", ag->title);
100 }
101
102 if (!tracker_is_empty_string (agd->named_dest)) {
103 g_string_append_printf (*toc, "%s ", agd->named_dest);
104 }
105
106 break;
107 }
108
109 case POPPLER_ACTION_LAUNCH: {
110 PopplerActionLaunch *al = (PopplerActionLaunch *)action;
111
112 if (!tracker_is_empty_string (al->title)) {
113 g_string_append_printf (*toc, "%s ", al->title);
114 }
115
116 if (!tracker_is_empty_string (al->file_name)) {
117 g_string_append_printf (*toc, "%s ", al->file_name);
118 }
119
120 if (!tracker_is_empty_string (al->params)) {
121 g_string_append_printf (*toc, "%s ", al->params);
122 }
123
124 break;
125 }
126
127 case POPPLER_ACTION_URI: {
128 PopplerActionUri *au = (PopplerActionUri *)action;
129
130 if (!tracker_is_empty_string (au->uri)) {
131 g_string_append_printf (*toc, "%s ", au->uri);
132 }
133
134 break;
135 }
136
137 case POPPLER_ACTION_NAMED: {
138 PopplerActionNamed *an = (PopplerActionNamed *)action;
139
140 if (!tracker_is_empty_string (an->title)) {
141 g_string_append_printf (*toc, "%s, ", an->title);
142 }
143
144 if (!tracker_is_empty_string (an->named_dest)) {
145 g_string_append_printf (*toc, "%s ", an->named_dest);
146 }
147
148 break;
149 }
150
151 case POPPLER_ACTION_MOVIE: {
152 PopplerActionMovie *am = (PopplerActionMovie *)action;
153
154 if (!tracker_is_empty_string (am->title)) {
155 g_string_append_printf (*toc, "%s ", am->title);
156 }
157
158 break;
159 }
160
161 case POPPLER_ACTION_NONE:
162 case POPPLER_ACTION_UNKNOWN:
163 case POPPLER_ACTION_GOTO_REMOTE:
164 case POPPLER_ACTION_RENDITION:
165 case POPPLER_ACTION_OCG_STATE:
166 case POPPLER_ACTION_JAVASCRIPT:
167 /* Do nothing */
168 break;
169 }
170
171 poppler_action_free (action);
172 iter = poppler_index_iter_get_child (index);
173 read_toc (iter, toc);
174 } while (poppler_index_iter_next (index));
175
176 poppler_index_iter_free (index);
177 }
178
179 static void
180 read_outline (PopplerDocument *document,
181 TrackerSparqlBuilder *metadata)
182 {
183 PopplerIndexIter *index;
184 GString *toc = NULL;
185
186 index = poppler_index_iter_new (document);
187
188 if (!index) {
189 return;
190 }
191
192 read_toc (index, &toc);
193
194 if (toc) {
195 if (toc->len > 0) {
196 tracker_sparql_builder_predicate (metadata, "nfo:tableOfContents");
197 tracker_sparql_builder_object_unvalidated (metadata, toc->str);
198 }
199
200 g_string_free (toc, TRUE);
201 }
202 }
203
204 static GString *
205 extract_content_text (PopplerDocument *document,
206 gsize n_bytes)
207 {
208 gint n_pages, i = 0;
209 GString *string;
210 GTimer *timer;
211 gsize remaining_bytes = n_bytes;
212
213 n_pages = poppler_document_get_n_pages (document);
214 string = g_string_new ("");
215 timer = g_timer_new ();
216
217 while (i < n_pages &&
218 remaining_bytes > 0) {
219 PopplerPage *page;
220 gsize written_bytes = 0;
221 gchar *text;
222
223 page = poppler_document_get_page (document, i);
224 i++;
225
226 text = poppler_page_get_text (page);
227
228 if (!text) {
229 g_object_unref (page);
230 continue;
231 }
232
233 if (tracker_text_validate_utf8 (text,
234 MIN (strlen (text), remaining_bytes),
235 &string,
236 &written_bytes)) {
237 g_string_append_c (string, ' ');
238 }
239
240 remaining_bytes -= written_bytes;
241
242 g_debug ("Child: Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
243 "%" G_GSIZE_FORMAT " bytes remaining",
244 written_bytes, i, remaining_bytes);
245
246 g_free (text);
247 g_object_unref (page);
248 }
249
250 g_debug ("Child: Content extraction finished: %d/%d pages indexed in %2.2f seconds, "
251 "%" G_GSIZE_FORMAT " bytes extracted",
252 i,
253 n_pages,
254 g_timer_elapsed (timer, NULL),
255 (n_bytes - remaining_bytes));
256
257 g_timer_destroy (timer);
258
259 return string;
260 }
261
262 static void
263 extract_content_child_process (PopplerDocument *document,
264 gsize n_bytes,
265 int fd[2])
266 {
267 GString *str;
268 gint64 size;
269 GOutputStream *output_stream;
270 GDataOutputStream *dataout_stream;
271
272 /* This is the child extracting the content, hopefully in time */
273
274 output_stream = g_unix_output_stream_new (fd[1], FALSE);
275 dataout_stream = g_data_output_stream_new (output_stream);
276 str = extract_content_text (document, n_bytes);
277 size = (gint64) str->len;
278
279 /* Write the results to the pipe */
280 if (g_data_output_stream_put_int64 (dataout_stream, size, NULL, NULL)) {
281 g_output_stream_write_all (output_stream,
282 str->str,
283 str->len,
284 NULL,
285 NULL,
286 NULL);
287 }
288
289 g_debug ("Child: Content extraction now finished in child process, "
290 "written %" G_GSIZE_FORMAT " bytes to parent process",
291 size);
292
293 g_string_free (str, TRUE);
294 g_object_unref (dataout_stream);
295 g_object_unref (output_stream);
296
297 close (fd[1]);
298
299 exit (0);
300 }
301
302 static gchar *
303 extract_content_parent_process (PopplerDocument *document,
304 int fd[2],
305 pid_t child_pid)
306 {
307 GInputStream *input_stream;
308 GDataInputStream *datain_stream;
309 GString *content = NULL;
310 GError *error = NULL;
311 GTimer *timer = NULL;
312 gsize bytes_expected = -1;
313 gboolean timed_out = FALSE;
314 gboolean finished = FALSE;
315 struct timeval timeout;
316 fd_set rfds;
317
318 /* This is the parent process waiting for the content extractor to
319 * finish in time. */
320
321 g_debug ("Parent: Content extraction now starting in child process (pid = %d)", child_pid);
322
323 /* Set up gio streams */
324 input_stream = g_unix_input_stream_new (fd[0], FALSE);
325 datain_stream = g_data_input_stream_new (input_stream);
326
327 /* Watch FD to see when it has input. */
328 FD_ZERO(&rfds);
329 FD_SET(fd[0], &rfds);
330
331 /* We give the content extractor 10 seconds to do its job */
332 timeout.tv_sec = EXTRACTION_PROCESS_TIMEOUT;
333 timeout.tv_usec = 0;
334
335 /* We also use our own timer because timeouts in select()
336 * can be inconsistent across UNIX platforms. Some update the
337 * timeout and some don't.
338 */
339 timer = g_timer_new ();
340
341 /* So, this is fairly simple, what we're doing here is using
342 * select() to know when the child process has written some or
343 * all the data and we then avoid the child blocking by
344 * reading from that stream. We couple with this with a
345 * timeout of 10 seconds so if we receive nothing then we know
346 * we can kill the process because it is taking too long.
347 *
348 * We use waitpid() to know if the process quit because it has
349 * finished or if it is still processing data and needs to be
350 * killed.
351 */
352 while (!finished) {
353 int retval;
354
355 /* 1a. Wait for data on the FD and limit by timeout */
356 retval = select (fd[0] + 1, &rfds, NULL, NULL, &timeout);
357
358 /* 2. Did we error? Have data? or just timeout? */
359 if (retval == -1) {
360 perror ("select()");
361 finished = TRUE;
362 } else if (retval == 1) {
363 gsize bytes_remaining;
364 gboolean read_finished = FALSE;
365
366 if (g_timer_elapsed (timer, NULL) >= EXTRACTION_PROCESS_TIMEOUT) {
367 finished = TRUE;
368 timed_out = TRUE;
369 continue;
370 }
371
372 /* 3. Start reading data */
373 if (bytes_expected == -1) {
374 /* We only need to read the size once before the data! */
375 bytes_expected = (gsize) g_data_input_stream_read_int64 (datain_stream,
376 NULL,
377 &error);
378 if (error) {
379 g_warning ("Call to g_data_input_stream_read_int64() failed, %s",
380 error->message);
381 g_error_free (error);
382 finished = TRUE;
383 continue;
384 }
385
386 g_debug ("Parent: Expected bytes to read is %" G_GSSIZE_FORMAT "", bytes_expected);
387 bytes_remaining = bytes_expected;
388 content = g_string_new ("");
389 }
390
391 /* 4. Read until done from stream and concatenate data */
392 while (!read_finished) {
393 gchar buf[BUFFER_SIZE];
394 gsize bytes_read;
395
396 memset (buf, 0, BUFFER_SIZE);
397 bytes_read = g_input_stream_read (G_INPUT_STREAM (datain_stream),
(emitted by gcc) 398 buf,
399 MIN (BUFFER_SIZE, bytes_remaining),
(emitted by clang-analyzer)TODO: a detailed trace is available in the data model (not yet rendered in this report)
400 NULL,
401 &error);
402
403 g_debug ("Parent: Bytes read is %" G_GSSIZE_FORMAT ","
404 "bytes remaining is %" G_GSSIZE_FORMAT "",
405 bytes_read,
406 MAX (bytes_remaining - bytes_read, 0));
407
408 if (bytes_read == -1 || error) {
409 g_warning ("Call to g_input_stream_read() failed, %s",
410 error ? error->message : "no error given");
411 g_clear_error (&error);
412 read_finished = TRUE;
413 finished = TRUE;
414 } else {
415 content = g_string_append (content, buf);
416
417 bytes_remaining -= bytes_read;
418 bytes_remaining = MAX (bytes_remaining, 0);
419
420 if (bytes_read == 0) {
421 /* We finished reading */
422 g_debug ("Parent: Finished reading all bytes");
423 read_finished = TRUE;
424 }
425
426 /* Are we finished reading everything */
427 if (bytes_remaining < 1) {
428 finished = TRUE;
429 }
430 }
431 }
432 } else {
433 /* 3. We must have timed out with no data in select() */
434 finished = TRUE;
435 timed_out = TRUE;
436 g_debug ("Parent: Must have timed out with no data in select()");
437 }
438 }
439
440 if (timed_out) {
441 g_debug ("Parent: Child process took too long. We waited %d seconds, so we're going to kill it!",
442 EXTRACTION_PROCESS_TIMEOUT);
443 kill (child_pid, SIGKILL);
444 } else {
445 g_debug ("Parent: Data received in %2.2f seconds (timeout is %d seconds)",
446 g_timer_elapsed (timer, NULL),
447 EXTRACTION_PROCESS_TIMEOUT);
448 }
449
450 g_timer_destroy (timer);
451
452 g_object_unref (datain_stream);
453 g_object_unref (input_stream);
454
455 close (fd[0]);
456
457 return content ? g_string_free (content, FALSE) : NULL;
458 }
459
460 static void
461 extract_content_child_cleanup (int action)
462 {
463 pid_t child_pid;
464 int status;
465
466 g_debug ("Parent: Zombies, say hello to my little friend!");
467 while ((child_pid = waitpid (-1, &status, WNOHANG)) > 0) {
468 g_debug ("Parent: Zombie %d reaped", child_pid);
469 }
470 }
471
472 static gchar *
473 extract_content (PopplerDocument *document,
474 gsize n_bytes)
475 {
476 pid_t child_pid;
477 int fd[2];
478 sigset_t mask;
479 sigset_t orig_mask;
480 struct sigaction sa;
481
482 if (pipe (fd) == -1) {
483 g_warning ("Content extraction failed, call to pipe() failed");
484 return NULL;
485 }
486
487 /* Set sig mask before fork() to avoid race conditions */
488 sigemptyset (&mask);
489 sigaddset (&mask, SIGCHLD);
490
491 /* Add zombie handler */
492 sigfillset (&sa.sa_mask);
493 sa.sa_handler = extract_content_child_cleanup;
494 sa.sa_flags = 0;
495 sigaction (SIGCHLD, &sa, NULL);
496
497 if (sigprocmask (SIG_SETMASK, &mask, &orig_mask) == -1) {
498 g_warning ("Content extraction failed, call to sigprocmask() failed");
499 return NULL;
500 }
501
502 child_pid = fork ();
503
504 if (child_pid == -1) {
505 g_warning ("Content extraction failed, call to fork() failed");
506
507 close (fd[0]);
508 close (fd[1]);
509 }
510
511 if (child_pid == 0) {
512 extract_content_child_process (document, n_bytes, fd);
513 return NULL;
514 }
515
516 return extract_content_parent_process (document, fd, child_pid);
517 }
518
519 static void
520 write_pdf_data (PDFData data,
521 TrackerSparqlBuilder *metadata,
522 GPtrArray *keywords)
523 {
524 if (!tracker_is_empty_string (data.title)) {
525 tracker_sparql_builder_predicate (metadata, "nie:title");
526 tracker_sparql_builder_object_unvalidated (metadata, data.title);
527 }
528
529 if (!tracker_is_empty_string (data.subject)) {
530 tracker_sparql_builder_predicate (metadata, "nie:subject");
531 tracker_sparql_builder_object_unvalidated (metadata, data.subject);
532 }
533
534 if (!tracker_is_empty_string (data.author)) {
535 tracker_sparql_builder_predicate (metadata, "nco:creator");
536 tracker_sparql_builder_object_blank_open (metadata);
537 tracker_sparql_builder_predicate (metadata, "a");
538 tracker_sparql_builder_object (metadata, "nco:Contact");
539 tracker_sparql_builder_predicate (metadata, "nco:fullname");
540 tracker_sparql_builder_object_unvalidated (metadata, data.author);
541 tracker_sparql_builder_object_blank_close (metadata);
542 }
543
544 if (!tracker_is_empty_string (data.date)) {
545 tracker_sparql_builder_predicate (metadata, "nie:contentCreated");
546 tracker_sparql_builder_object_unvalidated (metadata, data.date);
547 }
548
549 if (!tracker_is_empty_string (data.keywords)) {
550 tracker_keywords_parse (keywords, data.keywords);
551 }
552 }
553
554 G_MODULE_EXPORT gboolean
555 tracker_extract_get_metadata (TrackerExtractInfo *info)
556 {
557 TrackerConfig *config;
558 GTime creation_date;
559 GError *error = NULL;
560 TrackerSparqlBuilder *metadata, *preupdate;
561 const gchar *graph;
562 TrackerXmpData *xd = NULL;
563 PDFData pd = { 0 }; /* actual data */
564 PDFData md = { 0 }; /* for merging */
565 PopplerDocument *document;
566 gchar *xml = NULL;
567 gchar *content, *uri;
568 guint n_bytes;
569 GPtrArray *keywords;
570 GString *where;
571 guint i;
572 GFile *file;
573 gchar *filename;
574 int fd;
575 gchar *contents = NULL;
576 gsize len;
577 struct stat st;
578
579 metadata = tracker_extract_info_get_metadata_builder (info);
580 preupdate = tracker_extract_info_get_preupdate_builder (info);
581 graph = tracker_extract_info_get_graph (info);
582
583 file = tracker_extract_info_get_file (info);
584 filename = g_file_get_path (file);
585
586 fd = tracker_file_open_fd (filename);
587
588 if (fd == -1) {
589 g_warning ("Could not open pdf file '%s': %s\n",
590 filename,
591 g_strerror (errno));
592 g_free (filename);
593 return FALSE;
594 }
595
596 if (fstat (fd, &st) == -1) {
597 g_warning ("Could not fstat pdf file '%s': %s\n",
598 filename,
599 g_strerror (errno));
600 close (fd);
601 g_free (filename);
602 return FALSE;
603 }
604
605 if (st.st_size == 0) {
606 contents = NULL;
607 len = 0;
608 } else {
609 contents = (gchar *) mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
610 if (contents == NULL || contents == MAP_FAILED) {
611 g_warning ("Could not mmap pdf file '%s': %s\n",
612 filename,
613 g_strerror (errno));
614 close (fd);
615 g_free (filename);
616 return FALSE;
617 }
618 len = st.st_size;
619 }
620
621 g_free (filename);
622 uri = g_file_get_uri (file);
623
624 document = poppler_document_new_from_data (contents, len, NULL, &error);
625
626 if (error) {
627 if (error->code == POPPLER_ERROR_ENCRYPTED) {
628 tracker_sparql_builder_predicate (metadata, "a");
629 tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
630
631 tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
632 tracker_sparql_builder_object_boolean (metadata, TRUE);
633
634 g_error_free (error);
635 g_free (uri);
636 close (fd);
637
638 return TRUE;
639 } else {
640 g_warning ("Couldn't create PopplerDocument from uri:'%s', %s",
641 uri,
642 error->message ? error->message : "no error given");
643
644 g_error_free (error);
645 g_free (uri);
646 close (fd);
647
648 return FALSE;
649 }
650 }
651
652 if (!document) {
653 g_warning ("Could not create PopplerDocument from uri:'%s', "
654 "NULL returned without an error",
655 uri);
656 g_free (uri);
657 close (fd);
658 return FALSE;
659 }
660
661 tracker_sparql_builder_predicate (metadata, "a");
662 tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
663
664 g_object_get (document,
665 "title", &pd.title,
666 "author", &pd.author,
667 "subject", &pd.subject,
668 "keywords", &pd.keywords,
669 "creation-date", &creation_date,
670 "metadata", &xml,
671 NULL);
672
673 if (creation_date > 0) {
674 pd.creation_date = tracker_date_to_string ((time_t) creation_date);
675 }
676
677 keywords = g_ptr_array_new ();
678
679 if (xml &&
680 (xd = tracker_xmp_new (xml, strlen (xml), uri)) != NULL) {
681 /* The casts here are well understood and known */
682 md.title = (gchar *) tracker_coalesce_strip (4, pd.title, xd->title, xd->title2, xd->pdf_title);
683 md.subject = (gchar *) tracker_coalesce_strip (2, pd.subject, xd->subject);
684 md.date = (gchar *) tracker_coalesce_strip (3, pd.creation_date, xd->date, xd->time_original);
685 md.author = (gchar *) tracker_coalesce_strip (2, pd.author, xd->creator);
686
687 write_pdf_data (md, metadata, keywords);
688
689 if (xd->keywords) {
690 tracker_keywords_parse (keywords, xd->keywords);
691 }
692
693 if (xd->pdf_keywords) {
694 tracker_keywords_parse (keywords, xd->pdf_keywords);
695 }
696
697 if (xd->publisher) {
698 tracker_sparql_builder_predicate (metadata, "nco:publisher");
699 tracker_sparql_builder_object_blank_open (metadata);
700 tracker_sparql_builder_predicate (metadata, "a");
701 tracker_sparql_builder_object (metadata, "nco:Contact");
702 tracker_sparql_builder_predicate (metadata, "nco:fullname");
703 tracker_sparql_builder_object_unvalidated (metadata, xd->publisher);
704 tracker_sparql_builder_object_blank_close (metadata);
705 }
706
707 if (xd->type) {
708 tracker_sparql_builder_predicate (metadata, "dc:type");
709 tracker_sparql_builder_object_unvalidated (metadata, xd->type);
710 }
711
712 if (xd->format) {
713 tracker_sparql_builder_predicate (metadata, "dc:format");
714 tracker_sparql_builder_object_unvalidated (metadata, xd->format);
715 }
716
717 if (xd->identifier) {
718 tracker_sparql_builder_predicate (metadata, "dc:identifier");
719 tracker_sparql_builder_object_unvalidated (metadata, xd->identifier);
720 }
721
722 if (xd->source) {
723 tracker_sparql_builder_predicate (metadata, "dc:source");
724 tracker_sparql_builder_object_unvalidated (metadata, xd->source);
725 }
726
727 if (xd->language) {
728 tracker_sparql_builder_predicate (metadata, "dc:language");
729 tracker_sparql_builder_object_unvalidated (metadata, xd->language);
730 }
731
732 if (xd->relation) {
733 tracker_sparql_builder_predicate (metadata, "dc:relation");
734 tracker_sparql_builder_object_unvalidated (metadata, xd->relation);
735 }
736
737 if (xd->coverage) {
738 tracker_sparql_builder_predicate (metadata, "dc:coverage");
739 tracker_sparql_builder_object_unvalidated (metadata, xd->coverage);
740 }
741
742 if (xd->license) {
743 tracker_sparql_builder_predicate (metadata, "nie:license");
744 tracker_sparql_builder_object_unvalidated (metadata, xd->license);
745 }
746
747 if (xd->make || xd->model) {
748 gchar *equip_uri;
749
750 equip_uri = tracker_sparql_escape_uri_printf ("urn:equipment:%s:%s:",
751 xd->make ? xd->make : "",
752 xd->model ? xd->model : "");
753
754 tracker_sparql_builder_insert_open (preupdate, NULL);
755 if (graph) {
756 tracker_sparql_builder_graph_open (preupdate, graph);
757 }
758
759 tracker_sparql_builder_subject_iri (preupdate, equip_uri);
760 tracker_sparql_builder_predicate (preupdate, "a");
761 tracker_sparql_builder_object (preupdate, "nfo:Equipment");
762
763 if (xd->make) {
764 tracker_sparql_builder_predicate (preupdate, "nfo:manufacturer");
765 tracker_sparql_builder_object_unvalidated (preupdate, xd->make);
766 }
767
768 if (xd->model) {
769 tracker_sparql_builder_predicate (preupdate, "nfo:model");
770 tracker_sparql_builder_object_unvalidated (preupdate, xd->model);
771 }
772
773 if (graph) {
774 tracker_sparql_builder_graph_close (preupdate);
775 }
776 tracker_sparql_builder_insert_close (preupdate);
777
778 tracker_sparql_builder_predicate (metadata, "nfo:equipment");
779 tracker_sparql_builder_object_iri (metadata, equip_uri);
780 g_free (equip_uri);
781 }
782
783 if (xd->orientation) {
784 tracker_sparql_builder_predicate (metadata, "nfo:orientation");
785 tracker_sparql_builder_object (metadata, xd->orientation);
786 }
787
788 if (xd->rights) {
789 tracker_sparql_builder_predicate (metadata, "nie:copyright");
790 tracker_sparql_builder_object_unvalidated (metadata, xd->rights);
791 }
792
793 if (xd->white_balance) {
794 tracker_sparql_builder_predicate (metadata, "nmm:whiteBalance");
795 tracker_sparql_builder_object (metadata, xd->white_balance);
796 }
797
798 if (xd->fnumber) {
799 gdouble value;
800
801 value = g_strtod (xd->fnumber, NULL);
802 tracker_sparql_builder_predicate (metadata, "nmm:fnumber");
803 tracker_sparql_builder_object_double (metadata, value);
804 }
805
806 if (xd->flash) {
807 tracker_sparql_builder_predicate (metadata, "nmm:flash");
808 tracker_sparql_builder_object (metadata, xd->flash);
809 }
810
811 if (xd->focal_length) {
812 gdouble value;
813
814 value = g_strtod (xd->focal_length, NULL);
815 tracker_sparql_builder_predicate (metadata, "nmm:focalLength");
816 tracker_sparql_builder_object_double (metadata, value);
817 }
818
819 /* Question: Shouldn't xd->Artist be merged with md.author instead? */
820
821 if (xd->artist || xd->contributor) {
822 const gchar *artist;
823
824 artist = tracker_coalesce_strip (2, xd->artist, xd->contributor);
825 tracker_sparql_builder_predicate (metadata, "nco:contributor");
826 tracker_sparql_builder_object_blank_open (metadata);
827 tracker_sparql_builder_predicate (metadata, "a");
828 tracker_sparql_builder_object (metadata, "nco:Contact");
829 tracker_sparql_builder_predicate (metadata, "nco:fullname");
830 tracker_sparql_builder_object_unvalidated (metadata, artist);
831 tracker_sparql_builder_object_blank_close (metadata);
832 }
833
834 if (xd->exposure_time) {
835 gdouble value;
836
837 value = g_strtod (xd->exposure_time, NULL);
838 tracker_sparql_builder_predicate (metadata, "nmm:exposureTime");
839 tracker_sparql_builder_object_double (metadata, value);
840 }
841
842 if (xd->iso_speed_ratings) {
843 gdouble value;
844
845 value = g_strtod (xd->iso_speed_ratings, NULL);
846 tracker_sparql_builder_predicate (metadata, "nmm:isoSpeed");
847 tracker_sparql_builder_object_double (metadata, value);
848 }
849
850 if (xd->description) {
851 tracker_sparql_builder_predicate (metadata, "nie:description");
852 tracker_sparql_builder_object_unvalidated (metadata, xd->description);
853 }
854
855 if (xd->metering_mode) {
856 tracker_sparql_builder_predicate (metadata, "nmm:meteringMode");
857 tracker_sparql_builder_object (metadata, xd->metering_mode);
858 }
859
860 if (xd->address || xd->state || xd->country || xd->city ||
861 xd->gps_altitude || xd->gps_latitude || xd-> gps_longitude) {
862
863 tracker_sparql_builder_predicate (metadata, "slo:location");
864
865 tracker_sparql_builder_object_blank_open (metadata); /* GeoLocation */
866 tracker_sparql_builder_predicate (metadata, "a");
867 tracker_sparql_builder_object (metadata, "slo:GeoLocation");
868
869 if (xd->address || xd->state || xd->country || xd->city) {
870 gchar *addruri;
871 addruri = tracker_sparql_get_uuid_urn ();
872
873 tracker_sparql_builder_predicate (metadata, "slo:postalAddress");
874 tracker_sparql_builder_object_iri (metadata, addruri);
875
876 tracker_sparql_builder_insert_open (preupdate, NULL);
877 if (graph) {
878 tracker_sparql_builder_graph_open (preupdate, graph);
879 }
880
881 tracker_sparql_builder_subject_iri (preupdate, addruri);
882
883 g_free (addruri);
884
885 tracker_sparql_builder_predicate (preupdate, "a");
886 tracker_sparql_builder_object (preupdate, "nco:PostalAddress");
887
888 if (xd->address) {
889 tracker_sparql_builder_predicate (preupdate, "nco:streetAddress");
890 tracker_sparql_builder_object_unvalidated (preupdate, xd->address);
891 }
892
893 if (xd->state) {
894 tracker_sparql_builder_predicate (preupdate, "nco:region");
895 tracker_sparql_builder_object_unvalidated (preupdate, xd->state);
896 }
897
898 if (xd->city) {
899 tracker_sparql_builder_predicate (preupdate, "nco:locality");
900 tracker_sparql_builder_object_unvalidated (preupdate, xd->city);
901 }
902
903 if (xd->country) {
904 tracker_sparql_builder_predicate (preupdate, "nco:country");
905 tracker_sparql_builder_object_unvalidated (preupdate, xd->country);
906 }
907
908 if (graph) {
909 tracker_sparql_builder_graph_close (preupdate);
910 }
911 tracker_sparql_builder_insert_close (preupdate);
912 }
913
914 if (xd->gps_altitude) {
915 tracker_sparql_builder_predicate (metadata, "slo:altitude");
916 tracker_sparql_builder_object_unvalidated (metadata, xd->gps_altitude);
917 }
918
919 if (xd->gps_latitude) {
920 tracker_sparql_builder_predicate (metadata, "slo:latitude");
921 tracker_sparql_builder_object_unvalidated (metadata, xd->gps_latitude);
922 }
923
924 if (xd->gps_longitude) {
925 tracker_sparql_builder_predicate (metadata, "slo:longitude");
926 tracker_sparql_builder_object_unvalidated (metadata, xd->gps_longitude);
927 }
928
929 tracker_sparql_builder_object_blank_close (metadata); /* GeoLocation */
930 }
931
932 if (xd->regions) {
933 tracker_xmp_apply_regions (preupdate, metadata, graph, xd);
934 }
935
936 tracker_xmp_free (xd);
937 } else {
938 /* So if we are here we have NO XMP data and we just
939 * write what we know from Poppler.
940 */
941 write_pdf_data (pd, metadata, keywords);
942 }
943
944 where = g_string_new ("");
945
946 for (i = 0; i < keywords->len; i++) {
947 gchar *p, *escaped, *var;
948
949 p = g_ptr_array_index (keywords, i);
950 escaped = tracker_sparql_escape_string (p);
951 var = g_strdup_printf ("tag%d", i + 1);
952
953 /* ensure tag with specified label exists */
954 tracker_sparql_builder_append (preupdate, "INSERT { ");
955
956 if (graph) {
957 tracker_sparql_builder_append (preupdate, "GRAPH <");
958 tracker_sparql_builder_append (preupdate, graph);
959 tracker_sparql_builder_append (preupdate, "> { ");
960 }
961
962 tracker_sparql_builder_append (preupdate,
963 "_:tag a nao:Tag ; nao:prefLabel \"");
964 tracker_sparql_builder_append (preupdate, escaped);
965 tracker_sparql_builder_append (preupdate, "\"");
966
967 if (graph) {
968 tracker_sparql_builder_append (preupdate, " } ");
969 }
970
971 tracker_sparql_builder_append (preupdate, " }\n");
972 tracker_sparql_builder_append (preupdate,
973 "WHERE { FILTER (NOT EXISTS { "
974 "?tag a nao:Tag ; nao:prefLabel \"");
975 tracker_sparql_builder_append (preupdate, escaped);
976 tracker_sparql_builder_append (preupdate,
977 "\" }) }\n");
978
979 /* associate file with tag */
980 tracker_sparql_builder_predicate (metadata, "nao:hasTag");
981 tracker_sparql_builder_object_variable (metadata, var);
982
983 g_string_append_printf (where, "?%s a nao:Tag ; nao:prefLabel \"%s\" .\n", var, escaped);
984
985 g_free (var);
986 g_free (escaped);
987 g_free (p);
988 }
989 g_ptr_array_free (keywords, TRUE);
990
991 tracker_extract_info_set_where_clause (info, where->str);
992 g_string_free (where, TRUE);
993
994 tracker_sparql_builder_predicate (metadata, "nfo:pageCount");
995 tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document));
996
997 config = tracker_main_get_config ();
998 n_bytes = tracker_config_get_max_bytes (config);
999 content = extract_content (document, n_bytes);
1000
1001 if (content) {
1002 tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
1003 tracker_sparql_builder_object_unvalidated (metadata, content);
1004 g_free (content);
1005 }
1006
1007 read_outline (document, metadata);
1008
1009 g_free (xml);
1010 g_free (pd.keywords);
1011 g_free (pd.title);
1012 g_free (pd.subject);
1013 g_free (pd.creation_date);
1014 g_free (pd.author);
1015 g_free (pd.date);
1016 g_free (uri);
1017
1018 g_object_unref (document);
1019
1020 if (contents) {
1021 munmap (contents, len);
1022 }
1023
1024 close (fd);
1025
1026 return TRUE;
1027 }