tracker-0.16.2/src/tracker-extract/tracker-extract-pdf.c

Location Tool Test ID Function Issue
tracker-extract-pdf.c:397:38 gcc maybe-uninitialized extract_content_parent_process.isra.1 'bytes_remaining' may be used uninitialized in this function
tracker-extract-pdf.c:399:39 clang-analyzer The right operand of '<' is a garbage value
   1 /*
   2  * Copyright (C) 2006, Jamie McCracken <jamiemcc@gnome.org>
   3  * Copyright (C) 2008-2011, Nokia <ivan.frade@nokia.com>
   4  * Copyright (C) 2010, Amit Aggarwal <amitcs06@gmail.com>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  * Boston, MA  02110-1301, USA.
  20  */
  21 
  22 #include "config.h"
  23 
  24 #ifndef _GNU_SOURCE
  25 #define _GNU_SOURCE
  26 #endif
  27 
  28 #include <sys/mman.h>
  29 #include <sys/types.h>
  30 #include <sys/stat.h>
  31 #include <sys/select.h>
  32 #include <sys/wait.h>
  33 #include <errno.h>
  34 #include <fcntl.h>
  35 #include <string.h>
  36 #include <unistd.h>
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 
  40 #include <glib.h>
  41 #include <glib/gstdio.h>
  42 #include <glib/poppler.h>
  43 
  44 #include <gio/gunixoutputstream.h>
  45 #include <gio/gunixinputstream.h>
  46 
  47 #include <libtracker-common/tracker-date-time.h>
  48 #include <libtracker-common/tracker-utils.h>
  49 #include <libtracker-common/tracker-file-utils.h>
  50 
  51 #include <libtracker-extract/tracker-extract.h>
  52 
  53 #include "tracker-main.h"
  54 
  55 /* Time in seconds before we kill the forked child process used for
  56  * content extraction */
  57 #define EXTRACTION_PROCESS_TIMEOUT 10
  58 
  59 /* Size of the buffer to use when reading, in bytes. The parent process
      * reads the child's output from the pipe in chunks of at most this size. */
  60 #define BUFFER_SIZE 65535
  61 
  62 typedef struct {
     	/* Descriptive strings for one PDF document. One instance holds the
     	 * values read from Poppler; a second one holds the values merged with
     	 * XMP data before they are handed to write_pdf_data(). */
  63 	gchar *title;          /* written as nie:title */
  64 	gchar *subject;        /* written as nie:subject */
  65 	gchar *creation_date;  /* string form of Poppler's "creation-date";
     	                        * not read by write_pdf_data() itself */
  66 	gchar *author;         /* written as the nco:fullname of nco:creator */
  67 	gchar *date;           /* written as nie:contentCreated */
  68 	gchar *keywords;       /* raw keyword string, split by tracker_keywords_parse() */
  69 } PDFData;
  70 
  71 static void
  72 read_toc (PopplerIndexIter  *index,
  73           GString          **toc)
  74 {
  75 	if (!index) {
  76 		return;
  77 	}
  78 
  79 	if (!*toc) {
  80 		*toc = g_string_new ("");
  81 	}
  82 
  83 	do {
  84 		PopplerAction *action;
  85 		PopplerIndexIter *iter;
  86 
  87 		action = poppler_index_iter_get_action (index);
  88 
  89 		if (!action) {
  90 			continue;
  91 		}
  92 
  93 		switch (action->type) {
  94 			case POPPLER_ACTION_GOTO_DEST: {
  95 				PopplerActionGotoDest *ag = (PopplerActionGotoDest *)action;
  96 				PopplerDest *agd = ag->dest;
  97 
  98 				if (!tracker_is_empty_string (ag->title)) {
  99 					g_string_append_printf (*toc, "%s ", ag->title);
 100 				}
 101 
 102 				if (!tracker_is_empty_string (agd->named_dest)) {
 103 					g_string_append_printf (*toc, "%s ", agd->named_dest);
 104 				}
 105 
 106 				break;
 107 			}
 108 
 109 			case POPPLER_ACTION_LAUNCH: {
 110 				PopplerActionLaunch *al = (PopplerActionLaunch *)action;
 111 
 112 				if (!tracker_is_empty_string (al->title)) {
 113 					g_string_append_printf (*toc, "%s ", al->title);
 114 				}
 115 
 116 				if (!tracker_is_empty_string (al->file_name)) {
 117 					g_string_append_printf (*toc, "%s ", al->file_name);
 118 				}
 119 
 120 				if (!tracker_is_empty_string (al->params)) {
 121 					g_string_append_printf (*toc, "%s ", al->params);
 122 				}
 123 
 124 				break;
 125 			}
 126 
 127 			case POPPLER_ACTION_URI: {
 128 				PopplerActionUri *au = (PopplerActionUri *)action;
 129 
 130 				if (!tracker_is_empty_string (au->uri)) {
 131 					g_string_append_printf (*toc, "%s ", au->uri);
 132 				}
 133 
 134 				break;
 135 			}
 136 
 137 			case POPPLER_ACTION_NAMED: {
 138 				PopplerActionNamed *an = (PopplerActionNamed *)action;
 139 
 140 				if (!tracker_is_empty_string (an->title)) {
 141 					g_string_append_printf (*toc, "%s, ", an->title);
 142 				}
 143 
 144 				if (!tracker_is_empty_string (an->named_dest)) {
 145 					g_string_append_printf (*toc, "%s ", an->named_dest);
 146 				}
 147 
 148 				break;
 149 			}
 150 
 151 			case POPPLER_ACTION_MOVIE: {
 152 				PopplerActionMovie *am = (PopplerActionMovie *)action;
 153 
 154 				if (!tracker_is_empty_string (am->title)) {
 155 					g_string_append_printf (*toc, "%s ", am->title);
 156 				}
 157 
 158 				break;
 159 			}
 160 
 161 			case POPPLER_ACTION_NONE:
 162 			case POPPLER_ACTION_UNKNOWN:
 163 			case POPPLER_ACTION_GOTO_REMOTE:
 164 			case POPPLER_ACTION_RENDITION:
 165 			case POPPLER_ACTION_OCG_STATE:
 166 			case POPPLER_ACTION_JAVASCRIPT:
 167 				/* Do nothing */
 168 				break;
 169 		}
 170 
 171 		poppler_action_free (action);
 172 		iter = poppler_index_iter_get_child (index);
 173 		read_toc (iter, toc);
 174 	} while (poppler_index_iter_next (index));
 175 
 176 	poppler_index_iter_free (index);
 177 }
 178 
 179 static void
 180 read_outline (PopplerDocument      *document,
 181               TrackerSparqlBuilder *metadata)
 182 {
 183 	PopplerIndexIter *index;
 184 	GString *toc = NULL;
 185 
 186 	index = poppler_index_iter_new (document);
 187 
 188 	if (!index) {
 189 		return;
 190 	}
 191 
 192 	read_toc (index, &toc);
 193 
 194 	if (toc) {
 195 		if (toc->len > 0) {
 196 			tracker_sparql_builder_predicate (metadata, "nfo:tableOfContents");
 197 			tracker_sparql_builder_object_unvalidated (metadata, toc->str);
 198 		}
 199 
 200 		g_string_free (toc, TRUE);
 201 	}
 202 }
 203 
 204 static GString *
 205 extract_content_text (PopplerDocument *document,
 206                       gsize            n_bytes)
 207 {
 208 	gint n_pages, i = 0;
 209 	GString *string;
 210 	GTimer *timer;
 211 	gsize remaining_bytes = n_bytes;
 212 
 213 	n_pages = poppler_document_get_n_pages (document);
 214 	string = g_string_new ("");
 215 	timer = g_timer_new ();
 216 
 217 	while (i < n_pages &&
 218 	       remaining_bytes > 0) {
 219 		PopplerPage *page;
 220 		gsize written_bytes = 0;
 221 		gchar *text;
 222 
 223 		page = poppler_document_get_page (document, i);
 224 		i++;
 225 
 226 		text = poppler_page_get_text (page);
 227 
 228 		if (!text) {
 229 			g_object_unref (page);
 230 			continue;
 231 		}
 232 
 233 		if (tracker_text_validate_utf8 (text,
 234 		                                MIN (strlen (text), remaining_bytes),
 235 		                                &string,
 236 		                                &written_bytes)) {
 237 			g_string_append_c (string, ' ');
 238 		}
 239 
 240 		remaining_bytes -= written_bytes;
 241 
 242 		g_debug ("Child: Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
 243 		         "%" G_GSIZE_FORMAT " bytes remaining",
 244 		         written_bytes, i, remaining_bytes);
 245 
 246 		g_free (text);
 247 		g_object_unref (page);
 248 	}
 249 
 250 	g_debug ("Child: Content extraction finished: %d/%d pages indexed in %2.2f seconds, "
 251 	         "%" G_GSIZE_FORMAT " bytes extracted",
 252 	         i,
 253 	         n_pages,
 254 	         g_timer_elapsed (timer, NULL),
 255 	         (n_bytes - remaining_bytes));
 256 
 257 	g_timer_destroy (timer);
 258 
 259 	return string;
 260 }
 261 
 262 static void
 263 extract_content_child_process (PopplerDocument *document,
 264                                gsize            n_bytes,
 265                                int              fd[2])
 266 {
 267 	GString *str;
 268 	gint64 size;
 269 	GOutputStream *output_stream;
 270 	GDataOutputStream *dataout_stream;
 271 
 272 	/* This is the child extracting the content, hopefully in time */
 273 
 274 	output_stream = g_unix_output_stream_new (fd[1], FALSE);
 275 	dataout_stream = g_data_output_stream_new (output_stream);
 276 	str = extract_content_text (document, n_bytes);
 277 	size = (gint64) str->len;
 278 
 279 	/* Write the results to the pipe */
 280 	if (g_data_output_stream_put_int64 (dataout_stream, size, NULL, NULL)) {
 281 		g_output_stream_write_all (output_stream,
 282 		                           str->str,
 283 		                           str->len,
 284 		                           NULL,
 285 		                           NULL,
 286 		                           NULL);
 287 	}
 288 
 289 	g_debug ("Child: Content extraction now finished in child process, "
 290 	         "written %" G_GSIZE_FORMAT " bytes to parent process",
 291 	         size);
 292 
 293 	g_string_free (str, TRUE);
 294 	g_object_unref (dataout_stream);
 295 	g_object_unref (output_stream);
 296 
 297 	close (fd[1]);
 298 
 299 	exit (0);
 300 }
 301 
 302 static gchar *
 303 extract_content_parent_process (PopplerDocument *document,
 304                                 int              fd[2],
 305                                 pid_t            child_pid)
 306 {
 307 	GInputStream *input_stream;
 308 	GDataInputStream *datain_stream;
 309 	GString *content = NULL;
 310 	GError *error = NULL;
 311 	GTimer *timer = NULL;
 312 	gsize bytes_expected = -1;
 313 	gboolean timed_out = FALSE;
 314 	gboolean finished = FALSE;
 315 	struct timeval timeout;
 316 	fd_set rfds;
 317 
 318 	/* This is the parent process waiting for the content extractor to
 319 	 * finish in time. */
 320 
 321 	g_debug ("Parent: Content extraction now starting in child process (pid = %d)", child_pid);
 322 
 323 	/* Set up gio streams */
 324 	input_stream = g_unix_input_stream_new (fd[0], FALSE);
 325 	datain_stream = g_data_input_stream_new (input_stream);
 326 
 327 	/* Watch FD to see when it has input. */
 328 	FD_ZERO(&rfds);
 329 	FD_SET(fd[0], &rfds);
 330 
 331 	/* We give the content extractor 10 seconds to do its job */
 332 	timeout.tv_sec = EXTRACTION_PROCESS_TIMEOUT;
 333 	timeout.tv_usec = 0;
 334 
 335 	/* We also use our own timer because timeouts in select()
 336 	 * can be inconsistent across UNIX platforms. Some update the
 337 	 * timeout and some don't.
 338 	 */
 339 	timer = g_timer_new ();
 340 
 341 	/* So, this is fairly simple, what we're doing here is using
 342 	 * select() to know when the child process has written some or
 343 	 * all the data and we then avoid the child blocking by
 344 	 * reading from that stream. We couple with this with a
 345 	 * timeout of 10 seconds so if we receive nothing then we know
 346 	 * we can kill the process because it is taking too long.
 347 	 *
 348 	 * We use waitpid() to know if the process quit because it has
 349 	 * finished or if it is still processing data and needs to be
 350 	 * killed.
 351 	 */
 352 	while (!finished) {
 353 		int retval;
 354 
 355 		/* 1a. Wait for data on the FD and limit by timeout */
 356 		retval = select (fd[0] + 1,  &rfds, NULL, NULL, &timeout);
 357 
 358 		/* 2. Did we error? Have data? or just timeout? */
 359 		if (retval == -1) {
 360 			perror ("select()");
 361 			finished = TRUE;
 362 		} else if (retval == 1) {
 363 			gsize bytes_remaining;
 364 			gboolean read_finished = FALSE;
 365 
 366 			if (g_timer_elapsed (timer, NULL) >= EXTRACTION_PROCESS_TIMEOUT) {
 367 				finished = TRUE;
 368 				timed_out = TRUE;
 369 				continue;
 370 			}
 371 
 372 			/* 3. Start reading data */
 373 			if (bytes_expected == -1) {
 374 				/* We only need to read the size once before the data! */
 375 				bytes_expected = (gsize) g_data_input_stream_read_int64 (datain_stream,
 376 				                                                         NULL,
 377 				                                                         &error);
 378 				if (error) {
 379 					g_warning ("Call to g_data_input_stream_read_int64() failed, %s",
 380 					           error->message);
 381 					g_error_free (error);
 382 					finished = TRUE;
 383 					continue;
 384 				}
 385 
 386 				g_debug ("Parent: Expected bytes to read is %" G_GSSIZE_FORMAT "", bytes_expected);
 387 				bytes_remaining = bytes_expected;
 388 				content = g_string_new ("");
 389 			}
 390 
 391 			/* 4. Read until done from stream and concatenate data */
 392 			while (!read_finished) {
 393 				gchar buf[BUFFER_SIZE];
 394 				gsize bytes_read;
 395 
 396 				memset (buf, 0, BUFFER_SIZE);
 397 				bytes_read = g_input_stream_read (G_INPUT_STREAM (datain_stream),
'bytes_remaining' may be used uninitialized in this function
(emitted by gcc)
398 buf, 399 MIN (BUFFER_SIZE, bytes_remaining),
The right operand of '<' is a garbage value
(emitted by clang-analyzer)

TODO: a detailed trace is available in the data model (not yet rendered in this report)

400 NULL, 401 &error); 402 403 g_debug ("Parent: Bytes read is %" G_GSSIZE_FORMAT "," 404 "bytes remaining is %" G_GSSIZE_FORMAT "", 405 bytes_read, 406 MAX (bytes_remaining - bytes_read, 0)); 407 408 if (bytes_read == -1 || error) { 409 g_warning ("Call to g_input_stream_read() failed, %s", 410 error ? error->message : "no error given"); 411 g_clear_error (&error); 412 read_finished = TRUE; 413 finished = TRUE; 414 } else { 415 content = g_string_append (content, buf); 416 417 bytes_remaining -= bytes_read; 418 bytes_remaining = MAX (bytes_remaining, 0); 419 420 if (bytes_read == 0) { 421 /* We finished reading */ 422 g_debug ("Parent: Finished reading all bytes"); 423 read_finished = TRUE; 424 } 425 426 /* Are we finished reading everything */ 427 if (bytes_remaining < 1) { 428 finished = TRUE; 429 } 430 } 431 } 432 } else { 433 /* 3. We must have timed out with no data in select() */ 434 finished = TRUE; 435 timed_out = TRUE; 436 g_debug ("Parent: Must have timed out with no data in select()"); 437 } 438 } 439 440 if (timed_out) { 441 g_debug ("Parent: Child process took too long. We waited %d seconds, so we're going to kill it!", 442 EXTRACTION_PROCESS_TIMEOUT); 443 kill (child_pid, SIGKILL); 444 } else { 445 g_debug ("Parent: Data received in %2.2f seconds (timeout is %d seconds)", 446 g_timer_elapsed (timer, NULL), 447 EXTRACTION_PROCESS_TIMEOUT); 448 } 449 450 g_timer_destroy (timer); 451 452 g_object_unref (datain_stream); 453 g_object_unref (input_stream); 454 455 close (fd[0]); 456 457 return content ? 
g_string_free (content, FALSE) : NULL; 458 } 459 460 static void 461 extract_content_child_cleanup (int action) 462 { 463 pid_t child_pid; 464 int status; 465 466 g_debug ("Parent: Zombies, say hello to my little friend!"); 467 while ((child_pid = waitpid (-1, &status, WNOHANG)) > 0) { 468 g_debug ("Parent: Zombie %d reaped", child_pid); 469 } 470 } 471 472 static gchar * 473 extract_content (PopplerDocument *document, 474 gsize n_bytes) 475 { 476 pid_t child_pid; 477 int fd[2]; 478 sigset_t mask; 479 sigset_t orig_mask; 480 struct sigaction sa; 481 482 if (pipe (fd) == -1) { 483 g_warning ("Content extraction failed, call to pipe() failed"); 484 return NULL; 485 } 486 487 /* Set sig mask before fork() to avoid race conditions */ 488 sigemptyset (&mask); 489 sigaddset (&mask, SIGCHLD); 490 491 /* Add zombie handler */ 492 sigfillset (&sa.sa_mask); 493 sa.sa_handler = extract_content_child_cleanup; 494 sa.sa_flags = 0; 495 sigaction (SIGCHLD, &sa, NULL); 496 497 if (sigprocmask (SIG_SETMASK, &mask, &orig_mask) == -1) { 498 g_warning ("Content extraction failed, call to sigprocmask() failed"); 499 return NULL; 500 } 501 502 child_pid = fork (); 503 504 if (child_pid == -1) { 505 g_warning ("Content extraction failed, call to fork() failed"); 506 507 close (fd[0]); 508 close (fd[1]); 509 } 510 511 if (child_pid == 0) { 512 extract_content_child_process (document, n_bytes, fd); 513 return NULL; 514 } 515 516 return extract_content_parent_process (document, fd, child_pid); 517 } 518 519 static void 520 write_pdf_data (PDFData data, 521 TrackerSparqlBuilder *metadata, 522 GPtrArray *keywords) 523 { 524 if (!tracker_is_empty_string (data.title)) { 525 tracker_sparql_builder_predicate (metadata, "nie:title"); 526 tracker_sparql_builder_object_unvalidated (metadata, data.title); 527 } 528 529 if (!tracker_is_empty_string (data.subject)) { 530 tracker_sparql_builder_predicate (metadata, "nie:subject"); 531 tracker_sparql_builder_object_unvalidated (metadata, data.subject); 
532 } 533 534 if (!tracker_is_empty_string (data.author)) { 535 tracker_sparql_builder_predicate (metadata, "nco:creator"); 536 tracker_sparql_builder_object_blank_open (metadata); 537 tracker_sparql_builder_predicate (metadata, "a"); 538 tracker_sparql_builder_object (metadata, "nco:Contact"); 539 tracker_sparql_builder_predicate (metadata, "nco:fullname"); 540 tracker_sparql_builder_object_unvalidated (metadata, data.author); 541 tracker_sparql_builder_object_blank_close (metadata); 542 } 543 544 if (!tracker_is_empty_string (data.date)) { 545 tracker_sparql_builder_predicate (metadata, "nie:contentCreated"); 546 tracker_sparql_builder_object_unvalidated (metadata, data.date); 547 } 548 549 if (!tracker_is_empty_string (data.keywords)) { 550 tracker_keywords_parse (keywords, data.keywords); 551 } 552 } 553 554 G_MODULE_EXPORT gboolean 555 tracker_extract_get_metadata (TrackerExtractInfo *info) 556 { 557 TrackerConfig *config; 558 GTime creation_date; 559 GError *error = NULL; 560 TrackerSparqlBuilder *metadata, *preupdate; 561 const gchar *graph; 562 TrackerXmpData *xd = NULL; 563 PDFData pd = { 0 }; /* actual data */ 564 PDFData md = { 0 }; /* for merging */ 565 PopplerDocument *document; 566 gchar *xml = NULL; 567 gchar *content, *uri; 568 guint n_bytes; 569 GPtrArray *keywords; 570 GString *where; 571 guint i; 572 GFile *file; 573 gchar *filename; 574 int fd; 575 gchar *contents = NULL; 576 gsize len; 577 struct stat st; 578 579 metadata = tracker_extract_info_get_metadata_builder (info); 580 preupdate = tracker_extract_info_get_preupdate_builder (info); 581 graph = tracker_extract_info_get_graph (info); 582 583 file = tracker_extract_info_get_file (info); 584 filename = g_file_get_path (file); 585 586 fd = tracker_file_open_fd (filename); 587 588 if (fd == -1) { 589 g_warning ("Could not open pdf file '%s': %s\n", 590 filename, 591 g_strerror (errno)); 592 g_free (filename); 593 return FALSE; 594 } 595 596 if (fstat (fd, &st) == -1) { 597 g_warning ("Could 
not fstat pdf file '%s': %s\n", 598 filename, 599 g_strerror (errno)); 600 close (fd); 601 g_free (filename); 602 return FALSE; 603 } 604 605 if (st.st_size == 0) { 606 contents = NULL; 607 len = 0; 608 } else { 609 contents = (gchar *) mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 610 if (contents == NULL || contents == MAP_FAILED) { 611 g_warning ("Could not mmap pdf file '%s': %s\n", 612 filename, 613 g_strerror (errno)); 614 close (fd); 615 g_free (filename); 616 return FALSE; 617 } 618 len = st.st_size; 619 } 620 621 g_free (filename); 622 uri = g_file_get_uri (file); 623 624 document = poppler_document_new_from_data (contents, len, NULL, &error); 625 626 if (error) { 627 if (error->code == POPPLER_ERROR_ENCRYPTED) { 628 tracker_sparql_builder_predicate (metadata, "a"); 629 tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument"); 630 631 tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted"); 632 tracker_sparql_builder_object_boolean (metadata, TRUE); 633 634 g_error_free (error); 635 g_free (uri); 636 close (fd); 637 638 return TRUE; 639 } else { 640 g_warning ("Couldn't create PopplerDocument from uri:'%s', %s", 641 uri, 642 error->message ? 
error->message : "no error given"); 643 644 g_error_free (error); 645 g_free (uri); 646 close (fd); 647 648 return FALSE; 649 } 650 } 651 652 if (!document) { 653 g_warning ("Could not create PopplerDocument from uri:'%s', " 654 "NULL returned without an error", 655 uri); 656 g_free (uri); 657 close (fd); 658 return FALSE; 659 } 660 661 tracker_sparql_builder_predicate (metadata, "a"); 662 tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument"); 663 664 g_object_get (document, 665 "title", &pd.title, 666 "author", &pd.author, 667 "subject", &pd.subject, 668 "keywords", &pd.keywords, 669 "creation-date", &creation_date, 670 "metadata", &xml, 671 NULL); 672 673 if (creation_date > 0) { 674 pd.creation_date = tracker_date_to_string ((time_t) creation_date); 675 } 676 677 keywords = g_ptr_array_new (); 678 679 if (xml && 680 (xd = tracker_xmp_new (xml, strlen (xml), uri)) != NULL) { 681 /* The casts here are well understood and known */ 682 md.title = (gchar *) tracker_coalesce_strip (4, pd.title, xd->title, xd->title2, xd->pdf_title); 683 md.subject = (gchar *) tracker_coalesce_strip (2, pd.subject, xd->subject); 684 md.date = (gchar *) tracker_coalesce_strip (3, pd.creation_date, xd->date, xd->time_original); 685 md.author = (gchar *) tracker_coalesce_strip (2, pd.author, xd->creator); 686 687 write_pdf_data (md, metadata, keywords); 688 689 if (xd->keywords) { 690 tracker_keywords_parse (keywords, xd->keywords); 691 } 692 693 if (xd->pdf_keywords) { 694 tracker_keywords_parse (keywords, xd->pdf_keywords); 695 } 696 697 if (xd->publisher) { 698 tracker_sparql_builder_predicate (metadata, "nco:publisher"); 699 tracker_sparql_builder_object_blank_open (metadata); 700 tracker_sparql_builder_predicate (metadata, "a"); 701 tracker_sparql_builder_object (metadata, "nco:Contact"); 702 tracker_sparql_builder_predicate (metadata, "nco:fullname"); 703 tracker_sparql_builder_object_unvalidated (metadata, xd->publisher); 704 
tracker_sparql_builder_object_blank_close (metadata); 705 } 706 707 if (xd->type) { 708 tracker_sparql_builder_predicate (metadata, "dc:type"); 709 tracker_sparql_builder_object_unvalidated (metadata, xd->type); 710 } 711 712 if (xd->format) { 713 tracker_sparql_builder_predicate (metadata, "dc:format"); 714 tracker_sparql_builder_object_unvalidated (metadata, xd->format); 715 } 716 717 if (xd->identifier) { 718 tracker_sparql_builder_predicate (metadata, "dc:identifier"); 719 tracker_sparql_builder_object_unvalidated (metadata, xd->identifier); 720 } 721 722 if (xd->source) { 723 tracker_sparql_builder_predicate (metadata, "dc:source"); 724 tracker_sparql_builder_object_unvalidated (metadata, xd->source); 725 } 726 727 if (xd->language) { 728 tracker_sparql_builder_predicate (metadata, "dc:language"); 729 tracker_sparql_builder_object_unvalidated (metadata, xd->language); 730 } 731 732 if (xd->relation) { 733 tracker_sparql_builder_predicate (metadata, "dc:relation"); 734 tracker_sparql_builder_object_unvalidated (metadata, xd->relation); 735 } 736 737 if (xd->coverage) { 738 tracker_sparql_builder_predicate (metadata, "dc:coverage"); 739 tracker_sparql_builder_object_unvalidated (metadata, xd->coverage); 740 } 741 742 if (xd->license) { 743 tracker_sparql_builder_predicate (metadata, "nie:license"); 744 tracker_sparql_builder_object_unvalidated (metadata, xd->license); 745 } 746 747 if (xd->make || xd->model) { 748 gchar *equip_uri; 749 750 equip_uri = tracker_sparql_escape_uri_printf ("urn:equipment:%s:%s:", 751 xd->make ? xd->make : "", 752 xd->model ? 
xd->model : ""); 753 754 tracker_sparql_builder_insert_open (preupdate, NULL); 755 if (graph) { 756 tracker_sparql_builder_graph_open (preupdate, graph); 757 } 758 759 tracker_sparql_builder_subject_iri (preupdate, equip_uri); 760 tracker_sparql_builder_predicate (preupdate, "a"); 761 tracker_sparql_builder_object (preupdate, "nfo:Equipment"); 762 763 if (xd->make) { 764 tracker_sparql_builder_predicate (preupdate, "nfo:manufacturer"); 765 tracker_sparql_builder_object_unvalidated (preupdate, xd->make); 766 } 767 768 if (xd->model) { 769 tracker_sparql_builder_predicate (preupdate, "nfo:model"); 770 tracker_sparql_builder_object_unvalidated (preupdate, xd->model); 771 } 772 773 if (graph) { 774 tracker_sparql_builder_graph_close (preupdate); 775 } 776 tracker_sparql_builder_insert_close (preupdate); 777 778 tracker_sparql_builder_predicate (metadata, "nfo:equipment"); 779 tracker_sparql_builder_object_iri (metadata, equip_uri); 780 g_free (equip_uri); 781 } 782 783 if (xd->orientation) { 784 tracker_sparql_builder_predicate (metadata, "nfo:orientation"); 785 tracker_sparql_builder_object (metadata, xd->orientation); 786 } 787 788 if (xd->rights) { 789 tracker_sparql_builder_predicate (metadata, "nie:copyright"); 790 tracker_sparql_builder_object_unvalidated (metadata, xd->rights); 791 } 792 793 if (xd->white_balance) { 794 tracker_sparql_builder_predicate (metadata, "nmm:whiteBalance"); 795 tracker_sparql_builder_object (metadata, xd->white_balance); 796 } 797 798 if (xd->fnumber) { 799 gdouble value; 800 801 value = g_strtod (xd->fnumber, NULL); 802 tracker_sparql_builder_predicate (metadata, "nmm:fnumber"); 803 tracker_sparql_builder_object_double (metadata, value); 804 } 805 806 if (xd->flash) { 807 tracker_sparql_builder_predicate (metadata, "nmm:flash"); 808 tracker_sparql_builder_object (metadata, xd->flash); 809 } 810 811 if (xd->focal_length) { 812 gdouble value; 813 814 value = g_strtod (xd->focal_length, NULL); 815 tracker_sparql_builder_predicate 
(metadata, "nmm:focalLength"); 816 tracker_sparql_builder_object_double (metadata, value); 817 } 818 819 /* Question: Shouldn't xd->Artist be merged with md.author instead? */ 820 821 if (xd->artist || xd->contributor) { 822 const gchar *artist; 823 824 artist = tracker_coalesce_strip (2, xd->artist, xd->contributor); 825 tracker_sparql_builder_predicate (metadata, "nco:contributor"); 826 tracker_sparql_builder_object_blank_open (metadata); 827 tracker_sparql_builder_predicate (metadata, "a"); 828 tracker_sparql_builder_object (metadata, "nco:Contact"); 829 tracker_sparql_builder_predicate (metadata, "nco:fullname"); 830 tracker_sparql_builder_object_unvalidated (metadata, artist); 831 tracker_sparql_builder_object_blank_close (metadata); 832 } 833 834 if (xd->exposure_time) { 835 gdouble value; 836 837 value = g_strtod (xd->exposure_time, NULL); 838 tracker_sparql_builder_predicate (metadata, "nmm:exposureTime"); 839 tracker_sparql_builder_object_double (metadata, value); 840 } 841 842 if (xd->iso_speed_ratings) { 843 gdouble value; 844 845 value = g_strtod (xd->iso_speed_ratings, NULL); 846 tracker_sparql_builder_predicate (metadata, "nmm:isoSpeed"); 847 tracker_sparql_builder_object_double (metadata, value); 848 } 849 850 if (xd->description) { 851 tracker_sparql_builder_predicate (metadata, "nie:description"); 852 tracker_sparql_builder_object_unvalidated (metadata, xd->description); 853 } 854 855 if (xd->metering_mode) { 856 tracker_sparql_builder_predicate (metadata, "nmm:meteringMode"); 857 tracker_sparql_builder_object (metadata, xd->metering_mode); 858 } 859 860 if (xd->address || xd->state || xd->country || xd->city || 861 xd->gps_altitude || xd->gps_latitude || xd-> gps_longitude) { 862 863 tracker_sparql_builder_predicate (metadata, "slo:location"); 864 865 tracker_sparql_builder_object_blank_open (metadata); /* GeoLocation */ 866 tracker_sparql_builder_predicate (metadata, "a"); 867 tracker_sparql_builder_object (metadata, "slo:GeoLocation"); 868 869 
if (xd->address || xd->state || xd->country || xd->city) { 870 gchar *addruri; 871 addruri = tracker_sparql_get_uuid_urn (); 872 873 tracker_sparql_builder_predicate (metadata, "slo:postalAddress"); 874 tracker_sparql_builder_object_iri (metadata, addruri); 875 876 tracker_sparql_builder_insert_open (preupdate, NULL); 877 if (graph) { 878 tracker_sparql_builder_graph_open (preupdate, graph); 879 } 880 881 tracker_sparql_builder_subject_iri (preupdate, addruri); 882 883 g_free (addruri); 884 885 tracker_sparql_builder_predicate (preupdate, "a"); 886 tracker_sparql_builder_object (preupdate, "nco:PostalAddress"); 887 888 if (xd->address) { 889 tracker_sparql_builder_predicate (preupdate, "nco:streetAddress"); 890 tracker_sparql_builder_object_unvalidated (preupdate, xd->address); 891 } 892 893 if (xd->state) { 894 tracker_sparql_builder_predicate (preupdate, "nco:region"); 895 tracker_sparql_builder_object_unvalidated (preupdate, xd->state); 896 } 897 898 if (xd->city) { 899 tracker_sparql_builder_predicate (preupdate, "nco:locality"); 900 tracker_sparql_builder_object_unvalidated (preupdate, xd->city); 901 } 902 903 if (xd->country) { 904 tracker_sparql_builder_predicate (preupdate, "nco:country"); 905 tracker_sparql_builder_object_unvalidated (preupdate, xd->country); 906 } 907 908 if (graph) { 909 tracker_sparql_builder_graph_close (preupdate); 910 } 911 tracker_sparql_builder_insert_close (preupdate); 912 } 913 914 if (xd->gps_altitude) { 915 tracker_sparql_builder_predicate (metadata, "slo:altitude"); 916 tracker_sparql_builder_object_unvalidated (metadata, xd->gps_altitude); 917 } 918 919 if (xd->gps_latitude) { 920 tracker_sparql_builder_predicate (metadata, "slo:latitude"); 921 tracker_sparql_builder_object_unvalidated (metadata, xd->gps_latitude); 922 } 923 924 if (xd->gps_longitude) { 925 tracker_sparql_builder_predicate (metadata, "slo:longitude"); 926 tracker_sparql_builder_object_unvalidated (metadata, xd->gps_longitude); 927 } 928 929 
tracker_sparql_builder_object_blank_close (metadata); /* GeoLocation */ 930 } 931 932 if (xd->regions) { 933 tracker_xmp_apply_regions (preupdate, metadata, graph, xd); 934 } 935 936 tracker_xmp_free (xd); 937 } else { 938 /* So if we are here we have NO XMP data and we just 939 * write what we know from Poppler. 940 */ 941 write_pdf_data (pd, metadata, keywords); 942 } 943 944 where = g_string_new (""); 945 946 for (i = 0; i < keywords->len; i++) { 947 gchar *p, *escaped, *var; 948 949 p = g_ptr_array_index (keywords, i); 950 escaped = tracker_sparql_escape_string (p); 951 var = g_strdup_printf ("tag%d", i + 1); 952 953 /* ensure tag with specified label exists */ 954 tracker_sparql_builder_append (preupdate, "INSERT { "); 955 956 if (graph) { 957 tracker_sparql_builder_append (preupdate, "GRAPH <"); 958 tracker_sparql_builder_append (preupdate, graph); 959 tracker_sparql_builder_append (preupdate, "> { "); 960 } 961 962 tracker_sparql_builder_append (preupdate, 963 "_:tag a nao:Tag ; nao:prefLabel \""); 964 tracker_sparql_builder_append (preupdate, escaped); 965 tracker_sparql_builder_append (preupdate, "\""); 966 967 if (graph) { 968 tracker_sparql_builder_append (preupdate, " } "); 969 } 970 971 tracker_sparql_builder_append (preupdate, " }\n"); 972 tracker_sparql_builder_append (preupdate, 973 "WHERE { FILTER (NOT EXISTS { " 974 "?tag a nao:Tag ; nao:prefLabel \""); 975 tracker_sparql_builder_append (preupdate, escaped); 976 tracker_sparql_builder_append (preupdate, 977 "\" }) }\n"); 978 979 /* associate file with tag */ 980 tracker_sparql_builder_predicate (metadata, "nao:hasTag"); 981 tracker_sparql_builder_object_variable (metadata, var); 982 983 g_string_append_printf (where, "?%s a nao:Tag ; nao:prefLabel \"%s\" .\n", var, escaped); 984 985 g_free (var); 986 g_free (escaped); 987 g_free (p); 988 } 989 g_ptr_array_free (keywords, TRUE); 990 991 tracker_extract_info_set_where_clause (info, where->str); 992 g_string_free (where, TRUE); 993 994 
tracker_sparql_builder_predicate (metadata, "nfo:pageCount"); 995 tracker_sparql_builder_object_int64 (metadata, poppler_document_get_n_pages (document)); 996 997 config = tracker_main_get_config (); 998 n_bytes = tracker_config_get_max_bytes (config); 999 content = extract_content (document, n_bytes); 1000 1001 if (content) { 1002 tracker_sparql_builder_predicate (metadata, "nie:plainTextContent"); 1003 tracker_sparql_builder_object_unvalidated (metadata, content); 1004 g_free (content); 1005 } 1006 1007 read_outline (document, metadata); 1008 1009 g_free (xml); 1010 g_free (pd.keywords); 1011 g_free (pd.title); 1012 g_free (pd.subject); 1013 g_free (pd.creation_date); 1014 g_free (pd.author); 1015 g_free (pd.date); 1016 g_free (uri); 1017 1018 g_object_unref (document); 1019 1020 if (contents) { 1021 munmap (contents, len); 1022 } 1023 1024 close (fd); 1025 1026 return TRUE; 1027 }