tracker-0.16.2/src/tracker-extract/tracker-extract-msoffice.c

Location Tool Test ID Function Issue
tracker-extract-msoffice.c:158:2 gcc pointer-sign msoffice_string_process_octal_triplets pointer targets in passing argument 1 of 'strlen' differ in signedness
tracker-extract-msoffice.c:252:4 gcc pointer-sign metadata_add_gvalue pointer targets in passing argument 1 of 'msoffice_string_process_octal_triplets' differ in signedness
tracker-extract-msoffice.c:422:30 gcc pointer-sign msoffice_convert_and_normalize_chunk pointer targets in passing argument 1 of 'g_convert' differ in signedness

Incomplete coverage

Tool Failure ID Location Function Message Data
clang-analyzer no-output-found tracker-extract-msoffice.c Message(text='Unable to locate XML output from invoke-clang-analyzer') None
Failure running clang-analyzer ('no-output-found')
Message
Unable to locate XML output from invoke-clang-analyzer
   1 /*
   2  * Copyright (C) 2006, Edward Duffy <eduffy@gmail.com>
   3  * Copyright (C) 2006, Laurent Aguerreche <laurent.aguerreche@free.fr>
   4  * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  * Boston, MA  02110-1301, USA.
  20  */
  21 
  22 #include "config.h"
  23 
  24 #include <errno.h>
  25 #include <string.h>
  26 
  27 #include <glib.h>
  28 
  29 #include <gsf/gsf.h>
  30 #include <gsf/gsf-doc-meta-data.h>
  31 #include <gsf/gsf-infile.h>
  32 #include <gsf/gsf-infile-msole.h>
  33 #include <gsf/gsf-input-stdio.h>
  34 #include <gsf/gsf-msole-utils.h>
  35 #include <gsf/gsf-utils.h>
  36 #include <gsf/gsf-infile-zip.h>
  37 
  38 #include <libtracker-common/tracker-utils.h>
  39 #include <libtracker-common/tracker-file-utils.h>
  40 #include <libtracker-common/tracker-os-dependant.h>
  41 
  42 #include <libtracker-extract/tracker-extract.h>
  43 
  44 #include "tracker-main.h"
  45 #include "tracker-gsf.h"
  46 
  47 /* PowerPoint files consist of structures. Each structure starts with a
  48  * header, and that header holds a record type that identifies which
  49  * structure it is.
  50  *
  51  * Here are some record types and a description of the structure
  52  * (called an atom) that each one contains.
  53  */
  54 
  55 /* An atom record that specifies Unicode characters with no high byte
  56  * of a UTF-16 Unicode character. High byte is always 0.
  57  * http://msdn.microsoft.com/en-us/library/dd947905%28v=office.12%29.aspx
  58  */
  59 #define TEXTBYTESATOM_RECORD_TYPE      0x0FA8
  60 
  61 /* An atom record that specifies Unicode characters.
  62  * http://msdn.microsoft.com/en-us/library/dd772921%28v=office.12%29.aspx
  63  */
  64 #define TEXTCHARSATOM_RECORD_TYPE      0x0FA0
  65 
  66 /* A container record that specifies information about the powerpoint
  67  * document.
  68  */
  69 #define DOCUMENTCONTAINER_RECORD_TYPE  0x03E8
  70 
  71 /* Variant type of record. For PowerPoint text extraction we are
  72  * interested in the SlideListWithTextContainer type, which contains the
  73  * textual content of the slide(s).
  74  */
  75 #define SLIDELISTWITHTEXT_RECORD_TYPE  0x0FF0
  76 
  77 /**
  78  * @brief Header for all powerpoint structures
  79  *
  80  * A structure at the beginning of each container record and each atom record in
  81  * the file. The values in the record header and the context of the record are
  82  * used to identify and interpret the record data that follows.
  83  */
  84 typedef struct {
  85 	/**
  86 	 * @brief An unsigned integer that specifies the version of the record
  87 	 * data that follows the record header. A value of 0xF specifies that the
  88 	 * record is a container record.
  89 	 */
  90 	guint recVer;
  91 
  92 	/**
  93 	 * @brief An unsigned integer that specifies the record instance data.
  94 	 * Interpretation of the value is dependent on the particular record
  95 	 * type.
  96 	 */
  97 	guint recInstance;
  98 
  99 	/**
 100 	 * @brief A RecordType enumeration that specifies the type of the record
 101 	 * data that follows the record header.
 102 	 */
 103 	gint recType;
 104 
 105 	/**
 106 	 * @brief An unsigned integer that specifies the length, in bytes, of the
 107 	 * record data that follows the record header.
 108 	 */
 109 	guint recLen;
 110 } PowerPointRecordHeader;
 111 
 112 /* Excel spec record type to read shared string */
 113 typedef enum {
 114 	RECORD_TYPE_SST      = 252,
 115 	RECORD_TYPE_CONTINUE = 60,
 116 	RECORD_TYPE_EOF      = 10
 117 } ExcelRecordType;
 118 
 119 /* ExcelBiffHeader to read excel spec header */
 120 typedef struct {
 121 	ExcelRecordType id;
 122 	guint length;
 123 } ExcelBiffHeader;
 124 
 125 /* ExtendendString Record offset in stream and length */
 126 typedef struct {
 127 	gsf_off_t offset; /* 64 bits!! */
 128 	gsize     length;
 129 } ExcelExtendedStringRecord;
 130 
 131 typedef struct {
 132 	TrackerSparqlBuilder *metadata;
 133 	const gchar *uri;
 134 } MetadataInfo;
 135 
 136 /* Valid range from \000 to \377 (0 to 255) */
 137 #define octal_ascii_triplet_is_valid(slash, a2, a1, a0) \
 138 	(slash == '\\' && \
 139 	 a2 >= '0' && a2 <= '3' && \
 140 	 a1 >= '0' && a1 <= '7' && \
 141 	 a0 >= '0' && a0 <= '7')
 142 
 143 #define octal_ascii_triplet_to_decimal_int(a2, a1, a0) \
 144 	((a0 - '0') + 8 * ((a1 - '0') + 8 * (a2 - '0')))
 145 
 146 /*
 147  * So, we may get input strings with UTF-8 characters encoded in OCTAL and
 148  * represented in ASCII, like this:
 149  *     K\303\230BENHAVNS UNIVERSITET
 150  * which is equivalent to:
 151  *     KØBENHAVNS UNIVERSITET
 152  */
 153 static void
 154 msoffice_string_process_octal_triplets (guchar *str)
 155 {
 156 	guint i = 0; /* index in original string */
 157 	guint j = 0; /* index in processed string */
 158 	guint length = strlen (str);
pointer targets in passing argument 1 of 'strlen' differ in signedness
(emitted by gcc)
159 160 /* Changing the string IN PLACE, note that j<=i ALWAYS! */ 161 while (i < length) { 162 if (length - i >= 4 && 163 octal_ascii_triplet_is_valid (str[i], str[i+1], str[i+2], str[i+3])) { 164 /* Found a new octal triplet */ 165 str[j] = octal_ascii_triplet_to_decimal_int (str[i+1], str[i+2], str[i+3]); 166 i += 4; 167 } else if (i != j) { 168 /* We previously found an octal triplet, 169 * we need to update the string */ 170 str[j] = str[i]; 171 i++; 172 } else { 173 /* No need to update the string yet */ 174 i++; 175 } 176 j++; 177 } 178 /* New end of string */ 179 str[j]='\0'; 180 } 181 182 static void 183 metadata_add_gvalue (TrackerSparqlBuilder *metadata, 184 const gchar *uri, 185 const gchar *key, 186 GValue const *val, 187 const gchar *type, 188 const gchar *predicate, 189 gboolean is_date) 190 { 191 gchar *s; 192 193 g_return_if_fail (metadata != NULL); 194 g_return_if_fail (key != NULL); 195 196 if (!val) { 197 return; 198 } 199 200 s = g_strdup_value_contents (val); 201 202 if (!s) { 203 return; 204 } 205 206 if (!tracker_is_empty_string (s)) { 207 gchar *str_val; 208 209 /* Some fun: strings are always written "str" with double quotes 210 * around, but not numbers! 211 */ 212 if (s[0] == '"') { 213 size_t len; 214 215 len = strlen (s); 216 217 if (s[len - 1] == '"') { 218 if (is_date) { 219 if (len > 2) { 220 gchar *str = g_strndup (s + 1, len - 2); 221 str_val = tracker_date_guess (str); 222 g_free (str); 223 } else { 224 str_val = NULL; 225 } 226 } else { 227 str_val = len > 2 ? g_strndup (s + 1, len - 2) : NULL; 228 } 229 } else { 230 /* We have a string that begins with a double 231 * quote but which finishes by something 232 * different... We copy the string from the 233 * beginning. 
234 */ 235 if (is_date) { 236 str_val = tracker_date_guess (s); 237 } else { 238 str_val = g_strdup (s); 239 } 240 } 241 } else { 242 /* Here, we probably have a number */ 243 if (is_date) { 244 str_val = tracker_date_guess (s); 245 } else { 246 str_val = g_strdup (s); 247 } 248 } 249 250 if (str_val) { 251 /* Process (in place) octal triplets if found */ 252 msoffice_string_process_octal_triplets (str_val);
pointer targets in passing argument 1 of 'msoffice_string_process_octal_triplets' differ in signedness
(emitted by gcc)
253 254 if (type && predicate) { 255 tracker_sparql_builder_predicate (metadata, key); 256 257 tracker_sparql_builder_object_blank_open (metadata); 258 tracker_sparql_builder_predicate (metadata, "a"); 259 tracker_sparql_builder_object (metadata, type); 260 261 tracker_sparql_builder_predicate (metadata, predicate); 262 tracker_sparql_builder_object_unvalidated (metadata, str_val); 263 tracker_sparql_builder_object_blank_close (metadata); 264 } else { 265 tracker_sparql_builder_predicate (metadata, key); 266 tracker_sparql_builder_object_unvalidated (metadata, str_val); 267 } 268 269 g_free (str_val); 270 } 271 } 272 273 g_free (s); 274 } 275 276 static void 277 summary_metadata_cb (gpointer key, 278 gpointer value, 279 gpointer user_data) 280 { 281 MetadataInfo *info = user_data; 282 GValue const *val; 283 284 val = gsf_doc_prop_get_val (value); 285 286 if (g_strcmp0 (key, "dc:title") == 0) { 287 metadata_add_gvalue (info->metadata, info->uri, "nie:title", val, NULL, NULL, FALSE); 288 } else if (g_strcmp0 (key, "dc:subject") == 0) { 289 metadata_add_gvalue (info->metadata, info->uri, "nie:subject", val, NULL, NULL, FALSE); 290 } else if (g_strcmp0 (key, "dc:creator") == 0) { 291 metadata_add_gvalue (info->metadata, info->uri, "nco:creator", val, "nco:Contact", "nco:fullname", FALSE); 292 } else if (g_strcmp0 (key, "dc:keywords") == 0) { 293 gchar *keywords = g_strdup_value_contents (val); 294 gchar *lasts, *keyw; 295 size_t len; 296 297 keyw = keywords; 298 keywords = strchr (keywords, '"'); 299 300 if (keywords) { 301 keywords++; 302 } else { 303 keywords = keyw; 304 } 305 306 len = strlen (keywords); 307 if (keywords[len - 1] == '"') { 308 keywords[len - 1] = '\0'; 309 } 310 311 for (keyw = strtok_r (keywords, ",; ", &lasts); keyw; 312 keyw = strtok_r (NULL, ",; ", &lasts)) { 313 tracker_sparql_builder_predicate (info->metadata, "nie:keyword"); 314 tracker_sparql_builder_object_unvalidated (info->metadata, keyw); 315 } 316 317 g_free (keyw); 318 } else if 
(g_strcmp0 (key, "dc:description") == 0) { 319 metadata_add_gvalue (info->metadata, info->uri, "nie:comment", val, NULL, NULL, FALSE); 320 } else if (g_strcmp0 (key, "gsf:page-count") == 0) { 321 metadata_add_gvalue (info->metadata, info->uri, "nfo:pageCount", val, NULL, NULL, FALSE); 322 } else if (g_strcmp0 (key, "gsf:word-count") == 0) { 323 metadata_add_gvalue (info->metadata, info->uri, "nfo:wordCount", val, NULL, NULL, FALSE); 324 } else if (g_strcmp0 (key, "meta:creation-date") == 0) { 325 metadata_add_gvalue (info->metadata, info->uri, "nie:contentCreated", val, NULL, NULL, TRUE); 326 } else if (g_strcmp0 (key, "meta:generator") == 0) { 327 metadata_add_gvalue (info->metadata, info->uri, "nie:generator", val, NULL, NULL, FALSE); 328 } 329 } 330 331 static void 332 document_metadata_cb (gpointer key, 333 gpointer value, 334 gpointer user_data) 335 { 336 if (g_strcmp0 (key, "CreativeCommons_LicenseURL") == 0) { 337 MetadataInfo *info = user_data; 338 339 metadata_add_gvalue (info->metadata, 340 info->uri, 341 "nie:license", 342 gsf_doc_prop_get_val (value), 343 NULL, 344 NULL, 345 FALSE); 346 } 347 } 348 349 /** 350 * @brief Read 8 bit unsigned integer 351 * @param buffer data to read integer from 352 * @return 16 bit unsigned integer 353 */ 354 static guint 355 read_8bit (const guint8 *buffer) 356 { 357 return buffer[0]; 358 } 359 360 /** 361 * @brief Read 16 bit unsigned integer 362 * @param buffer data to read integer from 363 * @return 16 bit unsigned integer 364 */ 365 static guint16 366 read_16bit (const guint8 *buffer) 367 { 368 return buffer[0] + (buffer[1] << 8); 369 } 370 371 /** 372 * @brief Read 32 bit unsigned integer 373 * @param buffer data to read integer from 374 * @return 32 bit unsigned integer 375 */ 376 static guint32 377 read_32bit (const guint8 *buffer) 378 { 379 return buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); 380 } 381 382 /** 383 * @brief Common conversion and normalization method for all msoffice type 384 
* documents. 385 * @param buffer Input buffer with the string contents 386 * @param chunk_size Number of valid bytes in the input buffer 387 * @param is_ansi If %TRUE, input text should be encoded in CP1252, and 388 * in UTF-16 otherwise. 389 * @param p_bytes_remaining Pointer to #gsize specifying how many bytes 390 * should still be considered. 391 * @param p_content Pointer to a #GString where the output normalized words 392 * will be appended. 393 */ 394 static void 395 msoffice_convert_and_normalize_chunk (guint8 *buffer, 396 gsize chunk_size, 397 gboolean is_ansi, 398 gsize *bytes_remaining, 399 GString **content) 400 { 401 gsize n_bytes_utf8; 402 gchar *converted_text; 403 GError *error = NULL; 404 405 g_return_if_fail (buffer != NULL); 406 g_return_if_fail (chunk_size > 0); 407 g_return_if_fail (bytes_remaining != NULL); 408 g_return_if_fail (content != NULL); 409 410 /* chunks can have different encoding 411 * 412 * TODO: Using g_iconv, this extra heap allocation could be 413 * avoided, re-using over and over again the same output buffer 414 * for the UTF-8 encoded string 415 */ 416 converted_text = g_convert (buffer, 417 chunk_size, 418 "UTF-8", 419 is_ansi ? "CP1252" : "UTF-16", 420 NULL, 421 &n_bytes_utf8, 422 &error);
pointer targets in passing argument 1 of 'g_convert' differ in signedness
(emitted by gcc)
423 424 if (converted_text) { 425 gsize len_to_validate; 426 427 len_to_validate = MIN (*bytes_remaining, n_bytes_utf8); 428 429 if (tracker_text_validate_utf8 (converted_text, 430 len_to_validate, 431 content, 432 NULL)) { 433 /* A whitespace is added to separate next strings appended */ 434 g_string_append_c (*content, ' '); 435 } 436 437 /* Update accumulated UTF-8 bytes read */ 438 *bytes_remaining -= len_to_validate; 439 g_free (converted_text); 440 } else { 441 g_warning ("Couldn't convert %" G_GSIZE_FORMAT " bytes from %s to UTF-8: %s", 442 chunk_size, 443 is_ansi ? "CP1252" : "UTF-16", 444 error ? error->message : "no error given"); 445 } 446 447 /* Note that error may be set even if some converted text is 448 * available, due to G_CONVERT_ERROR_ILLEGAL_SEQUENCE for example */ 449 g_clear_error (&error); 450 } 451 452 /** 453 * @brief Read header data from given stream 454 * @param stream Stream to read header data 455 * @param header Pointer to header where to store results 456 */ 457 static gboolean 458 ppt_read_header (GsfInput *stream, 459 PowerPointRecordHeader *header) 460 { 461 guint8 buffer[8] = {0}; 462 463 g_return_val_if_fail (stream, FALSE); 464 g_return_val_if_fail (header, FALSE); 465 g_return_val_if_fail (!gsf_input_eof (stream), FALSE); 466 467 468 /* Header is always 8 bytes, read it */ 469 g_return_val_if_fail (gsf_input_read (stream, 8, buffer), FALSE); 470 471 /* Then parse individual details 472 * 473 * Record header is 8 bytes long. Data is split as follows: 474 * recVer (4 bits) 475 * recInstance (12 bits) 476 * recType (2 bytes) 477 * recLen (4 bytes) 478 * 479 * See RecordHeader for more detailed explanation of each field. 480 * 481 * Here we parse each of those fields. 
482 */ 483 484 header->recType = read_16bit (&buffer[2]); 485 header->recLen = read_32bit (&buffer[4]); 486 header->recVer = (read_16bit (buffer) & 0xF000) >> 12; 487 header->recInstance = read_16bit (buffer) & 0x0FFF; 488 489 return TRUE; 490 } 491 492 /** 493 * @brief Read powerpoint text from given stream. 494 * 495 * Powerpoint contains texts in either TextBytesAtom or TextCharsAtom. Below 496 * are excerpt from [MS-PPT].pdf file describing the ppt file struture: 497 * 498 * TextCharsAtom contains an array of UTF-16 Unicode [RFC2781] characters that 499 * specifies the characters of the corresponding text. The length, in bytes, of 500 * the array is specified by rh.recLen. The array MUST NOT contain the NUL 501 * character 0x0000. 502 * 503 * TextBytesAtom contains an array of bytes that specifies the characters of the 504 * corresponding text. Each item represents the low byte of a UTF-16 Unicode 505 * [RFC2781] character whose high byte is 0x00. The length, in bytes, of the 506 * array is specified by rh.recLen. The array MUST NOT contain a 0x00 byte. 507 * 508 * @param stream Stream to read text bytes/chars atom 509 * @return read text or NULL if no text was read. Has to be freed by the caller 510 */ 511 static void 512 ppt_read_text (GsfInput *stream, 513 guint8 **p_buffer, 514 gsize *p_buffer_size, 515 gsize *p_read_size) 516 { 517 PowerPointRecordHeader header; 518 gsize required_size; 519 520 g_return_if_fail (stream); 521 g_return_if_fail (p_buffer); 522 g_return_if_fail (p_buffer_size); 523 g_return_if_fail (p_read_size); 524 525 /* First read the header that describes the structures type 526 * (TextBytesAtom or TextCharsAtom) and it's length. 527 */ 528 g_return_if_fail (ppt_read_header (stream, &header)); 529 530 /* We only want header with type either TEXTBYTESATOM_RECORD_TYPE 531 * (TextBytesAtom) or TEXTCHARSATOM_RECORD_TYPE (TextCharsAtom). 
532 * 533 * We don't care about anything else 534 */ 535 if (header.recType != TEXTBYTESATOM_RECORD_TYPE && 536 header.recType != TEXTCHARSATOM_RECORD_TYPE) { 537 return; 538 } 539 540 /* Then we'll allocate data for the actual texts */ 541 if (header.recType == TEXTBYTESATOM_RECORD_TYPE) { 542 /* TextBytesAtom doesn't include high bytes propably in order to 543 * save space on the ppt files. We'll have to allocate double the 544 * size for it to get the high bytes 545 */ 546 required_size = header.recLen * 2; 547 } else { 548 required_size = header.recLen; 549 } 550 551 /* Resize reused buffer if needed */ 552 if (required_size > *p_buffer_size) { 553 *p_buffer = g_realloc (*p_buffer, required_size); 554 *p_buffer_size = required_size; 555 } 556 557 /* Then read the textual data from the stream */ 558 if (!gsf_input_read (stream, header.recLen, *p_buffer)) { 559 return; 560 } 561 562 /* Again if we are reading TextBytesAtom we'll need to add those utf16 563 * high bytes ourselves. They are zero as specified in [MS-PPT].pdf 564 * and this function's comments 565 */ 566 if (header.recType == TEXTBYTESATOM_RECORD_TYPE) { 567 gint i; 568 569 for (i = 0; i < header.recLen; i++) { 570 /* We'll add an empty 0 byte between each byte in the array */ 571 (*p_buffer)[(header.recLen - i - 1) * 2] = (*p_buffer)[header.recLen - i - 1]; 572 (*p_buffer)[((header.recLen - i - 1) * 2) + 1] = '\0'; 573 } 574 } 575 576 /* Set read size as output */ 577 *p_read_size = required_size; 578 } 579 580 /** 581 * @brief Find a specific header from given stream 582 * @param stream Stream to parse headers from 583 * @param type1 first type of header to look for 584 * @param type2 convenience parameter if we are looking for either of two 585 * header types 586 * @param rewind if a proper header is found should this function seek 587 * to the start of the header (TRUE) 588 * @return TRUE if either of specified headers was found 589 */ 590 static gboolean 591 ppt_seek_header (GsfInput *stream, 
592 gint type1, 593 gint type2, 594 gboolean rewind) 595 { 596 PowerPointRecordHeader header; 597 598 g_return_val_if_fail (stream,FALSE); 599 600 /* Read until we reach eof */ 601 while (!gsf_input_eof (stream)) { 602 /* Read first header */ 603 g_return_val_if_fail (ppt_read_header (stream, &header), FALSE); 604 605 /* Check if it's the correct type */ 606 if (header.recType == type1 || header.recType == type2) { 607 /* Sometimes it's needed to rewind to the start of the 608 * header 609 */ 610 if (rewind) { 611 gsf_input_seek (stream, -8, G_SEEK_CUR); 612 } 613 614 return TRUE; 615 } 616 617 /* If it's not the correct type, seek to the beginning of the 618 * next header 619 */ 620 g_return_val_if_fail (!gsf_input_seek (stream, 621 header.recLen, 622 G_SEEK_CUR), 623 FALSE); 624 } 625 626 return FALSE; 627 } 628 629 static gchar * 630 extract_powerpoint_content (GsfInfile *infile, 631 gsize max_bytes, 632 gboolean *is_encrypted) 633 { 634 /* Try to find Powerpoint Document stream */ 635 GsfInput *stream; 636 GString *all_texts = NULL; 637 gsf_off_t last_document_container; 638 639 /* If no content requested, return */ 640 if (max_bytes == 0) { 641 return NULL; 642 } 643 644 stream = gsf_infile_child_by_name (infile, "PowerPoint Document"); 645 646 if (is_encrypted) { 647 *is_encrypted = FALSE; 648 } 649 650 if (!stream) { 651 return NULL; 652 } 653 654 /* Powerpoint documents have a "editing history" stored within them. 655 * There is a structure that defines what changes were made each time 656 * but it is just easier to get the current/latest version just by 657 * finding the last occurrence of DocumentContainer structure 658 */ 659 last_document_container = -1; 660 661 /* Read until we reach eof. 
*/ 662 while (!gsf_input_eof (stream)) { 663 PowerPointRecordHeader header; 664 665 /* 666 * We only read headers of data structures 667 */ 668 if (!ppt_read_header (stream, &header)) { 669 break; 670 } 671 672 /* And we only care about headers with type 1000, 673 * DocumentContainer 674 */ 675 676 if (header.recType == DOCUMENTCONTAINER_RECORD_TYPE) { 677 last_document_container = gsf_input_tell (stream); 678 } 679 680 /* and then seek to the start of the next data 681 * structure so it is fast and we don't have to read 682 * through the whole file 683 */ 684 if (gsf_input_seek (stream, header.recLen, G_SEEK_CUR)) { 685 break; 686 } 687 } 688 689 /* If a DocumentContainer was found and we are able to seek to it. 690 * 691 * Then we'll have to find the second header with type 692 * SLIDELISTWITHTEXT_RECORD_TYPE since DocumentContainer 693 * contains MasterListWithTextContainer and 694 * SlideListWithTextContainer structures with both having the 695 * same header type. We however only want 696 * SlideListWithTextContainer which contains the textual 697 * content of the power point file. 
698 */ 699 if (last_document_container >= 0 && 700 !gsf_input_seek (stream, last_document_container, G_SEEK_SET) && 701 ppt_seek_header (stream, 702 SLIDELISTWITHTEXT_RECORD_TYPE, 703 SLIDELISTWITHTEXT_RECORD_TYPE, 704 FALSE) && 705 ppt_seek_header (stream, 706 SLIDELISTWITHTEXT_RECORD_TYPE, 707 SLIDELISTWITHTEXT_RECORD_TYPE, 708 FALSE)) { 709 gsize bytes_remaining = max_bytes; 710 guint8 *buffer = NULL; 711 gsize buffer_size = 0; 712 713 /* 714 * Read while we have either TextBytesAtom or 715 * TextCharsAtom and we have read less than max_bytes 716 * (in UTF-8) 717 */ 718 while (bytes_remaining > 0 && 719 ppt_seek_header (stream, 720 TEXTBYTESATOM_RECORD_TYPE, 721 TEXTCHARSATOM_RECORD_TYPE, 722 TRUE)) { 723 gsize read_size = 0; 724 725 /* Read the UTF-16 text in the reused buffer, and also get 726 * number of read bytes */ 727 ppt_read_text (stream, &buffer, &buffer_size, &read_size); 728 729 /* Avoid empty strings */ 730 if (read_size > 0) { 731 /* Convert, normalize and limit max words & bytes. 732 * NOTE: `is_ansi' argument is FALSE, as the string is 733 * always in UTF-16 */ 734 msoffice_convert_and_normalize_chunk (buffer, 735 read_size, 736 FALSE, /* Always UTF-16 */ 737 &bytes_remaining, 738 &all_texts); 739 } 740 } 741 742 g_free (buffer); 743 } 744 745 g_object_unref (stream); 746 747 return all_texts ? 
g_string_free (all_texts, FALSE) : NULL; 748 } 749 750 static GsfInfile * 751 open_file (const gchar *filename, FILE *file) 752 { 753 GsfInput *input; 754 GsfInfile *infile; 755 GError *error = NULL; 756 757 input = gsf_input_stdio_new_FILE (filename, file, TRUE); 758 759 if (!input) { 760 return NULL; 761 } 762 763 infile = gsf_infile_msole_new (input, &error); 764 765 if (error) { 766 g_warning ("Failed to open file: %s", error->message); 767 g_error_free (error); 768 } 769 770 g_object_unref (input); 771 772 return infile; 773 } 774 775 /* This function was programmed by using ideas and algorithms from 776 * b2xtranslator project (http://b2xtranslator.sourceforge.net/) 777 */ 778 static gchar * 779 extract_msword_content (GsfInfile *infile, 780 gsize n_bytes, 781 gboolean *is_encrypted) 782 { 783 GsfInput *document_stream, *table_stream; 784 gint16 i = 0; 785 guint8 tmp_buffer[4] = { 0 }; 786 gint fcClx, lcbClx; 787 guint8 *piece_table = NULL; 788 guint8 *clx = NULL; 789 gint lcb_piece_table; 790 gint piece_count = 0; 791 gint32 fc; 792 GString *content = NULL; 793 guint8 *text_buffer = NULL; 794 gint text_buffer_size = 0; 795 gsize n_bytes_remaining; 796 797 /* If no content requested, return */ 798 if (n_bytes == 0) { 799 return NULL; 800 } 801 802 document_stream = gsf_infile_child_by_name (infile, "WordDocument"); 803 if (document_stream == NULL) { 804 return NULL; 805 } 806 807 /* abort if FIB can't be found from beginning of WordDocument stream */ 808 gsf_input_seek (document_stream, 0, G_SEEK_SET); 809 gsf_input_read (document_stream, 2, tmp_buffer); 810 if (read_16bit (tmp_buffer) != 0xa5ec) { 811 g_object_unref (document_stream); 812 return NULL; 813 } 814 815 /* abort if document is encrypted */ 816 gsf_input_seek (document_stream, 11, G_SEEK_SET); 817 gsf_input_read (document_stream, 1, tmp_buffer); 818 if ((tmp_buffer[0] & 0x1) == 0x1) { 819 g_object_unref (document_stream); 820 *is_encrypted = TRUE; 821 return NULL; 822 } else 823 *is_encrypted = 
FALSE; 824 825 /* document can have 0Table or 1Table or both. If flag 0x0200 is 826 * set to true in word 0x000A of the FIB then 1Table is used 827 */ 828 gsf_input_seek (document_stream, 0x000A, G_SEEK_SET); 829 gsf_input_read (document_stream, 2, tmp_buffer); 830 i = read_16bit (tmp_buffer); 831 832 if ((i & 0x0200) == 0x0200) { 833 table_stream = gsf_infile_child_by_name (infile, "1Table"); 834 } else { 835 table_stream = gsf_infile_child_by_name (infile, "0Table"); 836 } 837 838 if (table_stream == NULL) { 839 g_object_unref (G_OBJECT (document_stream)); 840 return NULL; 841 } 842 843 /* find out location and length of piece table from FIB */ 844 gsf_input_seek (document_stream, 418, G_SEEK_SET); 845 gsf_input_read (document_stream, 4, tmp_buffer); 846 fcClx = read_32bit (tmp_buffer); 847 gsf_input_read (document_stream, 4, tmp_buffer); 848 lcbClx = read_32bit (tmp_buffer); 849 850 /* If we got an invalid or empty length of piece table, just return 851 * as we cannot iterate over pieces */ 852 if (lcbClx <= 0) { 853 g_object_unref (document_stream); 854 g_object_unref (table_stream); 855 return NULL; 856 } 857 858 /* copy the structure holding the piece table into the clx array. */ 859 clx = g_malloc (lcbClx); 860 gsf_input_seek (table_stream, fcClx, G_SEEK_SET); 861 gsf_input_read (table_stream, lcbClx, clx); 862 863 /* find out piece table from clx and set piece_table -pointer to it */ 864 i = 0; 865 lcb_piece_table = 0; 866 867 while (TRUE) { 868 if (clx[i] == 2) { 869 /* Nice, a proper structure with contents, no need to 870 * iterate more. 
*/ 871 lcb_piece_table = read_32bit (clx + (i + 1)); 872 piece_table = clx + i + 5; 873 piece_count = (lcb_piece_table - 4) / 12; 874 break; 875 } else if (clx[i] == 1) { 876 /* Oh, a PRC structure with properties of text, not 877 * real text, so skip it */ 878 guint16 GrpPrl_len; 879 880 GrpPrl_len = read_16bit (&clx[i+1]); 881 /* 3 is the length of clxt (1byte) and cbGrpprl(2bytes) */ 882 i = i + 3 + GrpPrl_len; 883 } else { 884 break; 885 } 886 } 887 888 /* Iterate over pieces... 889 * Loop is halted whenever one of this conditions is met: 890 * a) Max bytes to be read reached 891 * b) No more pieces to read 892 */ 893 i = 0; 894 n_bytes_remaining = n_bytes; 895 while (n_bytes_remaining > 0 && 896 i < piece_count) { 897 guint8 *piece_descriptor; 898 gint piece_start; 899 gint piece_end; 900 gint piece_size; 901 gboolean is_ansi; 902 903 /* logical position of the text piece in the document_stream */ 904 piece_start = read_32bit (piece_table + (i * 4)); 905 piece_end = read_32bit (piece_table + ((i + 1) * 4)); 906 907 /* descriptor of single piece from piece table */ 908 piece_descriptor = piece_table + ((piece_count + 1) * 4) + (i * 8); 909 910 /* file character position */ 911 fc = read_32bit (piece_descriptor + 2); 912 913 /* second bit is set to 1 if text is saved in ANSI encoding */ 914 is_ansi = (fc & 0x40000000) == 0x40000000; 915 916 /* modify file character position according to text encoding */ 917 if (!is_ansi) { 918 fc = (fc & 0xBFFFFFFF); 919 } else { 920 fc = (fc & 0xBFFFFFFF) >> 1; 921 } 922 923 piece_size = piece_end - piece_start; 924 925 /* NOTE: Very very long pieces may appear. In fact, a single 926 * piece document seems to be quite normal. Thus, we limit 927 * here the number of bytes to read from the stream, based 928 * on the maximum number of bytes in UTF-8. 
Assuming, then 929 * that a safe limit is 2*n_bytes_remaining if UTF-16 input, 930 * and just n_bytes_remaining in CP1251 input */ 931 piece_size = MIN (piece_size, n_bytes_remaining); 932 933 /* UTF-16 uses twice as many bytes as CP1252 934 * NOTE: Not quite sure about this. Some unicode points will be 935 * encoded using 4 bytes in UTF-16 */ 936 if (!is_ansi) { 937 piece_size *= 2; 938 } 939 940 /* Avoid empty pieces */ 941 if (piece_size >= 1) { 942 943 /* Re-allocate buffer to make it bigger if needed. 944 * This text buffer is re-used over and over in each 945 * iteration. */ 946 if (piece_size > text_buffer_size) { 947 text_buffer = g_realloc (text_buffer, piece_size); 948 text_buffer_size = piece_size; 949 } 950 951 /* read and parse single text piece from document_stream */ 952 gsf_input_seek (document_stream, fc, G_SEEK_SET); 953 gsf_input_read (document_stream, piece_size, text_buffer); 954 955 msoffice_convert_and_normalize_chunk (text_buffer, 956 piece_size, 957 is_ansi, 958 &n_bytes_remaining, 959 &content); 960 } 961 962 /* Go on to next piece */ 963 i++; 964 } 965 966 g_free (text_buffer); 967 g_object_unref (document_stream); 968 g_object_unref (table_stream); 969 g_free (clx); 970 971 return content ? g_string_free (content, FALSE) : NULL; 972 }

/* Reads the flag byte of an Excel string and consumes the optional
 * cRun / cbExtRst fields that the flags announce. May be used just to
 * skip those fields, as when the flag byte comes as the first byte of
 * a new record.
 *
 * Any of the output arguments may be NULL when the caller does not need
 * the value; the matching field is then skipped with a seek instead of
 * being read. When an optional field is absent, or cannot be read, the
 * corresponding output is set to 0.
 *
 * NOTE: For a detailed meaning of each field parsed here, take a look
 * at the XLUnicodeRichExtendedString format:
 *  http://msdn.microsoft.com/en-us/library/dd943830.aspx
 *
 * NOTE(review): the specification quoted further below describes
 * cbExtRst as a 4-byte integer, but only its low 16 bits are returned
 * here because the output argument is a guint16. Widening it needs a
 * signature change; confirm whether values above 65535 can occur.
 */
static void
read_excel_string_flags (GsfInput *stream,
                         gboolean *p_is_high_byte,
                         guint16  *p_c_run,
                         guint16  *p_cb_ext_rst)
{
	guint8 tmp_buffer[4] = { 0 };
	guint8 bit_mask;
	gboolean is_ext_string;
	gboolean is_rich_string;

	/* One byte of flags. If the read fails the buffer is still zeroed,
	 * so the string is handled as if no flag was set. */
	gsf_input_read (stream, 1, tmp_buffer);
	bit_mask = read_8bit (tmp_buffer);

	if (p_is_high_byte) {
		*p_is_high_byte = (bit_mask & 0x01) == 0x01;
	}
	is_ext_string = (bit_mask & 0x04) == 0x04;
	is_rich_string = (bit_mask & 0x08) == 0x08;

	/* cRun: 2 bytes, only present in rich strings */
	if (is_rich_string) {
		if (!p_c_run) {
			/* Value not required, just skip those bytes */
			gsf_input_seek (stream, 2, G_SEEK_CUR);
		} else if (gsf_input_read (stream, 2, tmp_buffer)) {
			*p_c_run = read_16bit (tmp_buffer);
		} else {
			/* Do not decode stale bytes after a failed read */
			*p_c_run = 0;
		}
	} else if (p_c_run) {
		*p_c_run = 0;
	}

	/* cbExtRst: 4 bytes, only present in extended strings */
	if (is_ext_string) {
		if (!p_cb_ext_rst) {
			/* Value not required, just skip those bytes */
			gsf_input_seek (stream, 4, G_SEEK_CUR);
		} else if (gsf_input_read (stream, 4, tmp_buffer)) {
			*p_cb_ext_rst = read_16bit (tmp_buffer);
		} else {
			/* Do not decode stale bytes after a failed read */
			*p_cb_ext_rst = 0;
		}
	} else if (p_cb_ext_rst) {
		*p_cb_ext_rst = 0;
	}
}

/* Moves on to the next record when the stream position has reached or
 * passed the end of the current one.
 *
 * Returns TRUE if the record index was advanced. BUT, the value of
 * *p_current_record must still be checked by the caller against
 * record_array->len to know whether there are no more records; the
 * stream is only repositioned when a next record exists.
 */
static gboolean
change_excel_record_if_needed (GsfInput *stream,
                               GArray   *record_array,
                               guint    *p_current_record)
{
	ExcelExtendedStringRecord *record;

	record = &g_array_index (record_array,
	                         ExcelExtendedStringRecord,
	                         *p_current_record);

	/* Still inside the current record: nothing to do */
	if (gsf_input_tell (stream) < (record->offset + record->length)) {
		return FALSE;
	}

	/* We have surpassed the record, so switch to the next one... */
	(*p_current_record)++;

	if (*p_current_record < record_array->len) {
		record = &g_array_index (record_array,
		                         ExcelExtendedStringRecord,
		                         *p_current_record);

		gsf_input_seek (stream, record->offset, G_SEEK_SET);
	}

	return TRUE;
}

/* Reads chunk_size bytes of string contents into buffer.
 *
 * Returns TRUE if correctly read.
 *
 * Note that *p_current_record may get changed if the required bytes to
 * read were split into two different records.
 *
 * NOTE(review): only a split across two records is handled; the second
 * part is read without checking it against the length of the second
 * record.
 */
static gboolean
read_excel_string (GsfInput *stream,
                   guint8   *buffer,
                   gsize     chunk_size,
                   GArray   *record_array,
                   guint    *p_current_record)
{
	ExcelExtendedStringRecord *record;
	gsf_off_t current_position;
	gsf_off_t current_record_end;
	gsize chunk_size_first_record;
	gsize chunk_size_second_record;

	/* Record may have changed when we want to read the string contents.
	 * This is a pretty special case, where the new CONTINUE record
	 * shouldn't start with a bitmask */
	if (change_excel_record_if_needed (stream, record_array, p_current_record) &&
	    *p_current_record >= record_array->len) {
		/* When reached max number of records, just return */
		return FALSE;
	}

	record = &g_array_index (record_array,
	                         ExcelExtendedStringRecord,
	                         *p_current_record);

	/* Compute current position in the stream and end of current record */
	current_position = gsf_input_tell (stream);
	current_record_end = record->offset + record->length;

	/* Safety check, actually pretty important: it also guarantees that
	 * the subtraction below cannot go negative */
	if (current_record_end < current_position) {
		return FALSE;
	}

	/* Bytes still available in the current record */
	chunk_size_first_record = (gsize) (current_record_end - current_position);

	/* The best case is when the whole number of bytes to read are in the
	 * current record, as no record switching is therefore needed */
	if (chunk_size <= chunk_size_first_record) {
		return gsf_input_read (stream, chunk_size, buffer) != NULL;
	}

	/* Otherwise, read the string in two chunks */
	chunk_size_second_record = chunk_size - chunk_size_first_record;

	/* First part, up to the end of the current record... */
	if (!gsf_input_read (stream, chunk_size_first_record, buffer)) {
		return FALSE;
	}

	/* ...then switch records and read from the second one */
	(*p_current_record)++;

	if (*p_current_record >= record_array->len) {
		return FALSE;
	}

	record = &g_array_index (record_array,
	                         ExcelExtendedStringRecord,
	                         *p_current_record);

	/* Move stream pointer to the new location, beginning of next record */
	gsf_input_seek (stream, record->offset, G_SEEK_SET);

	/* Every CONTINUE record starts with a bitmask + optional fields that
	 * should be skipped properly */
	read_excel_string_flags (stream, NULL, NULL, NULL);

	/* And finally, read the second part right after the first one */
	return gsf_input_read (stream,
	                       chunk_size_second_record,
	                       &buffer[chunk_size_first_record]) != NULL;
}

/**
 * Extracts the strings stored in an SST record (plus its CONTINUE
 * records) and appends them, converted and normalized, to *p_content.
 *
 * stream:            workbook stream to read from
 * list:              array of ExcelExtendedStringRecord; the first
 *                    element is the SST record, the following ones its
 *                    CONTINUE records
 * p_bytes_remaining: in/out budget of bytes still allowed to be
 *                    extracted; passed on to
 *                    msoffice_convert_and_normalize_chunk()
 * p_content:         in/out string where the extracted text is appended
 *
 * [MS-XLS] — v20090708
 * Excel Binary File Format (.xls) Structure Specification
 * Copyright © 2009 Microsoft Corporation.
 * Release: Wednesday, July 8, 2009
 *
 * 2.5.293 XLUnicodeRichExtendedString
 * This structure specifies a Unicode string, which can contain
 * formatting information and phonetic string data.
 *
 * This structure's non-variable fields MUST be specified in the same
 * record. This structure's variable fields can be extended with
 * Continue records. A value from the table for fHighByte MUST be
 * specified in the first byte of the continue field of the Continue
 * record followed by the remaining portions of this structure's
 * variable fields.
 *
 *                      1                   2                   3
 *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  cch                             A B C D reserved2 cRun (optional)
 *  ...                             cbExtRst (optional)
 *  ...                             rgb (variable)
 *  ...
 *  rgRun (variable, optional)
 *  ...
 *  ExtRst (variable, optional)
 *  ...
 *
 * cch (2 bytes): An unsigned integer that specifies the count of
 * characters in the string.
 *
 * A - fHighByte (1 bit): A bit that specifies whether the characters
 * in rgb are double-byte characters. MUST be a value from the
 * following table:
 *
 *  Value  Meaning
 *  0x0    All the characters in the string have a high byte of 0x00
 *         and only the low bytes are in rgb.
 *  0x1    All the characters in the string are saved as double-byte
 *         characters in rgb.
 *
 * B - reserved1 (1 bit): MUST be zero, and MUST be ignored.
 * C - fExtSt (1 bit): A bit that specifies whether the string
 * contains phonetic string data.
 * D - fRichSt (1 bit): A bit that specifies whether the string is a
 * rich string and the string has at least two character formats
 * applied.
 *
 * reserved2 (4 bits): MUST be zero, and MUST be ignored.
 *
 * cRun (2 bytes): An optional unsigned integer that specifies the
 * number of elements in rgRun. MUST exist if and only if fRichSt is
 * 0x1.
 *
 * cbExtRst (4 bytes): An optional signed integer that specifies the
 * byte count of ExtRst. MUST exist if and only if fExtSt is 0x1. MUST
 * be zero or greater.
 *
 * rgb (variable): An array of bytes that specifies the characters in
 * the string. If fHighByte is 0x0, the size of the array is cch. If
 * fHighByte is 0x1, the size of the array is cch*2. If fHighByte is
 * 0x1 and rgb is extended with a Continue record the break MUST occur
 * at the double-byte character boundary.
 *
 * rgRun (variable): An optional array of FormatRun structures that
 * specifies the formatting for each text run. The number of elements
 * in the array is cRun. MUST exist if and only if fRichSt is 0x1.
 *
 * ExtRst (variable): An optional ExtRst that specifies the phonetic
 * string data. The size of this field is cbExtRst. MUST exist if and
 * only if fExtSt is 0x1.
 */
static void
xls_get_extended_record_string (GsfInput  *stream,
                                GArray    *list,
                                gsize     *p_bytes_remaining,
                                GString  **p_content)
{
	ExcelExtendedStringRecord *record;
	guint32 cst_unique;
	guint parsing_record = 0;
	guint8 tmp_buffer[4] = { 0 };
	guint i;
	guint8 *buffer = NULL;
	gsize buffer_size = 0;

	/* Start parsing at the first record of the list */
	record = &g_array_index (list, ExcelExtendedStringRecord, parsing_record);

	if (gsf_input_seek (stream, record->offset, G_SEEK_SET)) {
		return;
	}

	/* Note: The first record is ALWAYS the SST, so coming with cst_total and
	 * cst_unique values.
	 * Some extra background: Records with data longer than 8,224 bytes MUST be
	 * split into several records, so in this case, if the SST record is big
	 * enough, it will have one or more CONTINUE records
	 *
	 * SST record: http://msdn.microsoft.com/en-us/library/dd773037%28v=office.12%29.aspx
	 * CONTINUE record: http://msdn.microsoft.com/en-us/library/dd949081%28v=office.12%29.aspx
	 **/

	/* cst total (4 bytes): read only to advance the stream, value unused */
	gsf_input_read (stream, 4, tmp_buffer);

	/* cst unique (4 bytes): number of strings to iterate over */
	if (!gsf_input_read (stream, 4, tmp_buffer)) {
		return;
	}
	cst_unique = read_32bit (tmp_buffer);

	/* Iterate over chunks...
	 * Loop is halted whenever one of these conditions is met:
	 *  a) Max bytes to be read reached
	 *  b) No more chunks to read
	 *  c) The stream cannot be read any further
	 */
	for (i = 0; *p_bytes_remaining > 0 && i < cst_unique; i++) {
		guint16 cch;
		guint16 c_run;
		guint16 cb_ext_rst;
		gboolean is_high_byte;
		gsize chunk_size;

		/* RECORD may have been changed here */
		if (change_excel_record_if_needed (stream, list, &parsing_record) &&
		    parsing_record >= list->len) {
			/* When reached max number of records, stop loop */
			break;
		}

		/* cch (2 bytes) - char count of current string. Stop instead of
		 * decoding whatever an earlier read left in tmp_buffer. */
		if (!gsf_input_read (stream, 2, tmp_buffer)) {
			break;
		}
		cch = read_16bit (tmp_buffer);

		/* Read string flags */
		read_excel_string_flags (stream,
		                         &is_high_byte,
		                         &c_run,
		                         &cb_ext_rst);

		/* RECORD may have been changed here, but it is managed when reading the
		 * string contents */

		/* NOTE: In order to avoid reading unnecessary bytes, limit it based
		 * on the number of bytes remaining */
		chunk_size = MIN (cch, *p_bytes_remaining);

		/* If High Byte, chunk size *2 as stream is in UTF-16 */
		if (is_high_byte) {
			chunk_size *= 2;
		}

		/* If the new chunk size is longer than our reused buffer,
		 * make the buffer bigger */
		if (chunk_size > buffer_size) {
			buffer = g_realloc (buffer, chunk_size);
			buffer_size = chunk_size;
		}

		/* Read the chunk! NOTE that it may be split in several records... */
		if (!read_excel_string (stream, buffer, chunk_size, list, &parsing_record)) {
			break;
		}

		/* Convert the whole chunk in one operation */
		msoffice_convert_and_normalize_chunk (buffer,
		                                      chunk_size,
		                                      !is_high_byte,
		                                      p_bytes_remaining,
		                                      p_content);

		/* Formatting string */
		if (c_run > 0) {
			/* rgRun (variable): An optional array of
			 * FormatRun structures that specifies the
			 * formatting for each text run. The number of
			 * elements in the array is cRun. MUST exist
			 * if and only if fRichSt is 0x1.
			 *
			 * Note: As defined in MSDN, a FormatRun structure has a size
			 * of 4 bytes, so the size of this rgRun variable is really
			 * (4*cRun) bytes.
			 *  http://msdn.microsoft.com/en-us/library/dd921712.aspx
			 *
			 * Skipping this as it will not be useful in
			 * our case.
			 */
			gsf_input_seek (stream, 4 * c_run, G_SEEK_CUR);
			/* Note that we may be now out of the current record after having
			 * done this seek operation. */
		}

		/* ExtString */
		if (cb_ext_rst > 0) {
			/* Phonetic data; skipped as it is not expected to be
			 * useful in our case. */
			gsf_input_seek (stream, cb_ext_rst, G_SEEK_CUR);
			/* Note that we may be now out of the current record after having
			 * done this seek operation. */
		}
	}

	/* The reusable read buffer is owned by this function */
	g_free (buffer);
}

/**
 * @brief Extract excel content from specified infile
 * @param infile file to read the "Workbook" stream from
 * @param n_bytes max number of bytes to extract
 * @param is_encrypted not used by this function
 * @return newly allocated string with the extracted text (to be freed
 *         with g_free()), or NULL if nothing was extracted
 *
 * The stream is walked record by record until the SST record is found.
 * The SST record and the CONTINUE records that directly follow it are
 * collected and then parsed by xls_get_extended_record_string().
 *
 * @Notes :- About SST record
 *
 * This record specifies string constants.
 * [MS-XLS] — v20090708
 * Excel Binary File Format (.xls) Structure Specification
 * Copyright © 2009 Microsoft Corporation.
 * Release: Wednesday, July 8, 2009
 *
 * Each string constant in this record has one or more references in
 * the workbook, with the goal of improving performance in opening and
 * saving the file. The LabelSst record specifies how to make a
 * reference to a string in this record.
 *
 *                      1                   2                   3
 *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *  cstTotal
 *  cstUnique
 *  rgb (variable)
 *  ...
 *
 * cstTotal (4 bytes): A signed integer that specifies the total
 * number of references in the workbook to the strings in the shared
 * string table. MUST be greater than or equal to 0.
 *
 * cstUnique (4 bytes): A signed integer that specifies the number of
 * unique strings in the shared string table. MUST be greater than or
 * equal to 0.
 *
 * rgb (variable): An array of XLUnicodeRichExtendedString structures.
 * Records in this array are unique.
 */
static gchar*
extract_excel_content (GsfInfile *infile,
                       gsize      n_bytes,
                       gboolean  *is_encrypted)
{
	GString *content = NULL;
	GsfInput *stream;
	gsize n_bytes_remaining = n_bytes;

	/* If no content requested, return */
	if (n_bytes == 0) {
		return NULL;
	}

	stream = gsf_infile_child_by_name (infile, "Workbook");

	if (!stream) {
		return NULL;
	}

	/* Read until we reach eof or any of our limits reached */
	while (n_bytes_remaining > 0 &&
	       !gsf_input_eof (stream)) {
		ExcelBiffHeader header1;
		guint8 tmp_buffer[4] = { 0 };

		/* Every record starts with a 4 byte header: id + length.
		 * A short read means a truncated stream; stop here, otherwise
		 * an all-zero header would be parsed over and over without
		 * the stream ever advancing. */
		if (!gsf_input_read (stream, 4, tmp_buffer)) {
			break;
		}
		header1.id = read_16bit (tmp_buffer);
		header1.length = read_16bit (tmp_buffer + 2);

		/* We are interested only in SST record */
		if (header1.id == RECORD_TYPE_SST) {
			ExcelExtendedStringRecord record;
			ExcelBiffHeader header2;
			GArray *list;

			/* Save offset and length of the SST data, as the
			 * first extended string record */
			record.offset = gsf_input_tell (stream);
			record.length = header1.length;

			list = g_array_new (TRUE, TRUE, sizeof (ExcelExtendedStringRecord));
			g_array_append_val (list, record);

			/* Skip the SST data for now; we are only looking for
			 * the CONTINUE records that may follow it. */
			if (!gsf_input_seek (stream, header1.length, G_SEEK_CUR)) {
				/* Stop at the first header that cannot be read,
				 * so that a stale CONTINUE header left in
				 * tmp_buffer is never processed twice. */
				while (gsf_input_read (stream, 4, tmp_buffer)) {
					header2.id = read_16bit (tmp_buffer);
					header2.length = read_16bit (tmp_buffer + 2);

					if (header2.id != RECORD_TYPE_CONTINUE) {
						break;
					}

					/* Add it to the list, we will use it
					 * to read data */
					record.offset = gsf_input_tell (stream);
					record.length = header2.length;
					g_array_append_val (list, record);

					/* Jump over the data to the next header */
					if (gsf_input_seek (stream, header2.length, G_SEEK_CUR)) {
						break;
					}
				}
			}

			/* Read extended string */
			xls_get_extended_record_string (stream,
			                                list,
			                                &n_bytes_remaining,
			                                &content);

			g_array_unref (list);

			/* Only the SST record is of interest, so we are done */
			break;
		}

		/* Moving stream pointer to record length */
		if (gsf_input_seek (stream, header1.length, G_SEEK_CUR)) {
			break;
		}
	}

	g_object_unref (stream);

	g_debug ("Bytes extracted: %" G_GSIZE_FORMAT,
	         n_bytes - n_bytes_remaining);

	return content ? g_string_free (content, FALSE) : NULL;
}

/* Reads the OLE property stream called stream_name from infile and
 * feeds every property found to callback, together with a MetadataInfo
 * holding metadata and uri.
 *
 * description is only used in the warning printed on failure.
 *
 * Returns FALSE if the stream exists but could not be parsed; a missing
 * stream is not an error.
 */
static gboolean
read_summary_stream (TrackerSparqlBuilder *metadata,
                     GsfInfile            *infile,
                     const gchar          *uri,
                     const char           *stream_name,
                     const gchar          *description,
                     GHFunc                callback)
{
	GsfInput *stream;
	GsfDocMetaData *md;
	MetadataInfo info;
	GError *error;

	stream = gsf_infile_child_by_name (infile, stream_name);

	if (!stream) {
		return TRUE;
	}

	md = gsf_doc_meta_data_new ();
	error = gsf_doc_meta_data_read_from_msole (md, stream);

	if (error) {
		g_warning ("Could not extract %s, %s",
		           description,
		           error->message ? error->message : "no error given");

		g_error_free (error);
		g_object_unref (md);
		g_object_unref (stream);

		return FALSE;
	}

	info.metadata = metadata;
	info.uri = uri;

	gsf_doc_meta_data_foreach (md, callback, &info);

	g_object_unref (md);
	g_object_unref (stream);

	return TRUE;
}

/**
 * @brief Extract summary OLE streams from specified infile
 * @param metadata where to store summary
 * @param infile file to read summary from
 * @param uri uri of the file
 * @return FALSE as soon as one of the summary streams fails to parse,
 *         TRUE otherwise
 *
 * libgsf is neither initialized nor shut down here: the caller owns the
 * gsf_init()/gsf_shutdown() pair and keeps using the library after this
 * function returns, whatever the result.
 */
static gboolean
extract_summary (TrackerSparqlBuilder *metadata,
                 GsfInfile            *infile,
                 const gchar          *uri)
{
	tracker_sparql_builder_predicate (metadata, "a");
	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");

	if (!read_summary_stream (metadata, infile, uri,
	                          "\05SummaryInformation",
	                          "summary information",
	                          (GHFunc) summary_metadata_cb)) {
		return FALSE;
	}

	return read_summary_stream (metadata, infile, uri,
	                            "\05DocumentSummaryInformation",
	                            "document summary information",
	                            (GHFunc) document_metadata_cb);
}

/**
 * @brief Extract data from generic office files
 *
 * Extracts the document summary from the summary OLE streams and,
 * depending on the mime type, the plain text content of Word,
 * PowerPoint and Excel files.
 *
 * @param info extraction request; provides the file, its mime type and
 *        the metadata builder where the extracted data is stored
 * @return FALSE if the file could not be opened, TRUE otherwise
 */
G_MODULE_EXPORT gboolean
tracker_extract_get_metadata (TrackerExtractInfo *info)
{
	TrackerSparqlBuilder *metadata;
	TrackerConfig *config;
	GsfInfile *infile = NULL;
	gchar *content = NULL, *uri;
	gboolean is_encrypted = FALSE;
	const gchar *mime_used;
	gsize max_bytes;
	GFile *file;
	gchar *filename;
	FILE *mfile;
	int open_errno;

	gsf_init ();

	metadata = tracker_extract_info_get_metadata_builder (info);
	mime_used = tracker_extract_info_get_mimetype (info);

	file = tracker_extract_info_get_file (info);
	uri = g_file_get_uri (file);

	filename = g_filename_from_uri (uri, NULL, NULL);

	mfile = tracker_file_open (filename);
	/* Keep the reason of a failed open before anything else can
	 * overwrite errno */
	open_errno = errno;
	g_free (filename);

	if (!mfile) {
		g_warning ("Can't open file from uri '%s': %s",
		           uri, g_strerror (open_errno));
		g_free (uri);
		gsf_shutdown ();
		return FALSE;
	}

	infile = open_file (uri, mfile);
	if (!infile) {
		gsf_shutdown ();
		g_free (uri);
		tracker_file_close (mfile, FALSE);
		return FALSE;
	}

	/* Extracting summary; a failure here does not prevent the content
	 * from being extracted */
	extract_summary (metadata, infile, uri);

	/* Set max bytes to read from content */
	config = tracker_main_get_config ();
	max_bytes = tracker_config_get_max_bytes (config);

	if (g_ascii_strcasecmp (mime_used, "application/msword") == 0) {
		/* Word file */
		content = extract_msword_content (infile, max_bytes, &is_encrypted);
	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-powerpoint") == 0) {
		/* PowerPoint file */
		tracker_sparql_builder_predicate (metadata, "a");
		tracker_sparql_builder_object (metadata, "nfo:Presentation");

		content = extract_powerpoint_content (infile, max_bytes, &is_encrypted);
	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-excel") == 0) {
		/* Excel File */
		tracker_sparql_builder_predicate (metadata, "a");
		tracker_sparql_builder_object (metadata, "nfo:Spreadsheet");

		content = extract_excel_content (infile, max_bytes, &is_encrypted);
	} else {
		g_message ("Mime type was not recognised:'%s'", mime_used);
	}

	if (content) {
		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
		tracker_sparql_builder_object_unvalidated (metadata, content);
		g_free (content);
	}

	if (is_encrypted) {
		tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
		tracker_sparql_builder_object_boolean (metadata, TRUE);
	}

	g_object_unref (infile);
	g_free (uri);
	gsf_shutdown ();
	tracker_file_close (mfile, FALSE);

	return TRUE;
}