Location	Tool	Test ID	Function	Issue
tracker-extract-msoffice.c:158:2	gcc	pointer-sign	msoffice_string_process_octal_triplets	pointer targets in passing argument 1 of 'strlen' differ in signedness
tracker-extract-msoffice.c:252:4	gcc	pointer-sign	metadata_add_gvalue	pointer targets in passing argument 1 of 'msoffice_string_process_octal_triplets' differ in signedness
tracker-extract-msoffice.c:422:30	gcc	pointer-sign	msoffice_convert_and_normalize_chunk	pointer targets in passing argument 1 of 'g_convert' differ in signedness
Tool	Failure ID	Location	Function	Message	Data
clang-analyzer	no-output-found	tracker-extract-msoffice.c		Message(text='Unable to locate XML output from invoke-clang-analyzer')	None
   1 /*
   2  * Copyright (C) 2006, Edward Duffy <eduffy@gmail.com>
   3  * Copyright (C) 2006, Laurent Aguerreche <laurent.aguerreche@free.fr>
   4  * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  * Boston, MA  02110-1301, USA.
  20  */
  21 
  22 #include "config.h"
  23 
  24 #include <errno.h>
  25 #include <string.h>
  26 
  27 #include <glib.h>
  28 
  29 #include <gsf/gsf.h>
  30 #include <gsf/gsf-doc-meta-data.h>
  31 #include <gsf/gsf-infile.h>
  32 #include <gsf/gsf-infile-msole.h>
  33 #include <gsf/gsf-input-stdio.h>
  34 #include <gsf/gsf-msole-utils.h>
  35 #include <gsf/gsf-utils.h>
  36 #include <gsf/gsf-infile-zip.h>
  37 
  38 #include <libtracker-common/tracker-utils.h>
  39 #include <libtracker-common/tracker-file-utils.h>
  40 #include <libtracker-common/tracker-os-dependant.h>
  41 
  42 #include <libtracker-extract/tracker-extract.h>
  43 
  44 #include "tracker-main.h"
  45 #include "tracker-gsf.h"
  46 
   47 /* PowerPoint files consist of structures. Each structure contains a
   48  * header. Within that header is a field, called the record type, that
   49  * specifies what structure it is.
   50  *
   51  * Here are some record types and a description of the structure
   52  * (called atom) they contain.
   53  */
  54 
   55 /* TextBytesAtom: an atom record holding text as single bytes, each the
   56  * low byte of a UTF-16 Unicode character whose high byte is always 0.
   57  * http://msdn.microsoft.com/en-us/library/dd947905%28v=office.12%29.aspx
   58  */
   59 #define TEXTBYTESATOM_RECORD_TYPE      0x0FA8
   60 
   61 /* TextCharsAtom: an atom record holding text as UTF-16 Unicode characters.
   62  * http://msdn.microsoft.com/en-us/library/dd772921%28v=office.12%29.aspx
   63  */
   64 #define TEXTCHARSATOM_RECORD_TYPE      0x0FA0
   65 
   66 /* DocumentContainer: a container record that specifies information about
   67  * the PowerPoint document.
   68  */
   69 #define DOCUMENTCONTAINER_RECORD_TYPE  0x03E8
   70 
   71 /* Record type shared by more than one list container. For PowerPoint text
   72  * extraction we are interested in the SlideListWithTextContainer, which
   73  * contains the textual content of the slide(s).
   74  */
   75 #define SLIDELISTWITHTEXT_RECORD_TYPE  0x0FF0
  76 
   77 /**
   78  * @brief Header for all powerpoint structures
   79  *
   80  * A structure at the beginning of each container record and each atom record in
   81  * the file. The values in the record header and the context of the record are
   82  * used to identify and interpret the record data that follows.
       *
       * This is the unpacked, in-memory form: ppt_read_header() fills it from the
       * 8-byte header read from the stream.
   83  */
   84 typedef struct {
   85 	/**
   86 	 * @brief An unsigned integer that specifies the version of the record
   87 	 * data that follows the record header. A value of 0xF specifies that the
   88 	 * record is a container record.
   89 	 */
   90 	guint recVer;
   91 
   92 	/**
   93 	 * @brief An unsigned integer that specifies the record instance data.
   94 	 * Interpretation of the value is dependent on the particular record
   95 	 * type.
   96 	 */
   97 	guint recInstance;
   98 
   99 	/**
  100 	 * @brief A RecordType enumeration that specifies the type of the record
  101 	 * data that follows the record header.
      	 * Compared against the *_RECORD_TYPE constants defined in this file.
  102 	 */
  103 	gint recType;
  104 
  105 	/**
  106 	 * @brief An unsigned integer that specifies the length, in bytes, of the
  107 	 * record data that follows the record header.
      	 * Used as the distance to skip to reach the next record header.
  108 	 */
  109 	guint recLen;
  110 } PowerPointRecordHeader;
 111 
  112 /* Excel record type identifiers used when reading the shared strings.
       * The hexadecimal equivalents are given for comparison with the spec.
       */
  113 typedef enum {
  114 	RECORD_TYPE_SST      = 252, /* 0x00FC */
  115 	RECORD_TYPE_CONTINUE = 60,  /* 0x003C */
  116 	RECORD_TYPE_EOF      = 10   /* 0x000A */
  117 } ExcelRecordType;
 118 
  119 /* ExcelBiffHeader: header of an Excel record, holding its type and length.
       * NOTE(review): `length' is presumably the size in bytes of the record
       * data that follows the header -- the code filling it is not visible
       * here; confirm.
       */
  120 typedef struct {
  121 	ExcelRecordType id;
  122 	guint length;
  123 } ExcelBiffHeader;
 124 
  125 /* ExtendedString record: its offset in the stream and its length */
  126 typedef struct {
  127 	gsf_off_t offset; /* 64 bits!! */
  128 	gsize     length;
  129 } ExcelExtendedStringRecord;
 130 
  131 typedef struct {
      	/* Context received as `user_data' by summary_metadata_cb() and
      	 * document_metadata_cb().
      	 */
      	/* SPARQL builder that receives the extracted predicates/objects */
  132 	TrackerSparqlBuilder *metadata;
      	/* Forwarded to metadata_add_gvalue() as its `uri' argument */
  133 	const gchar *uri;
  134 } MetadataInfo;
 135 
 136 /* Valid range from \000 to \377 (0 to 255) */
 137 #define octal_ascii_triplet_is_valid(slash, a2, a1, a0) \
 138 	(slash == '\\' && \
 139 	 a2 >= '0' && a2 <= '3' && \
 140 	 a1 >= '0' && a1 <= '7' && \
 141 	 a0 >= '0' && a0 <= '7')
 142 
 143 #define octal_ascii_triplet_to_decimal_int(a2, a1, a0) \
 144 	((a0 - '0') + 8 * ((a1 - '0') + 8 * (a2 - '0')))
 145 
 146 /*
 147  * So, we may get input strings with UTF-8 characters encoded in OCTAL and
 148  * represented in ASCII, like this:
 149  *     K\303\230BENHAVNS UNIVERSITET
 150  * which is equivalent to:
 151  *     KØBENHAVNS UNIVERSITET
 152  */
 153 static void
 154 msoffice_string_process_octal_triplets (guchar *str)
 155 {
 156 	guint i = 0; /* index in original string */
 157 	guint j = 0; /* index in processed string */
 158 	guint length = strlen (str);
   pointer targets in passing argument 1 of 'strlen' differ in signedness
   (emitted by gcc) 159 
 160 	/* Changing the string IN PLACE, note that j<=i ALWAYS! */
 161 	while (i < length) {
 162 		if (length - i >= 4 &&
 163 		    octal_ascii_triplet_is_valid (str[i], str[i+1], str[i+2], str[i+3])) {
 164 			/* Found a new octal triplet */
 165 			str[j] = octal_ascii_triplet_to_decimal_int (str[i+1], str[i+2], str[i+3]);
 166 			i += 4;
 167 		} else if (i != j) {
 168 			/* We previously found an octal triplet,
 169 			 * we need to update the string */
 170 			str[j] = str[i];
 171 			i++;
 172 		} else {
 173 			/* No need to update the string yet */
 174 			i++;
 175 		}
 176 		j++;
 177 	}
 178 	/* New end of string */
 179 	str[j]='\0';
 180 }
 181 
 182 static void
 183 metadata_add_gvalue (TrackerSparqlBuilder *metadata,
 184                      const gchar          *uri,
 185                      const gchar          *key,
 186                      GValue const         *val,
 187                      const gchar          *type,
 188                      const gchar          *predicate,
 189                      gboolean              is_date)
 190 {
 191 	gchar *s;
 192 
 193 	g_return_if_fail (metadata != NULL);
 194 	g_return_if_fail (key != NULL);
 195 
 196 	if (!val) {
 197 		return;
 198 	}
 199 
 200 	s = g_strdup_value_contents (val);
 201 
 202 	if (!s) {
 203 		return;
 204 	}
 205 
 206 	if (!tracker_is_empty_string (s)) {
 207 		gchar *str_val;
 208 
 209 		/* Some fun: strings are always written "str" with double quotes
 210 		 * around, but not numbers!
 211 		 */
 212 		if (s[0] == '"') {
 213 			size_t len;
 214 
 215 			len = strlen (s);
 216 
 217 			if (s[len - 1] == '"') {
 218 				if (is_date) {
 219 					if (len > 2) {
 220 						gchar *str = g_strndup (s + 1, len - 2);
 221 						str_val = tracker_date_guess (str);
 222 						g_free (str);
 223 					} else {
 224 						str_val = NULL;
 225 					}
 226 				} else {
 227 					str_val = len > 2 ? g_strndup (s + 1, len - 2) : NULL;
 228 				}
 229 			} else {
 230 				/* We have a string that begins with a double
 231 				 * quote but which finishes by something
 232 				 * different... We copy the string from the
 233 				 * beginning.
 234 				 */
 235 				if (is_date) {
 236 					str_val = tracker_date_guess (s);
 237 				} else {
 238 					str_val = g_strdup (s);
 239 				}
 240 			}
 241 		} else {
 242 			/* Here, we probably have a number */
 243 			if (is_date) {
 244 				str_val = tracker_date_guess (s);
 245 			} else {
 246 				str_val = g_strdup (s);
 247 			}
 248 		}
 249 
 250 		if (str_val) {
 251 			/* Process (in place) octal triplets if found */
 252 			msoffice_string_process_octal_triplets (str_val);
   pointer targets in passing argument 1 of 'msoffice_string_process_octal_triplets' differ in signedness
   (emitted by gcc) 253 
 254 			if (type && predicate) {
 255 				tracker_sparql_builder_predicate (metadata, key);
 256 
 257 				tracker_sparql_builder_object_blank_open (metadata);
 258 				tracker_sparql_builder_predicate (metadata, "a");
 259 				tracker_sparql_builder_object (metadata, type);
 260 
 261 				tracker_sparql_builder_predicate (metadata, predicate);
 262 				tracker_sparql_builder_object_unvalidated (metadata, str_val);
 263 				tracker_sparql_builder_object_blank_close (metadata);
 264 			} else {
 265 				tracker_sparql_builder_predicate (metadata, key);
 266 				tracker_sparql_builder_object_unvalidated (metadata, str_val);
 267 			}
 268 
 269 			g_free (str_val);
 270 		}
 271 	}
 272 
 273 	g_free (s);
 274 }
 275 
 276 static void
 277 summary_metadata_cb (gpointer key,
 278                      gpointer value,
 279                      gpointer user_data)
 280 {
 281 	MetadataInfo *info = user_data;
 282 	GValue const *val;
 283 
 284 	val = gsf_doc_prop_get_val (value);
 285 
 286 	if (g_strcmp0 (key, "dc:title") == 0) {
 287 		metadata_add_gvalue (info->metadata, info->uri, "nie:title", val, NULL, NULL, FALSE);
 288 	} else if (g_strcmp0 (key, "dc:subject") == 0) {
 289 		metadata_add_gvalue (info->metadata, info->uri, "nie:subject", val, NULL, NULL, FALSE);
 290 	} else if (g_strcmp0 (key, "dc:creator") == 0) {
 291 		metadata_add_gvalue (info->metadata, info->uri, "nco:creator", val, "nco:Contact", "nco:fullname", FALSE);
 292 	} else if (g_strcmp0 (key, "dc:keywords") == 0) {
 293 		gchar *keywords = g_strdup_value_contents (val);
 294 		gchar *lasts, *keyw;
 295 		size_t len;
 296 
 297 		keyw = keywords;
 298 		keywords = strchr (keywords, '"');
 299 
 300 		if (keywords) {
 301 			keywords++;
 302 		} else {
 303 			keywords = keyw;
 304 		}
 305 
 306 		len = strlen (keywords);
 307 		if (keywords[len - 1] == '"') {
 308 			keywords[len - 1] = '\0';
 309 		}
 310 
 311 		for (keyw = strtok_r (keywords, ",; ", &lasts); keyw;
 312 		     keyw = strtok_r (NULL, ",; ", &lasts)) {
 313 			tracker_sparql_builder_predicate (info->metadata, "nie:keyword");
 314 			tracker_sparql_builder_object_unvalidated (info->metadata, keyw);
 315 		}
 316 
 317 		g_free (keyw);
 318 	} else if (g_strcmp0 (key, "dc:description") == 0) {
 319 		metadata_add_gvalue (info->metadata, info->uri, "nie:comment", val, NULL, NULL, FALSE);
 320 	} else if (g_strcmp0 (key, "gsf:page-count") == 0) {
 321 		metadata_add_gvalue (info->metadata, info->uri, "nfo:pageCount", val, NULL, NULL, FALSE);
 322 	} else if (g_strcmp0 (key, "gsf:word-count") == 0) {
 323 		metadata_add_gvalue (info->metadata, info->uri, "nfo:wordCount", val, NULL, NULL, FALSE);
 324 	} else if (g_strcmp0 (key, "meta:creation-date") == 0) {
 325 		metadata_add_gvalue (info->metadata, info->uri, "nie:contentCreated", val, NULL, NULL, TRUE);
 326 	} else if (g_strcmp0 (key, "meta:generator") == 0) {
 327 		metadata_add_gvalue (info->metadata, info->uri, "nie:generator", val, NULL, NULL, FALSE);
 328 	}
 329 }
 330 
 331 static void
 332 document_metadata_cb (gpointer key,
 333                       gpointer value,
 334                       gpointer user_data)
 335 {
 336 	if (g_strcmp0 (key, "CreativeCommons_LicenseURL") == 0) {
 337 		MetadataInfo *info = user_data;
 338 
 339 		metadata_add_gvalue (info->metadata,
 340 		                     info->uri,
 341 		                     "nie:license",
 342 		                     gsf_doc_prop_get_val (value),
 343 		                     NULL,
 344 		                     NULL,
 345 		                     FALSE);
 346 	}
 347 }
 348 
 349 /**
 350  * @brief Read 8 bit unsigned integer
 351  * @param buffer data to read integer from
 352  * @return 16 bit unsigned integer
 353  */
 354 static guint
 355 read_8bit (const guint8 *buffer)
 356 {
 357 	return buffer[0];
 358 }
 359 
 360 /**
 361  * @brief Read 16 bit unsigned integer
 362  * @param buffer data to read integer from
 363  * @return 16 bit unsigned integer
 364  */
 365 static guint16
 366 read_16bit (const guint8 *buffer)
 367 {
 368 	return buffer[0] + (buffer[1] << 8);
 369 }
 370 
 371 /**
 372  * @brief Read 32 bit unsigned integer
 373  * @param buffer data to read integer from
 374  * @return 32 bit unsigned integer
 375  */
 376 static guint32
 377 read_32bit (const guint8 *buffer)
 378 {
 379 	return buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
 380 }
 381 
 382 /**
 383  * @brief Common conversion and normalization method for all msoffice type
 384  *  documents.
 385  * @param buffer Input buffer with the string contents
 386  * @param chunk_size Number of valid bytes in the input buffer
 387  * @param is_ansi If %TRUE, input text should be encoded in CP1252, and
 388  *  in UTF-16 otherwise.
 389  * @param p_bytes_remaining Pointer to #gsize specifying how many bytes
 390  *  should still be considered.
 391  * @param p_content Pointer to a #GString where the output normalized words
 392  *  will be appended.
 393  */
 394 static void
 395 msoffice_convert_and_normalize_chunk (guint8    *buffer,
 396                                       gsize      chunk_size,
 397                                       gboolean   is_ansi,
 398                                       gsize     *bytes_remaining,
 399                                       GString  **content)
 400 {
 401 	gsize n_bytes_utf8;
 402 	gchar *converted_text;
 403 	GError *error = NULL;
 404 
 405 	g_return_if_fail (buffer != NULL);
 406 	g_return_if_fail (chunk_size > 0);
 407 	g_return_if_fail (bytes_remaining != NULL);
 408 	g_return_if_fail (content != NULL);
 409 
 410 	/* chunks can have different encoding
 411 	 *
 412 	 * TODO: Using g_iconv, this extra heap allocation could be
 413 	 * avoided, re-using over and over again the same output buffer
 414 	 * for the UTF-8 encoded string
 415 	 */
 416 	converted_text = g_convert (buffer,
 417 	                            chunk_size,
 418 	                            "UTF-8",
 419 	                            is_ansi ? "CP1252" : "UTF-16",
 420 	                            NULL,
 421 	                            &n_bytes_utf8,
 422 	                            &error);
   pointer targets in passing argument 1 of 'g_convert' differ in signedness
   (emitted by gcc) 423 
 424 	if (converted_text) {
 425 		gsize len_to_validate;
 426 
 427 		len_to_validate = MIN (*bytes_remaining, n_bytes_utf8);
 428 
 429 		if (tracker_text_validate_utf8 (converted_text,
 430 		                                len_to_validate,
 431 		                                content,
 432 		                                NULL)) {
 433 			/* A whitespace is added to separate next strings appended */
 434 			g_string_append_c (*content, ' ');
 435 		}
 436 
 437 		/* Update accumulated UTF-8 bytes read */
 438 		*bytes_remaining -= len_to_validate;
 439 		g_free (converted_text);
 440 	} else {
 441 		g_warning ("Couldn't convert %" G_GSIZE_FORMAT " bytes from %s to UTF-8: %s",
 442 		           chunk_size,
 443 		           is_ansi ? "CP1252" : "UTF-16",
 444 		           error ? error->message : "no error given");
 445 	}
 446 
 447 	/* Note that error may be set even if some converted text is
 448 	 * available, due to G_CONVERT_ERROR_ILLEGAL_SEQUENCE for example */
 449 	g_clear_error (&error);
 450 }
 451 
 452 /**
 453  * @brief Read header data from given stream
 454  * @param stream Stream to read header data
 455  * @param header Pointer to header where to store results
 456  */
 457 static gboolean
 458 ppt_read_header (GsfInput               *stream,
 459                  PowerPointRecordHeader *header)
 460 {
 461 	guint8 buffer[8] = {0};
 462 
 463 	g_return_val_if_fail (stream, FALSE);
 464 	g_return_val_if_fail (header, FALSE);
 465 	g_return_val_if_fail (!gsf_input_eof (stream), FALSE);
 466 
 467 
 468 	/* Header is always 8 bytes, read it */
 469 	g_return_val_if_fail (gsf_input_read (stream, 8, buffer), FALSE);
 470 
 471 	/* Then parse individual details
 472 	 *
 473 	 * Record header is 8 bytes long. Data is split as follows:
 474 	 * recVer (4 bits)
 475 	 * recInstance (12 bits)
 476 	 * recType (2 bytes)
 477 	 * recLen (4 bytes)
 478 	 *
 479 	 * See RecordHeader for more detailed explanation of each field.
 480 	 *
 481 	 * Here we parse each of those fields.
 482 	 */
 483 
 484 	header->recType = read_16bit (&buffer[2]);
 485 	header->recLen = read_32bit (&buffer[4]);
 486 	header->recVer = (read_16bit (buffer) & 0xF000) >> 12;
 487 	header->recInstance = read_16bit (buffer) & 0x0FFF;
 488 
 489 	return TRUE;
 490 }
 491 
 492 /**
 493  * @brief Read powerpoint text from given stream.
 494  *
 495  * Powerpoint contains texts in either TextBytesAtom or TextCharsAtom. Below
 496  * are excerpt from [MS-PPT].pdf file describing the ppt file struture:
 497  *
 498  * TextCharsAtom contains an array of UTF-16 Unicode [RFC2781] characters that
 499  * specifies the characters of the corresponding text. The length, in bytes, of
 500  * the array is specified by rh.recLen. The array MUST NOT contain the NUL
 501  * character 0x0000.
 502  *
 503  * TextBytesAtom contains an array of bytes that specifies the characters of the
 504  * corresponding text. Each item represents the low byte of a UTF-16 Unicode
 505  * [RFC2781] character whose high byte is 0x00. The length, in bytes, of the
 506  * array is specified by rh.recLen. The array MUST NOT contain a 0x00 byte.
 507  *
 508  * @param stream Stream to read text bytes/chars atom
 509  * @return read text or NULL if no text was read. Has to be freed by the caller
 510  */
 511 static void
 512 ppt_read_text (GsfInput  *stream,
 513                guint8   **p_buffer,
 514                gsize     *p_buffer_size,
 515                gsize     *p_read_size)
 516 {
 517 	PowerPointRecordHeader header;
 518 	gsize required_size;
 519 
 520 	g_return_if_fail (stream);
 521 	g_return_if_fail (p_buffer);
 522 	g_return_if_fail (p_buffer_size);
 523 	g_return_if_fail (p_read_size);
 524 
 525 	/* First read the header that describes the structures type
 526 	 * (TextBytesAtom or TextCharsAtom) and it's length.
 527 	 */
 528 	g_return_if_fail (ppt_read_header (stream, &header));
 529 
 530 	/* We only want header with type either TEXTBYTESATOM_RECORD_TYPE
 531 	 * (TextBytesAtom) or TEXTCHARSATOM_RECORD_TYPE (TextCharsAtom).
 532 	 *
 533 	 * We don't care about anything else
 534 	 */
 535 	if (header.recType != TEXTBYTESATOM_RECORD_TYPE &&
 536 	    header.recType != TEXTCHARSATOM_RECORD_TYPE) {
 537 		return;
 538 	}
 539 
 540 	/* Then we'll allocate data for the actual texts */
 541 	if (header.recType == TEXTBYTESATOM_RECORD_TYPE) {
 542 		/* TextBytesAtom doesn't include high bytes propably in order to
 543 		 * save space on the ppt files. We'll have to allocate double the
 544 		 * size for it to get the high bytes
 545 		 */
 546 		required_size = header.recLen * 2;
 547 	} else {
 548 		required_size = header.recLen;
 549 	}
 550 
 551 	/* Resize reused buffer if needed */
 552 	if (required_size > *p_buffer_size) {
 553 		*p_buffer = g_realloc (*p_buffer, required_size);
 554 		*p_buffer_size = required_size;
 555 	}
 556 
 557 	/* Then read the textual data from the stream */
 558 	if (!gsf_input_read (stream, header.recLen, *p_buffer)) {
 559 		return;
 560 	}
 561 
 562 	/* Again if we are reading TextBytesAtom we'll need to add those utf16
 563 	 * high bytes ourselves. They are zero as specified in [MS-PPT].pdf
 564 	 * and this function's comments
 565 	 */
 566 	if (header.recType == TEXTBYTESATOM_RECORD_TYPE) {
 567 		gint i;
 568 
 569 		for (i = 0; i < header.recLen; i++) {
 570 			/* We'll add an empty 0 byte between each byte in the array */
 571 			(*p_buffer)[(header.recLen - i - 1) * 2] = (*p_buffer)[header.recLen - i - 1];
 572 			(*p_buffer)[((header.recLen - i - 1) * 2) + 1] = '\0';
 573 		}
 574 	}
 575 
 576 	/* Set read size as output */
 577 	*p_read_size = required_size;
 578 }
 579 
 580 /**
 581  * @brief Find a specific header from given stream
 582  * @param stream Stream to parse headers from
 583  * @param type1 first type of header to look for
 584  * @param type2 convenience parameter if we are looking for either of two
 585  * header types
 586  * @param rewind if a proper header is found should this function seek
 587  * to the start of the header (TRUE)
 588  * @return TRUE if either of specified headers was found
 589  */
 590 static gboolean
 591 ppt_seek_header (GsfInput *stream,
 592                  gint      type1,
 593                  gint      type2,
 594                  gboolean  rewind)
 595 {
 596 	PowerPointRecordHeader header;
 597 
 598 	g_return_val_if_fail (stream,FALSE);
 599 
 600 	/* Read until we reach eof */
 601 	while (!gsf_input_eof (stream)) {
 602 		/* Read first header */
 603 		g_return_val_if_fail (ppt_read_header (stream, &header), FALSE);
 604 
 605 		/* Check if it's the correct type */
 606 		if (header.recType == type1 || header.recType == type2) {
 607 			/* Sometimes it's needed to rewind to the start of the
 608 			 * header
 609 			 */
 610 			if (rewind) {
 611 				gsf_input_seek (stream, -8, G_SEEK_CUR);
 612 			}
 613 
 614 			return TRUE;
 615 		}
 616 
 617 		/* If it's not the correct type, seek to the beginning of the
 618 		 * next header
 619 		 */
 620 		g_return_val_if_fail (!gsf_input_seek (stream,
 621 		                                       header.recLen,
 622 		                                       G_SEEK_CUR),
 623 		                      FALSE);
 624 	}
 625 
 626 	return FALSE;
 627 }
 628 
 629 static gchar *
 630 extract_powerpoint_content (GsfInfile *infile,
 631                             gsize      max_bytes,
 632                             gboolean  *is_encrypted)
 633 {
 634 	/* Try to find Powerpoint Document stream */
 635 	GsfInput *stream;
 636 	GString *all_texts = NULL;
 637 	gsf_off_t last_document_container;
 638 
 639 	/* If no content requested, return */
 640 	if (max_bytes == 0) {
 641 		return NULL;
 642 	}
 643 
 644 	stream = gsf_infile_child_by_name (infile, "PowerPoint Document");
 645 
 646 	if (is_encrypted) {
 647 		*is_encrypted = FALSE;
 648 	}
 649 
 650 	if (!stream) {
 651 		return NULL;
 652 	}
 653 
 654 	/* Powerpoint documents have a "editing history" stored within them.
 655 	 * There is a structure that defines what changes were made each time
 656 	 * but it is just easier to get the current/latest version just by
 657 	 * finding the last occurrence of DocumentContainer structure
 658 	 */
 659 	last_document_container = -1;
 660 
 661 	/* Read until we reach eof. */
 662 	while (!gsf_input_eof (stream)) {
 663 		PowerPointRecordHeader header;
 664 
 665 		/*
 666 		 * We only read headers of data structures
 667 		 */
 668 		if (!ppt_read_header (stream, &header)) {
 669 			break;
 670 		}
 671 
 672 		/* And we only care about headers with type 1000,
 673 		 * DocumentContainer
 674 		 */
 675 
 676 		if (header.recType == DOCUMENTCONTAINER_RECORD_TYPE) {
 677 			last_document_container = gsf_input_tell (stream);
 678 		}
 679 
 680 		/* and then seek to the start of the next data
 681 		 * structure so it is fast and we don't have to read
 682 		 * through the whole file
 683 		 */
 684 		if (gsf_input_seek (stream, header.recLen, G_SEEK_CUR)) {
 685 			break;
 686 		}
 687 	}
 688 
 689 	/* If a DocumentContainer was found and we are able to seek to it.
 690 	 *
 691 	 * Then we'll have to find the second header with type
 692 	 * SLIDELISTWITHTEXT_RECORD_TYPE since DocumentContainer
 693 	 * contains MasterListWithTextContainer and
 694 	 * SlideListWithTextContainer structures with both having the
 695 	 * same header type. We however only want
 696 	 * SlideListWithTextContainer which contains the textual
 697 	 * content of the power point file.
 698 	 */
 699 	if (last_document_container >= 0 &&
 700 	    !gsf_input_seek (stream, last_document_container, G_SEEK_SET) &&
 701 	    ppt_seek_header (stream,
 702 	                     SLIDELISTWITHTEXT_RECORD_TYPE,
 703 	                     SLIDELISTWITHTEXT_RECORD_TYPE,
 704 	                     FALSE) &&
 705 	    ppt_seek_header (stream,
 706 	                     SLIDELISTWITHTEXT_RECORD_TYPE,
 707 	                     SLIDELISTWITHTEXT_RECORD_TYPE,
 708 	                     FALSE)) {
 709 		gsize bytes_remaining = max_bytes;
 710 		guint8 *buffer = NULL;
 711 		gsize buffer_size = 0;
 712 
 713 		/*
 714 		 * Read while we have either TextBytesAtom or
 715 		 * TextCharsAtom and we have read less than max_bytes
 716 		 * (in UTF-8)
 717 		 */
 718 		while (bytes_remaining > 0 &&
 719 		       ppt_seek_header (stream,
 720 		                        TEXTBYTESATOM_RECORD_TYPE,
 721 		                        TEXTCHARSATOM_RECORD_TYPE,
 722 		                        TRUE)) {
 723 			gsize read_size = 0;
 724 
 725 			/* Read the UTF-16 text in the reused buffer, and also get
 726 			 *  number of read bytes */
 727 			ppt_read_text (stream, &buffer, &buffer_size, &read_size);
 728 
 729 			/* Avoid empty strings */
 730 			if (read_size > 0) {
 731 				/* Convert, normalize and limit max words & bytes.
 732 				 * NOTE: `is_ansi' argument is FALSE, as the string is
 733 				 *  always in UTF-16 */
 734 				msoffice_convert_and_normalize_chunk (buffer,
 735 				                                      read_size,
 736 				                                      FALSE, /* Always UTF-16 */
 737 				                                      &bytes_remaining,
 738 				                                      &all_texts);
 739 			}
 740 		}
 741 
 742 		g_free (buffer);
 743 	}
 744 
 745 	g_object_unref (stream);
 746 
 747 	return all_texts ? g_string_free (all_texts, FALSE) : NULL;
 748 }
 749 
 750 static GsfInfile *
 751 open_file (const gchar *filename, FILE *file)
 752 {
 753 	GsfInput *input;
 754 	GsfInfile *infile;
 755 	GError *error = NULL;
 756 
 757 	input = gsf_input_stdio_new_FILE (filename, file, TRUE);
 758 	
 759 	if (!input) {
 760 		return NULL;
 761 	}
 762 
 763 	infile = gsf_infile_msole_new (input, &error);
 764 
 765 	if (error) {
 766 		g_warning ("Failed to open file: %s", error->message);
 767 		g_error_free (error);
 768 	}
 769 
 770 	g_object_unref (input);
 771 
 772 	return infile;
 773 }
 774 
 775 /* This function was programmed by using ideas and algorithms from
 776  * b2xtranslator project (http://b2xtranslator.sourceforge.net/)
 777  */
 778 static gchar *
 779 extract_msword_content (GsfInfile *infile,
 780                         gsize      n_bytes,
 781                         gboolean  *is_encrypted)
 782 {
 783 	GsfInput *document_stream, *table_stream;
 784 	gint16 i = 0;
 785 	guint8 tmp_buffer[4] = { 0 };
 786 	gint fcClx, lcbClx;
 787 	guint8 *piece_table = NULL;
 788 	guint8 *clx = NULL;
 789 	gint lcb_piece_table;
 790 	gint piece_count = 0;
 791 	gint32 fc;
 792 	GString *content = NULL;
 793 	guint8 *text_buffer = NULL;
 794 	gint text_buffer_size = 0;
 795 	gsize n_bytes_remaining;
 796 
 797 	/* If no content requested, return */
 798 	if (n_bytes == 0) {
 799 		return NULL;
 800 	}
 801 
 802 	document_stream = gsf_infile_child_by_name (infile, "WordDocument");
 803 	if (document_stream == NULL) {
 804 		return NULL;
 805 	}
 806 
 807 	/* abort if FIB can't be found from beginning of WordDocument stream */
 808 	gsf_input_seek (document_stream, 0, G_SEEK_SET);
 809 	gsf_input_read (document_stream, 2, tmp_buffer);
 810 	if (read_16bit (tmp_buffer) != 0xa5ec) {
 811 		g_object_unref (document_stream);
 812 		return NULL;
 813 	}
 814 
 815 	/* abort if document is encrypted */
 816 	gsf_input_seek (document_stream, 11, G_SEEK_SET);
 817 	gsf_input_read (document_stream, 1, tmp_buffer);
 818 	if ((tmp_buffer[0] & 0x1) == 0x1) {
 819 		g_object_unref (document_stream);
 820 		*is_encrypted = TRUE;
 821 		return NULL;
 822 	} else
 823 		*is_encrypted = FALSE;
 824 
 825 	/* document can have 0Table or 1Table or both. If flag 0x0200 is
 826 	 * set to true in word 0x000A of the FIB then 1Table is used
 827 	 */
 828 	gsf_input_seek (document_stream, 0x000A, G_SEEK_SET);
 829 	gsf_input_read (document_stream, 2, tmp_buffer);
 830 	i = read_16bit (tmp_buffer);
 831 
 832 	if ((i & 0x0200) == 0x0200) {
 833 		table_stream = gsf_infile_child_by_name (infile, "1Table");
 834 	} else {
 835 		table_stream = gsf_infile_child_by_name (infile, "0Table");
 836 	}
 837 
 838 	if (table_stream == NULL) {
 839 		g_object_unref (G_OBJECT (document_stream));
 840 		return NULL;
 841 	}
 842 
 843 	/* find out location and length of piece table from FIB */
 844 	gsf_input_seek (document_stream, 418, G_SEEK_SET);
 845 	gsf_input_read (document_stream, 4, tmp_buffer);
 846 	fcClx = read_32bit (tmp_buffer);
 847 	gsf_input_read (document_stream, 4, tmp_buffer);
 848 	lcbClx = read_32bit (tmp_buffer);
 849 
 850 	/* If we got an invalid or empty length of piece table, just return
 851 	 * as we cannot iterate over pieces */
 852 	if (lcbClx <= 0) {
 853 		g_object_unref (document_stream);
 854 		g_object_unref (table_stream);
 855 		return NULL;
 856 	}
 857 
 858 	/* copy the structure holding the piece table into the clx array. */
 859 	clx = g_malloc (lcbClx);
 860 	gsf_input_seek (table_stream, fcClx, G_SEEK_SET);
 861 	gsf_input_read (table_stream, lcbClx, clx);
 862 
 863 	/* find out piece table from clx and set piece_table -pointer to it */
 864 	i = 0;
 865 	lcb_piece_table = 0;
 866 
 867 	while (TRUE) {
 868 		if (clx[i] == 2) {
 869 			/* Nice, a proper structure with contents, no need to
 870 			 * iterate more. */
 871 			lcb_piece_table = read_32bit (clx + (i + 1));
 872 			piece_table = clx + i + 5;
 873 			piece_count = (lcb_piece_table - 4) / 12;
 874 			break;
 875 		} else if (clx[i] == 1) {
 876 			/* Oh, a PRC structure with properties of text, not
 877 			 * real text, so skip it */
 878 			guint16 GrpPrl_len;
 879 
 880 			GrpPrl_len = read_16bit (&clx[i+1]);
 881 			/* 3 is the length of clxt (1byte) and cbGrpprl(2bytes) */
 882 			i = i + 3 + GrpPrl_len;
 883 		} else {
 884 			break;
 885 		}
 886 	}
 887 
 888 	/* Iterate over pieces...
 889 	 *   Loop is halted whenever one of this conditions is met:
 890 	 *     a) Max bytes to be read reached
 891 	 *     b) No more pieces to read
 892 	 */
 893 	i = 0;
 894 	n_bytes_remaining = n_bytes;
 895 	while (n_bytes_remaining > 0 &&
 896 	       i < piece_count) {
 897 		guint8 *piece_descriptor;
 898 		gint piece_start;
 899 		gint piece_end;
 900 		gint piece_size;
 901 		gboolean is_ansi;
 902 
 903 		/* logical position of the text piece in the document_stream */
 904 		piece_start = read_32bit (piece_table + (i * 4));
 905 		piece_end = read_32bit (piece_table + ((i + 1) * 4));
 906 
 907 		/* descriptor of single piece from piece table */
 908 		piece_descriptor = piece_table + ((piece_count + 1) * 4) + (i * 8);
 909 
 910 		/* file character position */
 911 		fc = read_32bit (piece_descriptor + 2);
 912 
 913 		/* second bit is set to 1 if text is saved in ANSI encoding */
 914 		is_ansi = (fc & 0x40000000) == 0x40000000;
 915 
 916 		/* modify file character position according to text encoding */
 917 		if (!is_ansi) {
 918 			fc = (fc & 0xBFFFFFFF);
 919 		} else {
 920 			fc = (fc & 0xBFFFFFFF) >> 1;
 921 		}
 922 
 923 		piece_size  = piece_end - piece_start;
 924 
 925 		/* NOTE: Very very long pieces may appear. In fact, a single
 926 		 *  piece document seems to be quite normal. Thus, we limit
 927 		 *  here the number of bytes to read from the stream, based
 928 		 *  on the maximum number of bytes in UTF-8. Assuming, then
 929 		 *  that a safe limit is 2*n_bytes_remaining if UTF-16 input,
 930 		 *  and just n_bytes_remaining in CP1251 input */
 931 		piece_size = MIN (piece_size, n_bytes_remaining);
 932 
 933 		/* UTF-16 uses twice as many bytes as CP1252
 934 		 *  NOTE: Not quite sure about this. Some unicode points will be
 935 		 *  encoded using 4 bytes in UTF-16 */
 936 		if (!is_ansi) {
 937 			piece_size *= 2;
 938 		}
 939 
 940 		/* Avoid empty pieces */
 941 		if (piece_size >= 1) {
 942 
 943 			/* Re-allocate buffer to make it bigger if needed.
 944 			 *  This text buffer is re-used over and over in each
 945 			 *  iteration.  */
 946 			if (piece_size > text_buffer_size) {
 947 				text_buffer = g_realloc (text_buffer, piece_size);
 948 				text_buffer_size = piece_size;
 949 			}
 950 
 951 			/* read and parse single text piece from document_stream */
 952 			gsf_input_seek (document_stream, fc, G_SEEK_SET);
 953 			gsf_input_read (document_stream, piece_size, text_buffer);
 954 
 955 			msoffice_convert_and_normalize_chunk (text_buffer,
 956 			                                      piece_size,
 957 			                                      is_ansi,
 958 			                                      &n_bytes_remaining,
 959 			                                      &content);
 960 		}
 961 
 962 		/* Go on to next piece */
 963 		i++;
 964 	}
 965 
 966 	g_free (text_buffer);
 967 	g_object_unref (document_stream);
 968 	g_object_unref (table_stream);
 969 	g_free (clx);
 970 
 971 	return content ? g_string_free (content, FALSE) : NULL;
 972 }
 973 
 974 /* Reads and interprets the flags of a given string. May be
 975  *  used just to skip the fields, as when this bitmask-byte
 976  *  comes as the first byte of a new record.
 977  * NOTE: For a detailed meaning of each field parsed here,
 978  *  take a look at the XLUnicodeRichExtendedString format:
 979  *  http://msdn.microsoft.com/en-us/library/dd943830.aspx
 980  **/
 981 static void
 982 read_excel_string_flags (GsfInput *stream,
 983                          gboolean *p_is_high_byte,
 984                          guint16  *p_c_run,
 985                          guint16  *p_cb_ext_rst)
 986 {
 987 	guint8 tmp_buffer[4] = { 0 };
 988 	guint8 bit_mask;
 989 	gboolean is_ext_string;
 990 	gboolean is_rich_string;
 991 
 992 	/* Note that output arguments may be NULL if we don't need
 993 	 * their values... */
 994 
 995 	/* Reading 1 byte for mask */
 996 	gsf_input_read (stream, 1, tmp_buffer);
 997 	bit_mask = read_8bit (tmp_buffer);
 998 
 999 	/* Get flags */
1000 	if (p_is_high_byte) {
1001 		*p_is_high_byte = (bit_mask & 0x01) == 0x01;
1002 	}
1003 	is_ext_string = (bit_mask & 0x04) == 0x04;
1004 	is_rich_string = (bit_mask & 0x08) == 0x08;
1005 
1006 	/* If the c_run value is required as output, read it */
1007 	if (p_c_run) {
1008 		if (is_rich_string) {
1009 			/* Reading 2 Bytes */
1010 			gsf_input_read (stream, 2, tmp_buffer);
1011 
1012 			/* Reading cRun */
1013 			*p_c_run = read_16bit (tmp_buffer);
1014 		} else {
1015 			*p_c_run = 0;
1016 		}
1017 	} else if (is_rich_string) {
1018 		/* If not required, just skip those bytes */
1019 		gsf_input_seek (stream, 2, G_SEEK_CUR);
1020 	}
1021 
1022 	/* If the cb_ext_rst value is required as output, read it */
1023 	if (p_cb_ext_rst) {
1024 		if (is_ext_string) {
1025 			/* Reading 4 Bytes */
1026 			gsf_input_read (stream, 4, tmp_buffer);
1027 
1028 			/* Reading cRun */
1029 			*p_cb_ext_rst = read_16bit (tmp_buffer);
1030 		} else {
1031 			*p_cb_ext_rst = 0;
1032 		}
1033 	} else if (is_ext_string) {
1034 		/* If not required, just skip those bytes */
1035 		gsf_input_seek (stream, 4, G_SEEK_CUR);
1036 	}
1037 }
1038 
1039 /* Returns TRUE if record was changed. BUT, the value of the
1040  *  current_record should be checked by the caller to know
1041  *  if there are no more records */
1042 static gboolean
1043 change_excel_record_if_needed (GsfInput *stream,
1044                                GArray   *record_array,
1045                                guint    *p_current_record)
1046 {
1047 	ExcelExtendedStringRecord *record;
1048 
1049 	/* Get current record */
1050 	record = &g_array_index (record_array,
1051 	                         ExcelExtendedStringRecord,
1052 	                         *p_current_record);
1053 
1054 	/* We may already have surpassed the record, so adjust if so */
1055 	if (gsf_input_tell (stream) >= (record->offset + record->length)) {
1056 		/* Switch records and read from the second one... */
1057 		(*p_current_record)++;
1058 
1059 		if (*p_current_record < record_array->len) {
1060 			record = &g_array_index (record_array,
1061 			                         ExcelExtendedStringRecord,
1062 			                         *p_current_record);
1063 
1064 			gsf_input_seek (stream, record->offset, G_SEEK_SET);
1065 		}
1066 
1067 		return TRUE;
1068 	}
1069 
1070 	return FALSE;
1071 }
1072 
1073 /* Returns TRUE if correctly read
1074  *
1075  *  Note that p_current_record may get changed if the required
1076  *  bytes to read were split into two different records.
1077  */
1078 static gboolean
1079 read_excel_string (GsfInput *stream,
1080                    guint8   *buffer,
1081                    gsize     chunk_size,
1082                    GArray   *record_array,
1083                    guint    *p_current_record)
1084 {
1085 	ExcelExtendedStringRecord *record;
1086 	gsf_off_t current_position;
1087 	gsf_off_t current_record_end;
1088 
1089 	/* Record may have changed when we want to read the string contents
1090 	 *  This is a pretty special case, where the new CONTINUE record
1091 	 * shouldn't start with a bitmask */
1092 	if (change_excel_record_if_needed (stream, record_array, p_current_record) &&
1093 	    *p_current_record >= record_array->len) {
1094 		/* When reached max number of records, just return */
1095 		return FALSE;
1096 	}
1097 
1098 	/* Get current record */
1099 	record = &g_array_index (record_array,
1100 	                         ExcelExtendedStringRecord,
1101 	                         *p_current_record);
1102 
1103 	/* Compute current position in the stream and end of current record*/
1104 	current_position = gsf_input_tell (stream);
1105 	current_record_end = record->offset + record->length;
1106 
1107 	/* The best case is when the whole number of bytes to read are in the
1108 	 * current record, as no record switching is therefore needed */
1109 	if (current_position + chunk_size <= current_record_end) {
1110 		return gsf_input_read (stream, chunk_size, buffer) != NULL ? TRUE : FALSE;
1111 	} else if (current_record_end < current_position) {
1112 		/* Safety check, actually pretty important */
1113 		return FALSE;
1114 	} else {
1115 		/* Read the string in two chunks */
1116 		gsize chunk_size_first_record;
1117 		gsize chunk_size_second_record;
1118 
1119 		/* Compute how much to read in each record */
1120 		chunk_size_first_record = current_record_end - current_position;
1121 		chunk_size_second_record = chunk_size - chunk_size_first_record;
1122 
1123 		/* g_debug ("Current position:      %" GSF_OFF_T_FORMAT, current_position); */
1124 		/* g_debug ("Current record index:  %u", *p_current_record); */
1125 		/* g_debug ("Current record start:  %" GSF_OFF_T_FORMAT, record->offset); */
1126 		/* g_debug ("Current record length: %" G_GSIZE_FORMAT, record->length); */
1127 		/* g_debug ("Current record end:    %" GSF_OFF_T_FORMAT, current_record_end); */
1128 		/* g_debug ("Bytes to read:         %" G_GSIZE_FORMAT,   chunk_size); */
1129 		/* g_debug ("Bytes to read (1st):   %" G_GSIZE_FORMAT,   chunk_size_first_record); */
1130 		/* g_debug ("Bytes to read (2nd):   %" G_GSIZE_FORMAT,   chunk_size_second_record); */
1131 
1132 		/* Now, read from first record... */
1133 		if (gsf_input_read (stream,
1134 		                    chunk_size_first_record,
1135 		                    buffer)) {
1136 			/* Now switch records and read from the second one... */
1137 			(*p_current_record)++;
1138 
1139 			if (*p_current_record < record_array->len) {
1140 				record = &g_array_index (record_array,
1141 				                         ExcelExtendedStringRecord,
1142 				                         *p_current_record);
1143 
1144 				/* g_debug ("New record index:  %u", *p_current_record); */
1145 				/* g_debug ("New record start:  %" GSF_OFF_T_FORMAT, record->offset); */
1146 				/* g_debug ("New record length: %" G_GSIZE_FORMAT, record->length); */
1147 
1148 				/* Move stream pointer to the new location, beginning of next record */
1149 				gsf_input_seek (stream, record->offset, G_SEEK_SET);
1150 
1151 				/* Every CONTINUE records starts with a bitmask + optional fields that
1152 				 * should be skipped properly */
1153 				read_excel_string_flags (stream, NULL, NULL, NULL);
1154 
1155 				/* And finally, read the second part */
1156 				if (gsf_input_read (stream,
1157 				                    chunk_size_second_record,
1158 				                    &buffer[chunk_size_first_record])) {
1159 					/* All OK! */
1160 					return TRUE;
1161 				}
1162 			}
1163 		}
1164 
1165 		return FALSE;
1166 	}
1167 }
1168 
1169 
1170 
1171 /**
1172  * [MS-XLS] — v20090708
1173  * Excel Binary File Format (.xls) Structure Specification
1174  * Copyright © 2009 Microsoft Corporation.
1175  *  Release: Wednesday, July 8, 2009
1176  *
1177  * 2.5.293 XLUnicodeRichExtendedString
 * This structure specifies a Unicode string, which can contain
 * formatting information and phonetic string data.
 *
 * This structure's non-variable fields MUST be specified in the same
 * record. This structure's variable fields can be extended with
 * Continue records. A value from the table for fHighByte MUST be
 * specified in the first byte of the continue field of the Continue
 * record followed by the remaining portions of this structure's
 * variable fields.
1187  *                       1                   2                   3
1188  *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1189  *                            cch    A B C D reserved2 cRun (optional)
1190  *               ...                   cbExtRst (optional)
1191  *               ...                   rgb (variable)
1192  *               ...
1193  *                         rgRun (variable, optional)
1194  *               ...
1195  *                         ExtRst (variable, optional)
1196  *               ...
1197  * cch (2 bytes): An unsigned integer that specifies the count of
1198  * characters in the string.
1199  *
1200  * A - fHighByte (1 bit): A bit that specifies whether the characters
1201  * in rgb are double-byte characters. MUST be a value from the
1202  * following table:
1203  *
1204  *  Value  Meaning
1205  *  0x0    All the characters in the string have a high byte of 0x00
1206  *         and only the low bytes are in rgb.
1207  *  0x1    All the characters in the string are saved as double-byte
1208  *         characters in rgb.
1209  * B - reserved1 (1 bit): MUST be zero, and MUST be ignored.
1210  * C - fExtSt (1 bit): A bit that specifies whether the string
1211  *     contains phonetic string data.
1212  * D - fRichSt (1 bit): A bit that specifies whether the string is a
1213  *     rich string and the string has at least two character formats
1214  *     applied.
1215  *
1216  * reserved2 (4 bits): MUST be zero, and MUST be ignored.
1217  *
1218  * cRun (2 bytes): An optional unsigned integer that specifies the
1219  * number of elements in rgRun. MUST exist if and only if fRichSt is
1220  * 0x1.
1221  *
1222  * cbExtRst (4 bytes): An optional signed integer that specifies the
1223  * byte count of ExtRst. MUST exist if and only if fExtSt is 0x1. MUST
1224  * be zero or greater.
1225  *
1226  * rgb (variable): An array of bytes that specifies the characters in
1227  * the string. If fHighByte is 0x0, the size of the array is cch. If
1228  * fHighByte is 0x1, the size of the array is cch*2. If fHighByte is
1229  * 0x1 and rgb is extended with a Continue record the break MUST occur
1230  * at the double-byte character boundary.
1231  *
1232  * rgRun (variable): An optional array of FormatRun structures that
1233  * specifies the formatting for each text run. The number of elements
1234  * in the array is cRun. MUST exist if and only if fRichSt is 0x1.
1235  *
1236  * ExtRst (variable): An optional ExtRst that specifies the phonetic
1237  * string data. The size of this field is cbExtRst. MUST exist if and
1238  * only if fExtSt is 0x1.
1239  */
1240 static void
1241 xls_get_extended_record_string (GsfInput  *stream,
1242                                 GArray    *list,
1243                                 gsize     *p_bytes_remaining,
1244                                 GString  **p_content)
1245 {
1246 	ExcelExtendedStringRecord *record;
1247 	guint32 cst_unique;
1248 	guint parsing_record = 0;
1249 	guint8 tmp_buffer[4] = { 0 };
1250 	guint i;
1251 	guint8 *buffer = NULL;
1252 	gsize buffer_size = 0;
1253 
1254 	/* Parsing the record from the list */
1255 	record = &g_array_index (list, ExcelExtendedStringRecord, parsing_record);
1256 
1257 	/* First record parsing */
1258 	if (gsf_input_seek (stream, record->offset, G_SEEK_SET)) {
1259 		return;
1260 	}
1261 
1262 	/* Note: The first record is ALWAYS the SST, so coming with cst_total and
1263 	 * cst_unique values.
1264 	 * Some extra background: Records with data longer than 8,224 bytes MUST be
1265 	 * split into several records, so in this case, if the SST record is big
1266 	 * enough, it will have one or more CONTINUE records
1267 	 *
1268 	 * SST record: http://msdn.microsoft.com/en-us/library/dd773037%28v=office.12%29.aspx
1269 	 * CONTINUE record: http://msdn.microsoft.com/en-us/library/dd949081%28v=office.12%29.aspx
1270 	 **/
1271 
1272 	/* Reading cst total */
1273 	gsf_input_read (stream, 4, tmp_buffer);
1274 	read_32bit (tmp_buffer);
1275 
1276 	/* Reading cst unique */
1277 	gsf_input_read (stream, 4, tmp_buffer);
1278 	cst_unique = read_32bit (tmp_buffer);
1279 
1280 	/* Iterate over chunks...
1281 	 *   Loop is halted whenever one of this conditions is met:
1282 	 *     a) Max bytes to be read reached
1283 	 *     b) No more chunks to read
1284 	 */
1285 	i = 0;
1286 	while (*p_bytes_remaining > 0 &&
1287 	       i < cst_unique) {
1288 		guint16 cch;
1289 		guint16 c_run;
1290 		guint16 cb_ext_rst;
1291 		gboolean is_high_byte;
1292 		gsize chunk_size;
1293 
1294 		/* RECORD may have been changed here */
1295 		if (change_excel_record_if_needed (stream, list, &parsing_record) &&
1296 		    parsing_record >= list->len) {
1297 			/* When reached max number of records, stop loop */
1298 			break;
1299 		}
1300 
1301 		/* Reading 2 bytes for cch */
1302 		gsf_input_read (stream, 2, tmp_buffer);
1303 
1304 		/* Reading cch - char count of current string */
1305 		cch = read_16bit (tmp_buffer);
1306 
1307 		/* Read string flags */
1308 		read_excel_string_flags (stream,
1309 		                         &is_high_byte,
1310 		                         &c_run,
1311 		                         &cb_ext_rst);
1312 
1313 		/* RECORD may have been changed here, but it is managed when reading the
1314 		 *  string contents */
1315 
1316 
1317 		/* NOTE: In order to avoid reading unnecessary bytes, limit it based
1318 		 * on the number of bytes remaining */
1319 		chunk_size = MIN (cch, *p_bytes_remaining);
1320 
1321 		/* If High Byte, chunk size *2 as stream is in UTF-16 */
1322 		if (is_high_byte) {
1323 			chunk_size *= 2;
1324 		}
1325 
1326 		/* If the new chunk size is longer than our reused buffer,
1327 		 * make the buffer bigger */
1328 		if (chunk_size > buffer_size) {
1329 			buffer = g_realloc (buffer, chunk_size);
1330 			buffer_size = chunk_size;
1331 		}
1332 
1333 		/* Read the chunk! NOTE that it may be split in several records... */
1334 		if (!read_excel_string (stream, buffer, chunk_size, list, &parsing_record)) {
1335 			break;
1336 		}
1337 
1338 		/* Read whole stream in one operation */
1339 		msoffice_convert_and_normalize_chunk (buffer,
1340 		                                      chunk_size,
1341 		                                      !is_high_byte,
1342 		                                      p_bytes_remaining,
1343 		                                      p_content);
1344 
1345 		/* Formatting string */
1346 		if (c_run > 0) {
1347 			/* rgRun (variable): An optional array of
1348 			 * FormatRun structures that specifies the
1349 			 * formatting for each ext run. The number of
1350 			 * elements in the array is cRun. MUST exist
1351 			 * if and only if fRichSt is 0x1.
1352 			 *
1353 			 * Note: As defined in MSDN, a FormatRun structure has a size
1354 			 *  of 4 bytes, so the size of this rgRun variable is really
1355 			 *  (4*cRun) bytes.
1356 			 *  http://msdn.microsoft.com/en-us/library/dd921712.aspx
1357 			 *
1358 			 * Skiping this as it will not be useful in
1359 			 * our case.
1360 			 */
1361 			gsf_input_seek (stream, 4 * c_run, G_SEEK_CUR);
1362 			/* Note that we may be now out of the current record after having
1363 			 * done this seek operation. */
1364 		}
1365 
1366 		/* ExtString */
1367 		if (cb_ext_rst > 0) {
1368 			/* Again its not so clear may be it will not
1369 			 * useful in our case.
1370 			 */
1371 			gsf_input_seek (stream, cb_ext_rst, G_SEEK_CUR);
1372 			/* Note that we may be now out of the current record after having
1373 			 * done this seek operation. */
1374 		}
1375 
1376 		/* Go to next chunk */
1377 		i++;
1378 	}
1379 }
1380 
1381 /**
1382  * @brief Extract excel content from specified infile
1383  * @param infile file to read summary from
1384  * @param n_words number of max words to extract
1385  * @param n_bytes max number of bytes to extract
1386  * @param is_encrypted
1387  * @Notes :- About SST record
1388  *
1389  * This record specifies string constants.
1390  * [MS-XLS] — v20090708
1391  * Excel Binary File Format (.xls) Structure Specification
1392  * Copyright © 2009 Microsoft Corporation.
1393  * Release: Wednesday, July 8, 2009
1394  *
1395  * Each string constant in this record has one or more references in
1396  * the workbook, with the goal of improving performance in opening and
1397  * saving the file. The LabelSst record specifies how to make a
1398  * reference to a string in this record.
1399  *                     1                   2                   3
1400  * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1401  *                           cstTotal
1402  *                           cstUnique
1403  *                           rgb (variable)
1404  *                           ...
1405  * cstTotal (4 bytes): A signed integer that specifies the total
1406  * number of references in the workbook to the strings in the shared
1407  * string table. MUST be greater than or equal to 0.
1408  *
1409  * cstUnique (4 bytes): A signed integer that specifies the number of
1410  * unique strings in the shared string table. MUST be greater than or
1411  * equal to 0.
1412  *
1413  * rgb (variable): An array of XLUnicodeRichExtendedString structures.
1414  * Records in this array are unique.
1415  */
1416 static gchar*
1417 extract_excel_content (GsfInfile *infile,
1418                        gsize      n_bytes,
1419                        gboolean  *is_encrypted)
1420 {
1421 	ExcelBiffHeader header1;
1422 	GString *content = NULL;
1423 	GsfInput *stream;
1424 	guint saved_offset;
1425 	gsize n_bytes_remaining = n_bytes;
1426 
1427 	/* If no content requested, return */
1428 	if (n_bytes == 0) {
1429 		return NULL;
1430 	}
1431 
1432 	stream = gsf_infile_child_by_name (infile, "Workbook");
1433 
1434 	if (!stream) {
1435 		return NULL;
1436 	}
1437 
1438 	/* Read until we reach eof or any of our limits reached */
1439 	while (n_bytes_remaining > 0 &&
1440 	       !gsf_input_eof (stream)) {
1441 		guint8 tmp_buffer[4] = { 0 };
1442 
1443 		/* Reading 4 bytes to read header */
1444 		gsf_input_read (stream, 4, tmp_buffer);
1445 		header1.id = read_16bit (tmp_buffer);
1446 		header1.length = read_16bit (tmp_buffer + 2);
1447 
1448 		/* g_debug ("id: %d , length %d", header.id, header.length); */
1449 
1450 		/* We are interested only in SST record */
1451 		if (header1.id == RECORD_TYPE_SST) {
1452 			ExcelExtendedStringRecord record;
1453 			ExcelBiffHeader header2;
1454 			GArray *list;
1455 			guint length = 0;
1456 
1457 			/* Saving length and offset so that will
1458 			 * return to saved once we are done!!
1459 			 */
1460 			length = header1.length;
1461 			saved_offset = gsf_input_tell (stream);
1462 
1463 			/* Saving ExtendendString Record offset and
1464 			 * length.
1465 			 */
1466 			record.offset = gsf_input_tell (stream);
1467 			record.length = length;
1468 
1469 			/* g_debug ("record.offset: %u record.length:%d",  */
1470 			/*           record.offset, record.length); */
1471 
1472 			/* Allocation new array of ExtendendString Record */
1473 			list = g_array_new (TRUE, TRUE, sizeof (ExcelExtendedStringRecord));
1474 
1475 			if (!list) {
1476 				break;
1477 			}
1478 
1479 			g_array_append_val (list, record);
1480 
1481 			/* Reading to parse continue record.
1482 			 *
1483 			 * Note: we are justing parsing notrequired
1484 			 * to read data so passing null data
1485 			 */
1486 			gsf_input_seek (stream, length, G_SEEK_CUR);
1487 
1488 			/* Reading & Assigning biff header 4 bytes */
1489 			gsf_input_read (stream, 4, tmp_buffer);
1490 
1491 			header2.id = read_16bit (tmp_buffer);
1492 			header2.length = read_16bit (tmp_buffer + 2);
1493 
1494 			/* g_debug ("bf id :%d length:%d", header2.id, header2.length); */
1495 			/* g_debug ("offset: %u", (guint) gsf_input_tell (stream)); */
1496 
1497 			while (header2.id == RECORD_TYPE_CONTINUE) {
1498 				/* Assigning to linkedlist we will use
1499 				 * it to read data
1500 				 */
1501 				record.offset = gsf_input_tell (stream);
1502 				record.length = header2.length;
1503 				g_array_append_val (list, record);
1504 
1505 				/* g_debug ("record.offset: %u record.length:%d", */
1506 				/*           record.offset, record.length); */
1507 
1508 				/* Then parse the data from the stream */
1509 				gsf_input_seek (stream, header2.length, G_SEEK_CUR);
1510 
1511 				/* Reading and assigning biff header */
1512 				gsf_input_read (stream, 4, tmp_buffer);
1513 				header2.id = read_16bit (tmp_buffer);
1514 				header2.length = read_16bit (tmp_buffer + 2);
1515 
1516 				/* g_debug ("bf id :%d length:%d", header2.id, header2.length); */
1517 			};
1518 
1519 			/* Read extended string */
1520 			xls_get_extended_record_string (stream,
1521 			                                list,
1522 			                                &n_bytes_remaining,
1523 			                                &content);
1524 
1525 			g_array_unref (list);
1526 
1527 			/* Restoring the old_offset */
1528 			gsf_input_seek (stream, saved_offset, G_SEEK_SET);
1529 			break;
1530 		}
1531 
1532 		/* Moving stream pointer to record length */
1533 		if (gsf_input_seek (stream, header1.length, G_SEEK_CUR)) {
1534 			break;
1535 		}
1536 	}
1537 
1538 	g_object_unref (stream);
1539 
1540 	g_debug ("Bytes extracted: %" G_GSIZE_FORMAT,
1541 	         n_bytes - n_bytes_remaining);
1542 
1543 	return content ? g_string_free (content, FALSE) : NULL;
1544 }
1545 
1546 /**
1547  * @brief Extract summary OLE stream from specified uri
1548  * @param metadata where to store summary
1549  * @param infile file to read summary from
1550  * @param uri uri of the file
1551  */
1552 static gboolean
1553 extract_summary (TrackerSparqlBuilder *metadata,
1554                  GsfInfile            *infile,
1555                  const gchar          *uri)
1556 {
1557 	GsfInput *stream;
1558 
1559 	tracker_sparql_builder_predicate (metadata, "a");
1560 	tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
1561 
1562 	stream = gsf_infile_child_by_name (infile, "\05SummaryInformation");
1563 
1564 	if (stream) {
1565 		GsfDocMetaData *md;
1566 		MetadataInfo info;
1567 		GError *error = NULL;
1568 
1569 		md = gsf_doc_meta_data_new ();
1570 		error = gsf_doc_meta_data_read_from_msole (md, stream);
1571 
1572 		if (error) {
1573 			g_warning ("Could not extract summary information, %s",
1574 			           error->message ? error->message : "no error given");
1575 
1576 			g_error_free (error);
1577 			g_object_unref (md);
1578 			g_object_unref (stream);
1579 			gsf_shutdown ();
1580 
1581 			return FALSE;
1582 		}
1583 
1584 		info.metadata = metadata;
1585 		info.uri = uri;
1586 
1587 		gsf_doc_meta_data_foreach (md, summary_metadata_cb, &info);
1588 
1589 		g_object_unref (md);
1590 		g_object_unref (stream);
1591 	}
1592 
1593 	stream = gsf_infile_child_by_name (infile, "\05DocumentSummaryInformation");
1594 
1595 	if (stream) {
1596 		GsfDocMetaData *md;
1597 		MetadataInfo info;
1598 		GError *error = NULL;
1599 
1600 		md = gsf_doc_meta_data_new ();
1601 
1602 		error = gsf_doc_meta_data_read_from_msole (md, stream);
1603 		if (error) {
1604 			g_warning ("Could not extract document summary information, %s",
1605 			           error->message ? error->message : "no error given");
1606 
1607 			g_error_free (error);
1608 			g_object_unref (md);
1609 			g_object_unref (stream);
1610 			gsf_shutdown ();
1611 
1612 			return FALSE;
1613 		}
1614 
1615 		info.metadata = metadata;
1616 		info.uri = uri;
1617 
1618 		gsf_doc_meta_data_foreach (md, document_metadata_cb, &info);
1619 
1620 		g_object_unref (md);
1621 		g_object_unref (stream);
1622 	}
1623 
1624 	return TRUE;
1625 }
1626 
1627 /**
1628  * @brief Extract data from generic office files
1629  *
1630  * At the moment only extracts document summary from summary OLE stream.
1631  * @param uri URI of the file to extract data
1632  * @param metadata where to store extracted data to
1633  */
1634 G_MODULE_EXPORT gboolean
1635 tracker_extract_get_metadata (TrackerExtractInfo *info)
1636 {
1637 	TrackerSparqlBuilder *metadata;
1638 	TrackerConfig *config;
1639 	GsfInfile *infile = NULL;
1640 	gchar *content = NULL, *uri;
1641 	gboolean is_encrypted = FALSE;
1642 	const gchar *mime_used;
1643 	gsize max_bytes;
1644 	GFile *file;
1645 	gchar *filename;
1646 	FILE *mfile;
1647 
1648 	gsf_init ();
1649 
1650 	metadata = tracker_extract_info_get_metadata_builder (info);
1651 	mime_used = tracker_extract_info_get_mimetype (info);
1652 
1653 	file = tracker_extract_info_get_file (info);
1654 	uri = g_file_get_uri (file);
1655 
1656 	filename = g_filename_from_uri (uri, NULL, NULL);
1657 
1658 	mfile = tracker_file_open (filename);
1659 	g_free (filename);
1660 
1661 	if (!mfile) {
1662 		g_warning ("Can't open file from uri '%s': %s",
1663 		           uri, g_strerror (errno));
1664 		g_free (uri);
1665 		return FALSE;
1666 	}
1667 
1668 	infile = open_file (uri, mfile);
1669 	if (!infile) {
1670 		gsf_shutdown ();
1671 		g_free (uri);
1672 		if (mfile) {
1673 			tracker_file_close (mfile, FALSE);
1674 		}
1675 		return FALSE;
1676 	}
1677 
1678 	/* Extracting summary */
1679 	extract_summary (metadata, infile, uri);
1680 
1681 	/* Set max bytes to read from content */
1682 	config = tracker_main_get_config ();
1683 	max_bytes = tracker_config_get_max_bytes (config);
1684 
1685 	if (g_ascii_strcasecmp (mime_used, "application/msword") == 0) {
1686 		/* Word file */
1687 		content = extract_msword_content (infile, max_bytes, &is_encrypted);
1688 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-powerpoint") == 0) {
1689 		/* PowerPoint file */
1690 		tracker_sparql_builder_predicate (metadata, "a");
1691 		tracker_sparql_builder_object (metadata, "nfo:Presentation");
1692 
1693 		content = extract_powerpoint_content (infile, max_bytes, &is_encrypted);
1694 	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-excel") == 0) {
1695 		/* Excel File */
1696 		tracker_sparql_builder_predicate (metadata, "a");
1697 		tracker_sparql_builder_object (metadata, "nfo:Spreadsheet");
1698 
1699 		content = extract_excel_content (infile, max_bytes, &is_encrypted);
1700 	} else {
1701 		g_message ("Mime type was not recognised:'%s'", mime_used);
1702 	}
1703 
1704 	if (content) {
1705 		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
1706 		tracker_sparql_builder_object_unvalidated (metadata, content);
1707 		g_free (content);
1708 	}
1709 
1710 	if (is_encrypted) {
1711 		tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
1712 		tracker_sparql_builder_object_boolean (metadata, TRUE);
1713 	}
1714 
1715 	g_object_unref (infile);
1716 	g_free (uri);
1717 	gsf_shutdown ();
1718 	if (mfile) {
1719 		tracker_file_close (mfile, FALSE);
1720 	}
1721 
1722 	return TRUE;
1723 }
tracker-0.16.2/src/tracker-extract/tracker-extract-msoffice.c

Incomplete coverage