1 /*
2 * Copyright (C) 2006, Edward Duffy <eduffy@gmail.com>
3 * Copyright (C) 2006, Laurent Aguerreche <laurent.aguerreche@free.fr>
4 * Copyright (C) 2008, Nokia <ivan.frade@nokia.com>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 */
21
22 #include "config.h"
23
24 #include <errno.h>
25 #include <string.h>
26
27 #include <glib.h>
28
29 #include <gsf/gsf.h>
30 #include <gsf/gsf-doc-meta-data.h>
31 #include <gsf/gsf-infile.h>
32 #include <gsf/gsf-infile-msole.h>
33 #include <gsf/gsf-input-stdio.h>
34 #include <gsf/gsf-msole-utils.h>
35 #include <gsf/gsf-utils.h>
36 #include <gsf/gsf-infile-zip.h>
37
38 #include <libtracker-common/tracker-utils.h>
39 #include <libtracker-common/tracker-file-utils.h>
40 #include <libtracker-common/tracker-os-dependant.h>
41
42 #include <libtracker-extract/tracker-extract.h>
43
44 #include "tracker-main.h"
45 #include "tracker-gsf.h"
46
/* PowerPoint files are made up of structures. Each structure starts
 * with a header, and that header holds a record type which specifies
 * what kind of structure it is.
 *
 * Here are some record types, with a description of the structure
 * (called an atom) they contain.
 */
54
/* TextBytesAtom: text stored as one byte per character. Each byte is the
 * low byte of a UTF-16 character whose high byte is always 0.
 * http://msdn.microsoft.com/en-us/library/dd947905%28v=office.12%29.aspx
 */
#define TEXTBYTESATOM_RECORD_TYPE      0x0FA8

/* TextCharsAtom: text stored as full Unicode characters.
 * http://msdn.microsoft.com/en-us/library/dd772921%28v=office.12%29.aspx
 */
#define TEXTCHARSATOM_RECORD_TYPE      0x0FA0

/* DocumentContainer: container record holding information about the
 * PowerPoint document.
 */
#define DOCUMENTCONTAINER_RECORD_TYPE  0x03E8

/* Record type shared by several "list with text" containers. For text
 * extraction we are interested in the SlideListWithTextContainer, which
 * holds the textual content of the slide(s).
 */
#define SLIDELISTWITHTEXT_RECORD_TYPE  0x0FF0
76
77 /**
78 * @brief Header for all powerpoint structures
79 *
80 * A structure at the beginning of each container record and each atom record in
81 * the file. The values in the record header and the context of the record are
82 * used to identify and interpret the record data that follows.
83 */
84 typedef struct {
85 /**
86 * @brief An unsigned integer that specifies the version of the record
87 * data that follows the record header. A value of 0xF specifies that the
88 * record is a container record.
89 */
90 guint recVer;
91
92 /**
93 * @brief An unsigned integer that specifies the record instance data.
94 * Interpretation of the value is dependent on the particular record
95 * type.
96 */
97 guint recInstance;
98
99 /**
100 * @brief A RecordType enumeration that specifies the type of the record
101 * data that follows the record header.
102 */
103 gint recType;
104
105 /**
106 * @brief An unsigned integer that specifies the length, in bytes, of the
107 * record data that follows the record header.
108 */
109 guint recLen;
110 } PowerPointRecordHeader;
111
/* Excel record type identifiers needed to read the shared strings.
 * Listed in ascending numeric order; decimal value given on the right.
 */
typedef enum {
	RECORD_TYPE_EOF      = 0x000A, /* 10 */
	RECORD_TYPE_CONTINUE = 0x003C, /* 60 */
	RECORD_TYPE_SST      = 0x00FC  /* 252 */
} ExcelRecordType;
118
119 /* ExcelBiffHeader to read excel spec header */
120 typedef struct {
121 ExcelRecordType id;
122 guint length;
123 } ExcelBiffHeader;
124
125 /* ExtendendString Record offset in stream and length */
126 typedef struct {
127 gsf_off_t offset; /* 64 bits!! */
128 gsize length;
129 } ExcelExtendedStringRecord;
130
131 typedef struct {
132 TrackerSparqlBuilder *metadata;
133 const gchar *uri;
134 } MetadataInfo;
135
/* TRUE if the four given characters form an escaped octal triplet,
 * i.e. a backslash followed by three octal digits.
 * Valid range from \000 to \377 (0 to 255), hence the first digit is
 * limited to 0..3.
 *
 * NOTE: arguments are parenthesized so that arbitrary expressions can
 * be passed, but they are still evaluated more than once; do not pass
 * expressions with side effects.
 */
#define octal_ascii_triplet_is_valid(slash, a2, a1, a0) \
	((slash) == '\\' && \
	 (a2) >= '0' && (a2) <= '3' && \
	 (a1) >= '0' && (a1) <= '7' && \
	 (a0) >= '0' && (a0) <= '7')

/* Numeric value of the three octal ASCII digits a2 a1 a0, where a2 is
 * the most significant one.
 */
#define octal_ascii_triplet_to_decimal_int(a2, a1, a0) \
	(((a0) - '0') + 8 * (((a1) - '0') + 8 * ((a2) - '0')))
145
146 /*
147 * So, we may get input strings with UTF-8 characters encoded in OCTAL and
148 * represented in ASCII, like this:
149 * K\303\230BENHAVNS UNIVERSITET
150 * which is equivalent to:
151 * KØBENHAVNS UNIVERSITET
152 */
153 static void
154 msoffice_string_process_octal_triplets (guchar *str)
155 {
156 guint i = 0; /* index in original string */
157 guint j = 0; /* index in processed string */
158 guint length = strlen (str);
pointer targets in passing argument 1 of 'strlen' differ in signedness
(emitted by gcc)
159
160 /* Changing the string IN PLACE, note that j<=i ALWAYS! */
161 while (i < length) {
162 if (length - i >= 4 &&
163 octal_ascii_triplet_is_valid (str[i], str[i+1], str[i+2], str[i+3])) {
164 /* Found a new octal triplet */
165 str[j] = octal_ascii_triplet_to_decimal_int (str[i+1], str[i+2], str[i+3]);
166 i += 4;
167 } else if (i != j) {
168 /* We previously found an octal triplet,
169 * we need to update the string */
170 str[j] = str[i];
171 i++;
172 } else {
173 /* No need to update the string yet */
174 i++;
175 }
176 j++;
177 }
178 /* New end of string */
179 str[j]='\0';
180 }
181
182 static void
183 metadata_add_gvalue (TrackerSparqlBuilder *metadata,
184 const gchar *uri,
185 const gchar *key,
186 GValue const *val,
187 const gchar *type,
188 const gchar *predicate,
189 gboolean is_date)
190 {
191 gchar *s;
192
193 g_return_if_fail (metadata != NULL);
194 g_return_if_fail (key != NULL);
195
196 if (!val) {
197 return;
198 }
199
200 s = g_strdup_value_contents (val);
201
202 if (!s) {
203 return;
204 }
205
206 if (!tracker_is_empty_string (s)) {
207 gchar *str_val;
208
209 /* Some fun: strings are always written "str" with double quotes
210 * around, but not numbers!
211 */
212 if (s[0] == '"') {
213 size_t len;
214
215 len = strlen (s);
216
217 if (s[len - 1] == '"') {
218 if (is_date) {
219 if (len > 2) {
220 gchar *str = g_strndup (s + 1, len - 2);
221 str_val = tracker_date_guess (str);
222 g_free (str);
223 } else {
224 str_val = NULL;
225 }
226 } else {
227 str_val = len > 2 ? g_strndup (s + 1, len - 2) : NULL;
228 }
229 } else {
230 /* We have a string that begins with a double
231 * quote but which finishes by something
232 * different... We copy the string from the
233 * beginning.
234 */
235 if (is_date) {
236 str_val = tracker_date_guess (s);
237 } else {
238 str_val = g_strdup (s);
239 }
240 }
241 } else {
242 /* Here, we probably have a number */
243 if (is_date) {
244 str_val = tracker_date_guess (s);
245 } else {
246 str_val = g_strdup (s);
247 }
248 }
249
250 if (str_val) {
251 /* Process (in place) octal triplets if found */
252 msoffice_string_process_octal_triplets (str_val);
pointer targets in passing argument 1 of 'msoffice_string_process_octal_triplets' differ in signedness
(emitted by gcc)
253
254 if (type && predicate) {
255 tracker_sparql_builder_predicate (metadata, key);
256
257 tracker_sparql_builder_object_blank_open (metadata);
258 tracker_sparql_builder_predicate (metadata, "a");
259 tracker_sparql_builder_object (metadata, type);
260
261 tracker_sparql_builder_predicate (metadata, predicate);
262 tracker_sparql_builder_object_unvalidated (metadata, str_val);
263 tracker_sparql_builder_object_blank_close (metadata);
264 } else {
265 tracker_sparql_builder_predicate (metadata, key);
266 tracker_sparql_builder_object_unvalidated (metadata, str_val);
267 }
268
269 g_free (str_val);
270 }
271 }
272
273 g_free (s);
274 }
275
276 static void
277 summary_metadata_cb (gpointer key,
278 gpointer value,
279 gpointer user_data)
280 {
281 MetadataInfo *info = user_data;
282 GValue const *val;
283
284 val = gsf_doc_prop_get_val (value);
285
286 if (g_strcmp0 (key, "dc:title") == 0) {
287 metadata_add_gvalue (info->metadata, info->uri, "nie:title", val, NULL, NULL, FALSE);
288 } else if (g_strcmp0 (key, "dc:subject") == 0) {
289 metadata_add_gvalue (info->metadata, info->uri, "nie:subject", val, NULL, NULL, FALSE);
290 } else if (g_strcmp0 (key, "dc:creator") == 0) {
291 metadata_add_gvalue (info->metadata, info->uri, "nco:creator", val, "nco:Contact", "nco:fullname", FALSE);
292 } else if (g_strcmp0 (key, "dc:keywords") == 0) {
293 gchar *keywords = g_strdup_value_contents (val);
294 gchar *lasts, *keyw;
295 size_t len;
296
297 keyw = keywords;
298 keywords = strchr (keywords, '"');
299
300 if (keywords) {
301 keywords++;
302 } else {
303 keywords = keyw;
304 }
305
306 len = strlen (keywords);
307 if (keywords[len - 1] == '"') {
308 keywords[len - 1] = '\0';
309 }
310
311 for (keyw = strtok_r (keywords, ",; ", &lasts); keyw;
312 keyw = strtok_r (NULL, ",; ", &lasts)) {
313 tracker_sparql_builder_predicate (info->metadata, "nie:keyword");
314 tracker_sparql_builder_object_unvalidated (info->metadata, keyw);
315 }
316
317 g_free (keyw);
318 } else if (g_strcmp0 (key, "dc:description") == 0) {
319 metadata_add_gvalue (info->metadata, info->uri, "nie:comment", val, NULL, NULL, FALSE);
320 } else if (g_strcmp0 (key, "gsf:page-count") == 0) {
321 metadata_add_gvalue (info->metadata, info->uri, "nfo:pageCount", val, NULL, NULL, FALSE);
322 } else if (g_strcmp0 (key, "gsf:word-count") == 0) {
323 metadata_add_gvalue (info->metadata, info->uri, "nfo:wordCount", val, NULL, NULL, FALSE);
324 } else if (g_strcmp0 (key, "meta:creation-date") == 0) {
325 metadata_add_gvalue (info->metadata, info->uri, "nie:contentCreated", val, NULL, NULL, TRUE);
326 } else if (g_strcmp0 (key, "meta:generator") == 0) {
327 metadata_add_gvalue (info->metadata, info->uri, "nie:generator", val, NULL, NULL, FALSE);
328 }
329 }
330
331 static void
332 document_metadata_cb (gpointer key,
333 gpointer value,
334 gpointer user_data)
335 {
336 if (g_strcmp0 (key, "CreativeCommons_LicenseURL") == 0) {
337 MetadataInfo *info = user_data;
338
339 metadata_add_gvalue (info->metadata,
340 info->uri,
341 "nie:license",
342 gsf_doc_prop_get_val (value),
343 NULL,
344 NULL,
345 FALSE);
346 }
347 }
348
349 /**
350 * @brief Read 8 bit unsigned integer
351 * @param buffer data to read integer from
352 * @return 16 bit unsigned integer
353 */
354 static guint
355 read_8bit (const guint8 *buffer)
356 {
357 return buffer[0];
358 }
359
360 /**
361 * @brief Read 16 bit unsigned integer
362 * @param buffer data to read integer from
363 * @return 16 bit unsigned integer
364 */
365 static guint16
366 read_16bit (const guint8 *buffer)
367 {
368 return buffer[0] + (buffer[1] << 8);
369 }
370
371 /**
372 * @brief Read 32 bit unsigned integer
373 * @param buffer data to read integer from
374 * @return 32 bit unsigned integer
375 */
376 static guint32
377 read_32bit (const guint8 *buffer)
378 {
379 return buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
380 }
381
382 /**
383 * @brief Common conversion and normalization method for all msoffice type
384 * documents.
385 * @param buffer Input buffer with the string contents
386 * @param chunk_size Number of valid bytes in the input buffer
387 * @param is_ansi If %TRUE, input text should be encoded in CP1252, and
388 * in UTF-16 otherwise.
389 * @param p_bytes_remaining Pointer to #gsize specifying how many bytes
390 * should still be considered.
391 * @param p_content Pointer to a #GString where the output normalized words
392 * will be appended.
393 */
394 static void
395 msoffice_convert_and_normalize_chunk (guint8 *buffer,
396 gsize chunk_size,
397 gboolean is_ansi,
398 gsize *bytes_remaining,
399 GString **content)
400 {
401 gsize n_bytes_utf8;
402 gchar *converted_text;
403 GError *error = NULL;
404
405 g_return_if_fail (buffer != NULL);
406 g_return_if_fail (chunk_size > 0);
407 g_return_if_fail (bytes_remaining != NULL);
408 g_return_if_fail (content != NULL);
409
410 /* chunks can have different encoding
411 *
412 * TODO: Using g_iconv, this extra heap allocation could be
413 * avoided, re-using over and over again the same output buffer
414 * for the UTF-8 encoded string
415 */
416 converted_text = g_convert (buffer,
417 chunk_size,
418 "UTF-8",
419 is_ansi ? "CP1252" : "UTF-16",
420 NULL,
421 &n_bytes_utf8,
422 &error);
pointer targets in passing argument 1 of 'g_convert' differ in signedness
(emitted by gcc)
423
424 if (converted_text) {
425 gsize len_to_validate;
426
427 len_to_validate = MIN (*bytes_remaining, n_bytes_utf8);
428
429 if (tracker_text_validate_utf8 (converted_text,
430 len_to_validate,
431 content,
432 NULL)) {
433 /* A whitespace is added to separate next strings appended */
434 g_string_append_c (*content, ' ');
435 }
436
437 /* Update accumulated UTF-8 bytes read */
438 *bytes_remaining -= len_to_validate;
439 g_free (converted_text);
440 } else {
441 g_warning ("Couldn't convert %" G_GSIZE_FORMAT " bytes from %s to UTF-8: %s",
442 chunk_size,
443 is_ansi ? "CP1252" : "UTF-16",
444 error ? error->message : "no error given");
445 }
446
447 /* Note that error may be set even if some converted text is
448 * available, due to G_CONVERT_ERROR_ILLEGAL_SEQUENCE for example */
449 g_clear_error (&error);
450 }
451
452 /**
453 * @brief Read header data from given stream
454 * @param stream Stream to read header data
455 * @param header Pointer to header where to store results
456 */
457 static gboolean
458 ppt_read_header (GsfInput *stream,
459 PowerPointRecordHeader *header)
460 {
461 guint8 buffer[8] = {0};
462
463 g_return_val_if_fail (stream, FALSE);
464 g_return_val_if_fail (header, FALSE);
465 g_return_val_if_fail (!gsf_input_eof (stream), FALSE);
466
467
468 /* Header is always 8 bytes, read it */
469 g_return_val_if_fail (gsf_input_read (stream, 8, buffer), FALSE);
470
471 /* Then parse individual details
472 *
473 * Record header is 8 bytes long. Data is split as follows:
474 * recVer (4 bits)
475 * recInstance (12 bits)
476 * recType (2 bytes)
477 * recLen (4 bytes)
478 *
479 * See RecordHeader for more detailed explanation of each field.
480 *
481 * Here we parse each of those fields.
482 */
483
484 header->recType = read_16bit (&buffer[2]);
485 header->recLen = read_32bit (&buffer[4]);
486 header->recVer = (read_16bit (buffer) & 0xF000) >> 12;
487 header->recInstance = read_16bit (buffer) & 0x0FFF;
488
489 return TRUE;
490 }
491
492 /**
493 * @brief Read powerpoint text from given stream.
494 *
495 * Powerpoint contains texts in either TextBytesAtom or TextCharsAtom. Below
496 * are excerpt from [MS-PPT].pdf file describing the ppt file struture:
497 *
498 * TextCharsAtom contains an array of UTF-16 Unicode [RFC2781] characters that
499 * specifies the characters of the corresponding text. The length, in bytes, of
500 * the array is specified by rh.recLen. The array MUST NOT contain the NUL
501 * character 0x0000.
502 *
503 * TextBytesAtom contains an array of bytes that specifies the characters of the
504 * corresponding text. Each item represents the low byte of a UTF-16 Unicode
505 * [RFC2781] character whose high byte is 0x00. The length, in bytes, of the
506 * array is specified by rh.recLen. The array MUST NOT contain a 0x00 byte.
507 *
508 * @param stream Stream to read text bytes/chars atom
509 * @return read text or NULL if no text was read. Has to be freed by the caller
510 */
511 static void
512 ppt_read_text (GsfInput *stream,
513 guint8 **p_buffer,
514 gsize *p_buffer_size,
515 gsize *p_read_size)
516 {
517 PowerPointRecordHeader header;
518 gsize required_size;
519
520 g_return_if_fail (stream);
521 g_return_if_fail (p_buffer);
522 g_return_if_fail (p_buffer_size);
523 g_return_if_fail (p_read_size);
524
525 /* First read the header that describes the structures type
526 * (TextBytesAtom or TextCharsAtom) and it's length.
527 */
528 g_return_if_fail (ppt_read_header (stream, &header));
529
530 /* We only want header with type either TEXTBYTESATOM_RECORD_TYPE
531 * (TextBytesAtom) or TEXTCHARSATOM_RECORD_TYPE (TextCharsAtom).
532 *
533 * We don't care about anything else
534 */
535 if (header.recType != TEXTBYTESATOM_RECORD_TYPE &&
536 header.recType != TEXTCHARSATOM_RECORD_TYPE) {
537 return;
538 }
539
540 /* Then we'll allocate data for the actual texts */
541 if (header.recType == TEXTBYTESATOM_RECORD_TYPE) {
542 /* TextBytesAtom doesn't include high bytes propably in order to
543 * save space on the ppt files. We'll have to allocate double the
544 * size for it to get the high bytes
545 */
546 required_size = header.recLen * 2;
547 } else {
548 required_size = header.recLen;
549 }
550
551 /* Resize reused buffer if needed */
552 if (required_size > *p_buffer_size) {
553 *p_buffer = g_realloc (*p_buffer, required_size);
554 *p_buffer_size = required_size;
555 }
556
557 /* Then read the textual data from the stream */
558 if (!gsf_input_read (stream, header.recLen, *p_buffer)) {
559 return;
560 }
561
562 /* Again if we are reading TextBytesAtom we'll need to add those utf16
563 * high bytes ourselves. They are zero as specified in [MS-PPT].pdf
564 * and this function's comments
565 */
566 if (header.recType == TEXTBYTESATOM_RECORD_TYPE) {
567 gint i;
568
569 for (i = 0; i < header.recLen; i++) {
570 /* We'll add an empty 0 byte between each byte in the array */
571 (*p_buffer)[(header.recLen - i - 1) * 2] = (*p_buffer)[header.recLen - i - 1];
572 (*p_buffer)[((header.recLen - i - 1) * 2) + 1] = '\0';
573 }
574 }
575
576 /* Set read size as output */
577 *p_read_size = required_size;
578 }
579
580 /**
581 * @brief Find a specific header from given stream
582 * @param stream Stream to parse headers from
583 * @param type1 first type of header to look for
584 * @param type2 convenience parameter if we are looking for either of two
585 * header types
586 * @param rewind if a proper header is found should this function seek
587 * to the start of the header (TRUE)
588 * @return TRUE if either of specified headers was found
589 */
590 static gboolean
591 ppt_seek_header (GsfInput *stream,
592 gint type1,
593 gint type2,
594 gboolean rewind)
595 {
596 PowerPointRecordHeader header;
597
598 g_return_val_if_fail (stream,FALSE);
599
600 /* Read until we reach eof */
601 while (!gsf_input_eof (stream)) {
602 /* Read first header */
603 g_return_val_if_fail (ppt_read_header (stream, &header), FALSE);
604
605 /* Check if it's the correct type */
606 if (header.recType == type1 || header.recType == type2) {
607 /* Sometimes it's needed to rewind to the start of the
608 * header
609 */
610 if (rewind) {
611 gsf_input_seek (stream, -8, G_SEEK_CUR);
612 }
613
614 return TRUE;
615 }
616
617 /* If it's not the correct type, seek to the beginning of the
618 * next header
619 */
620 g_return_val_if_fail (!gsf_input_seek (stream,
621 header.recLen,
622 G_SEEK_CUR),
623 FALSE);
624 }
625
626 return FALSE;
627 }
628
629 static gchar *
630 extract_powerpoint_content (GsfInfile *infile,
631 gsize max_bytes,
632 gboolean *is_encrypted)
633 {
634 /* Try to find Powerpoint Document stream */
635 GsfInput *stream;
636 GString *all_texts = NULL;
637 gsf_off_t last_document_container;
638
639 /* If no content requested, return */
640 if (max_bytes == 0) {
641 return NULL;
642 }
643
644 stream = gsf_infile_child_by_name (infile, "PowerPoint Document");
645
646 if (is_encrypted) {
647 *is_encrypted = FALSE;
648 }
649
650 if (!stream) {
651 return NULL;
652 }
653
654 /* Powerpoint documents have a "editing history" stored within them.
655 * There is a structure that defines what changes were made each time
656 * but it is just easier to get the current/latest version just by
657 * finding the last occurrence of DocumentContainer structure
658 */
659 last_document_container = -1;
660
661 /* Read until we reach eof. */
662 while (!gsf_input_eof (stream)) {
663 PowerPointRecordHeader header;
664
665 /*
666 * We only read headers of data structures
667 */
668 if (!ppt_read_header (stream, &header)) {
669 break;
670 }
671
672 /* And we only care about headers with type 1000,
673 * DocumentContainer
674 */
675
676 if (header.recType == DOCUMENTCONTAINER_RECORD_TYPE) {
677 last_document_container = gsf_input_tell (stream);
678 }
679
680 /* and then seek to the start of the next data
681 * structure so it is fast and we don't have to read
682 * through the whole file
683 */
684 if (gsf_input_seek (stream, header.recLen, G_SEEK_CUR)) {
685 break;
686 }
687 }
688
689 /* If a DocumentContainer was found and we are able to seek to it.
690 *
691 * Then we'll have to find the second header with type
692 * SLIDELISTWITHTEXT_RECORD_TYPE since DocumentContainer
693 * contains MasterListWithTextContainer and
694 * SlideListWithTextContainer structures with both having the
695 * same header type. We however only want
696 * SlideListWithTextContainer which contains the textual
697 * content of the power point file.
698 */
699 if (last_document_container >= 0 &&
700 !gsf_input_seek (stream, last_document_container, G_SEEK_SET) &&
701 ppt_seek_header (stream,
702 SLIDELISTWITHTEXT_RECORD_TYPE,
703 SLIDELISTWITHTEXT_RECORD_TYPE,
704 FALSE) &&
705 ppt_seek_header (stream,
706 SLIDELISTWITHTEXT_RECORD_TYPE,
707 SLIDELISTWITHTEXT_RECORD_TYPE,
708 FALSE)) {
709 gsize bytes_remaining = max_bytes;
710 guint8 *buffer = NULL;
711 gsize buffer_size = 0;
712
713 /*
714 * Read while we have either TextBytesAtom or
715 * TextCharsAtom and we have read less than max_bytes
716 * (in UTF-8)
717 */
718 while (bytes_remaining > 0 &&
719 ppt_seek_header (stream,
720 TEXTBYTESATOM_RECORD_TYPE,
721 TEXTCHARSATOM_RECORD_TYPE,
722 TRUE)) {
723 gsize read_size = 0;
724
725 /* Read the UTF-16 text in the reused buffer, and also get
726 * number of read bytes */
727 ppt_read_text (stream, &buffer, &buffer_size, &read_size);
728
729 /* Avoid empty strings */
730 if (read_size > 0) {
731 /* Convert, normalize and limit max words & bytes.
732 * NOTE: `is_ansi' argument is FALSE, as the string is
733 * always in UTF-16 */
734 msoffice_convert_and_normalize_chunk (buffer,
735 read_size,
736 FALSE, /* Always UTF-16 */
737 &bytes_remaining,
738 &all_texts);
739 }
740 }
741
742 g_free (buffer);
743 }
744
745 g_object_unref (stream);
746
747 return all_texts ? g_string_free (all_texts, FALSE) : NULL;
748 }
749
750 static GsfInfile *
751 open_file (const gchar *filename, FILE *file)
752 {
753 GsfInput *input;
754 GsfInfile *infile;
755 GError *error = NULL;
756
757 input = gsf_input_stdio_new_FILE (filename, file, TRUE);
758
759 if (!input) {
760 return NULL;
761 }
762
763 infile = gsf_infile_msole_new (input, &error);
764
765 if (error) {
766 g_warning ("Failed to open file: %s", error->message);
767 g_error_free (error);
768 }
769
770 g_object_unref (input);
771
772 return infile;
773 }
774
775 /* This function was programmed by using ideas and algorithms from
776 * b2xtranslator project (http://b2xtranslator.sourceforge.net/)
777 */
778 static gchar *
779 extract_msword_content (GsfInfile *infile,
780 gsize n_bytes,
781 gboolean *is_encrypted)
782 {
783 GsfInput *document_stream, *table_stream;
784 gint16 i = 0;
785 guint8 tmp_buffer[4] = { 0 };
786 gint fcClx, lcbClx;
787 guint8 *piece_table = NULL;
788 guint8 *clx = NULL;
789 gint lcb_piece_table;
790 gint piece_count = 0;
791 gint32 fc;
792 GString *content = NULL;
793 guint8 *text_buffer = NULL;
794 gint text_buffer_size = 0;
795 gsize n_bytes_remaining;
796
797 /* If no content requested, return */
798 if (n_bytes == 0) {
799 return NULL;
800 }
801
802 document_stream = gsf_infile_child_by_name (infile, "WordDocument");
803 if (document_stream == NULL) {
804 return NULL;
805 }
806
807 /* abort if FIB can't be found from beginning of WordDocument stream */
808 gsf_input_seek (document_stream, 0, G_SEEK_SET);
809 gsf_input_read (document_stream, 2, tmp_buffer);
810 if (read_16bit (tmp_buffer) != 0xa5ec) {
811 g_object_unref (document_stream);
812 return NULL;
813 }
814
815 /* abort if document is encrypted */
816 gsf_input_seek (document_stream, 11, G_SEEK_SET);
817 gsf_input_read (document_stream, 1, tmp_buffer);
818 if ((tmp_buffer[0] & 0x1) == 0x1) {
819 g_object_unref (document_stream);
820 *is_encrypted = TRUE;
821 return NULL;
822 } else
823 *is_encrypted = FALSE;
824
825 /* document can have 0Table or 1Table or both. If flag 0x0200 is
826 * set to true in word 0x000A of the FIB then 1Table is used
827 */
828 gsf_input_seek (document_stream, 0x000A, G_SEEK_SET);
829 gsf_input_read (document_stream, 2, tmp_buffer);
830 i = read_16bit (tmp_buffer);
831
832 if ((i & 0x0200) == 0x0200) {
833 table_stream = gsf_infile_child_by_name (infile, "1Table");
834 } else {
835 table_stream = gsf_infile_child_by_name (infile, "0Table");
836 }
837
838 if (table_stream == NULL) {
839 g_object_unref (G_OBJECT (document_stream));
840 return NULL;
841 }
842
843 /* find out location and length of piece table from FIB */
844 gsf_input_seek (document_stream, 418, G_SEEK_SET);
845 gsf_input_read (document_stream, 4, tmp_buffer);
846 fcClx = read_32bit (tmp_buffer);
847 gsf_input_read (document_stream, 4, tmp_buffer);
848 lcbClx = read_32bit (tmp_buffer);
849
850 /* If we got an invalid or empty length of piece table, just return
851 * as we cannot iterate over pieces */
852 if (lcbClx <= 0) {
853 g_object_unref (document_stream);
854 g_object_unref (table_stream);
855 return NULL;
856 }
857
858 /* copy the structure holding the piece table into the clx array. */
859 clx = g_malloc (lcbClx);
860 gsf_input_seek (table_stream, fcClx, G_SEEK_SET);
861 gsf_input_read (table_stream, lcbClx, clx);
862
863 /* find out piece table from clx and set piece_table -pointer to it */
864 i = 0;
865 lcb_piece_table = 0;
866
867 while (TRUE) {
868 if (clx[i] == 2) {
869 /* Nice, a proper structure with contents, no need to
870 * iterate more. */
871 lcb_piece_table = read_32bit (clx + (i + 1));
872 piece_table = clx + i + 5;
873 piece_count = (lcb_piece_table - 4) / 12;
874 break;
875 } else if (clx[i] == 1) {
876 /* Oh, a PRC structure with properties of text, not
877 * real text, so skip it */
878 guint16 GrpPrl_len;
879
880 GrpPrl_len = read_16bit (&clx[i+1]);
881 /* 3 is the length of clxt (1byte) and cbGrpprl(2bytes) */
882 i = i + 3 + GrpPrl_len;
883 } else {
884 break;
885 }
886 }
887
888 /* Iterate over pieces...
889 * Loop is halted whenever one of this conditions is met:
890 * a) Max bytes to be read reached
891 * b) No more pieces to read
892 */
893 i = 0;
894 n_bytes_remaining = n_bytes;
895 while (n_bytes_remaining > 0 &&
896 i < piece_count) {
897 guint8 *piece_descriptor;
898 gint piece_start;
899 gint piece_end;
900 gint piece_size;
901 gboolean is_ansi;
902
903 /* logical position of the text piece in the document_stream */
904 piece_start = read_32bit (piece_table + (i * 4));
905 piece_end = read_32bit (piece_table + ((i + 1) * 4));
906
907 /* descriptor of single piece from piece table */
908 piece_descriptor = piece_table + ((piece_count + 1) * 4) + (i * 8);
909
910 /* file character position */
911 fc = read_32bit (piece_descriptor + 2);
912
913 /* second bit is set to 1 if text is saved in ANSI encoding */
914 is_ansi = (fc & 0x40000000) == 0x40000000;
915
916 /* modify file character position according to text encoding */
917 if (!is_ansi) {
918 fc = (fc & 0xBFFFFFFF);
919 } else {
920 fc = (fc & 0xBFFFFFFF) >> 1;
921 }
922
923 piece_size = piece_end - piece_start;
924
925 /* NOTE: Very very long pieces may appear. In fact, a single
926 * piece document seems to be quite normal. Thus, we limit
927 * here the number of bytes to read from the stream, based
928 * on the maximum number of bytes in UTF-8. Assuming, then
929 * that a safe limit is 2*n_bytes_remaining if UTF-16 input,
930 * and just n_bytes_remaining in CP1251 input */
931 piece_size = MIN (piece_size, n_bytes_remaining);
932
933 /* UTF-16 uses twice as many bytes as CP1252
934 * NOTE: Not quite sure about this. Some unicode points will be
935 * encoded using 4 bytes in UTF-16 */
936 if (!is_ansi) {
937 piece_size *= 2;
938 }
939
940 /* Avoid empty pieces */
941 if (piece_size >= 1) {
942
943 /* Re-allocate buffer to make it bigger if needed.
944 * This text buffer is re-used over and over in each
945 * iteration. */
946 if (piece_size > text_buffer_size) {
947 text_buffer = g_realloc (text_buffer, piece_size);
948 text_buffer_size = piece_size;
949 }
950
951 /* read and parse single text piece from document_stream */
952 gsf_input_seek (document_stream, fc, G_SEEK_SET);
953 gsf_input_read (document_stream, piece_size, text_buffer);
954
955 msoffice_convert_and_normalize_chunk (text_buffer,
956 piece_size,
957 is_ansi,
958 &n_bytes_remaining,
959 &content);
960 }
961
962 /* Go on to next piece */
963 i++;
964 }
965
966 g_free (text_buffer);
967 g_object_unref (document_stream);
968 g_object_unref (table_stream);
969 g_free (clx);
970
971 return content ? g_string_free (content, FALSE) : NULL;
972 }
973
974 /* Reads and interprets the flags of a given string. May be
975 * used just to skip the fields, as when this bitmask-byte
976 * comes as the first byte of a new record.
977 * NOTE: For a detailed meaning of each field parsed here,
978 * take a look at the XLUnicodeRichExtendedString format:
979 * http://msdn.microsoft.com/en-us/library/dd943830.aspx
980 **/
981 static void
982 read_excel_string_flags (GsfInput *stream,
983 gboolean *p_is_high_byte,
984 guint16 *p_c_run,
985 guint16 *p_cb_ext_rst)
986 {
987 guint8 tmp_buffer[4] = { 0 };
988 guint8 bit_mask;
989 gboolean is_ext_string;
990 gboolean is_rich_string;
991
992 /* Note that output arguments may be NULL if we don't need
993 * their values... */
994
995 /* Reading 1 byte for mask */
996 gsf_input_read (stream, 1, tmp_buffer);
997 bit_mask = read_8bit (tmp_buffer);
998
999 /* Get flags */
1000 if (p_is_high_byte) {
1001 *p_is_high_byte = (bit_mask & 0x01) == 0x01;
1002 }
1003 is_ext_string = (bit_mask & 0x04) == 0x04;
1004 is_rich_string = (bit_mask & 0x08) == 0x08;
1005
1006 /* If the c_run value is required as output, read it */
1007 if (p_c_run) {
1008 if (is_rich_string) {
1009 /* Reading 2 Bytes */
1010 gsf_input_read (stream, 2, tmp_buffer);
1011
1012 /* Reading cRun */
1013 *p_c_run = read_16bit (tmp_buffer);
1014 } else {
1015 *p_c_run = 0;
1016 }
1017 } else if (is_rich_string) {
1018 /* If not required, just skip those bytes */
1019 gsf_input_seek (stream, 2, G_SEEK_CUR);
1020 }
1021
1022 /* If the cb_ext_rst value is required as output, read it */
1023 if (p_cb_ext_rst) {
1024 if (is_ext_string) {
1025 /* Reading 4 Bytes */
1026 gsf_input_read (stream, 4, tmp_buffer);
1027
1028 /* Reading cRun */
1029 *p_cb_ext_rst = read_16bit (tmp_buffer);
1030 } else {
1031 *p_cb_ext_rst = 0;
1032 }
1033 } else if (is_ext_string) {
1034 /* If not required, just skip those bytes */
1035 gsf_input_seek (stream, 4, G_SEEK_CUR);
1036 }
1037 }
1038
1039 /* Returns TRUE if record was changed. BUT, the value of the
1040 * current_record should be checked by the caller to know
1041 * if there are no more records */
1042 static gboolean
1043 change_excel_record_if_needed (GsfInput *stream,
1044 GArray *record_array,
1045 guint *p_current_record)
1046 {
1047 ExcelExtendedStringRecord *record;
1048
1049 /* Get current record */
1050 record = &g_array_index (record_array,
1051 ExcelExtendedStringRecord,
1052 *p_current_record);
1053
1054 /* We may already have surpassed the record, so adjust if so */
1055 if (gsf_input_tell (stream) >= (record->offset + record->length)) {
1056 /* Switch records and read from the second one... */
1057 (*p_current_record)++;
1058
1059 if (*p_current_record < record_array->len) {
1060 record = &g_array_index (record_array,
1061 ExcelExtendedStringRecord,
1062 *p_current_record);
1063
1064 gsf_input_seek (stream, record->offset, G_SEEK_SET);
1065 }
1066
1067 return TRUE;
1068 }
1069
1070 return FALSE;
1071 }
1072
1073 /* Returns TRUE if correctly read
1074 *
1075 * Note that p_current_record may get changed if the required
1076 * bytes to read were split into two different records.
1077 */
1078 static gboolean
1079 read_excel_string (GsfInput *stream,
1080 guint8 *buffer,
1081 gsize chunk_size,
1082 GArray *record_array,
1083 guint *p_current_record)
1084 {
1085 ExcelExtendedStringRecord *record;
1086 gsf_off_t current_position;
1087 gsf_off_t current_record_end;
1088
1089 /* Record may have changed when we want to read the string contents
1090 * This is a pretty special case, where the new CONTINUE record
1091 * shouldn't start with a bitmask */
1092 if (change_excel_record_if_needed (stream, record_array, p_current_record) &&
1093 *p_current_record >= record_array->len) {
1094 /* When reached max number of records, just return */
1095 return FALSE;
1096 }
1097
1098 /* Get current record */
1099 record = &g_array_index (record_array,
1100 ExcelExtendedStringRecord,
1101 *p_current_record);
1102
1103 /* Compute current position in the stream and end of current record*/
1104 current_position = gsf_input_tell (stream);
1105 current_record_end = record->offset + record->length;
1106
1107 /* The best case is when the whole number of bytes to read are in the
1108 * current record, as no record switching is therefore needed */
1109 if (current_position + chunk_size <= current_record_end) {
1110 return gsf_input_read (stream, chunk_size, buffer) != NULL ? TRUE : FALSE;
1111 } else if (current_record_end < current_position) {
1112 /* Safety check, actually pretty important */
1113 return FALSE;
1114 } else {
1115 /* Read the string in two chunks */
1116 gsize chunk_size_first_record;
1117 gsize chunk_size_second_record;
1118
1119 /* Compute how much to read in each record */
1120 chunk_size_first_record = current_record_end - current_position;
1121 chunk_size_second_record = chunk_size - chunk_size_first_record;
1122
1123 /* g_debug ("Current position: %" GSF_OFF_T_FORMAT, current_position); */
1124 /* g_debug ("Current record index: %u", *p_current_record); */
1125 /* g_debug ("Current record start: %" GSF_OFF_T_FORMAT, record->offset); */
1126 /* g_debug ("Current record length: %" G_GSIZE_FORMAT, record->length); */
1127 /* g_debug ("Current record end: %" GSF_OFF_T_FORMAT, current_record_end); */
1128 /* g_debug ("Bytes to read: %" G_GSIZE_FORMAT, chunk_size); */
1129 /* g_debug ("Bytes to read (1st): %" G_GSIZE_FORMAT, chunk_size_first_record); */
1130 /* g_debug ("Bytes to read (2nd): %" G_GSIZE_FORMAT, chunk_size_second_record); */
1131
1132 /* Now, read from first record... */
1133 if (gsf_input_read (stream,
1134 chunk_size_first_record,
1135 buffer)) {
1136 /* Now switch records and read from the second one... */
1137 (*p_current_record)++;
1138
1139 if (*p_current_record < record_array->len) {
1140 record = &g_array_index (record_array,
1141 ExcelExtendedStringRecord,
1142 *p_current_record);
1143
1144 /* g_debug ("New record index: %u", *p_current_record); */
1145 /* g_debug ("New record start: %" GSF_OFF_T_FORMAT, record->offset); */
1146 /* g_debug ("New record length: %" G_GSIZE_FORMAT, record->length); */
1147
1148 /* Move stream pointer to the new location, beginning of next record */
1149 gsf_input_seek (stream, record->offset, G_SEEK_SET);
1150
1151 /* Every CONTINUE records starts with a bitmask + optional fields that
1152 * should be skipped properly */
1153 read_excel_string_flags (stream, NULL, NULL, NULL);
1154
1155 /* And finally, read the second part */
1156 if (gsf_input_read (stream,
1157 chunk_size_second_record,
1158 &buffer[chunk_size_first_record])) {
1159 /* All OK! */
1160 return TRUE;
1161 }
1162 }
1163 }
1164
1165 return FALSE;
1166 }
1167 }
1168
1169
1170
1171 /**
1172 * [MS-XLS] — v20090708
1173 * Excel Binary File Format (.xls) Structure Specification
1174 * Copyright © 2009 Microsoft Corporation.
1175 * Release: Wednesday, July 8, 2009
1176 *
1177 * 2.5.293 XLUnicodeRichExtendedString
1178 * This structure specifies a Unicode string, which can contain
1179 * formatting information and phoneticstring data.
1180
1181 * This structure‘s non-variable fields MUST be specified in the same
1182 * record. This structure‘s variable fields can be extended with
1183 * Continue records. A value from the table for fHighByte MUST be
1184 * specified in the first byte of the continue field of the Continue
1185 * record followed by the remaining portions of this structure‘s
1186 * variable fields.
1187 * 1 2 3
1188 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1189 * cch A B C D reserved2 cRun (optional)
1190 * ... cbExtRst (optional)
1191 * ... rgb (variable)
1192 * ...
1193 * rgRun (variable, optional)
1194 * ...
1195 * ExtRst (variable, optional)
1196 * ...
1197 * cch (2 bytes): An unsigned integer that specifies the count of
1198 * characters in the string.
1199 *
1200 * A - fHighByte (1 bit): A bit that specifies whether the characters
1201 * in rgb are double-byte characters. MUST be a value from the
1202 * following table:
1203 *
1204 * Value Meaning
1205 * 0x0 All the characters in the string have a high byte of 0x00
1206 * and only the low bytes are in rgb.
1207 * 0x1 All the characters in the string are saved as double-byte
1208 * characters in rgb.
1209 * B - reserved1 (1 bit): MUST be zero, and MUST be ignored.
1210 * C - fExtSt (1 bit): A bit that specifies whether the string
1211 * contains phonetic string data.
1212 * D - fRichSt (1 bit): A bit that specifies whether the string is a
1213 * rich string and the string has at least two character formats
1214 * applied.
1215 *
1216 * reserved2 (4 bits): MUST be zero, and MUST be ignored.
1217 *
1218 * cRun (2 bytes): An optional unsigned integer that specifies the
1219 * number of elements in rgRun. MUST exist if and only if fRichSt is
1220 * 0x1.
1221 *
1222 * cbExtRst (4 bytes): An optional signed integer that specifies the
1223 * byte count of ExtRst. MUST exist if and only if fExtSt is 0x1. MUST
1224 * be zero or greater.
1225 *
1226 * rgb (variable): An array of bytes that specifies the characters in
1227 * the string. If fHighByte is 0x0, the size of the array is cch. If
1228 * fHighByte is 0x1, the size of the array is cch*2. If fHighByte is
1229 * 0x1 and rgb is extended with a Continue record the break MUST occur
1230 * at the double-byte character boundary.
1231 *
1232 * rgRun (variable): An optional array of FormatRun structures that
1233 * specifies the formatting for each text run. The number of elements
1234 * in the array is cRun. MUST exist if and only if fRichSt is 0x1.
1235 *
1236 * ExtRst (variable): An optional ExtRst that specifies the phonetic
1237 * string data. The size of this field is cbExtRst. MUST exist if and
1238 * only if fExtSt is 0x1.
1239 */
1240 static void
1241 xls_get_extended_record_string (GsfInput *stream,
1242 GArray *list,
1243 gsize *p_bytes_remaining,
1244 GString **p_content)
1245 {
1246 ExcelExtendedStringRecord *record;
1247 guint32 cst_unique;
1248 guint parsing_record = 0;
1249 guint8 tmp_buffer[4] = { 0 };
1250 guint i;
1251 guint8 *buffer = NULL;
1252 gsize buffer_size = 0;
1253
1254 /* Parsing the record from the list */
1255 record = &g_array_index (list, ExcelExtendedStringRecord, parsing_record);
1256
1257 /* First record parsing */
1258 if (gsf_input_seek (stream, record->offset, G_SEEK_SET)) {
1259 return;
1260 }
1261
1262 /* Note: The first record is ALWAYS the SST, so coming with cst_total and
1263 * cst_unique values.
1264 * Some extra background: Records with data longer than 8,224 bytes MUST be
1265 * split into several records, so in this case, if the SST record is big
1266 * enough, it will have one or more CONTINUE records
1267 *
1268 * SST record: http://msdn.microsoft.com/en-us/library/dd773037%28v=office.12%29.aspx
1269 * CONTINUE record: http://msdn.microsoft.com/en-us/library/dd949081%28v=office.12%29.aspx
1270 **/
1271
1272 /* Reading cst total */
1273 gsf_input_read (stream, 4, tmp_buffer);
1274 read_32bit (tmp_buffer);
1275
1276 /* Reading cst unique */
1277 gsf_input_read (stream, 4, tmp_buffer);
1278 cst_unique = read_32bit (tmp_buffer);
1279
1280 /* Iterate over chunks...
1281 * Loop is halted whenever one of this conditions is met:
1282 * a) Max bytes to be read reached
1283 * b) No more chunks to read
1284 */
1285 i = 0;
1286 while (*p_bytes_remaining > 0 &&
1287 i < cst_unique) {
1288 guint16 cch;
1289 guint16 c_run;
1290 guint16 cb_ext_rst;
1291 gboolean is_high_byte;
1292 gsize chunk_size;
1293
1294 /* RECORD may have been changed here */
1295 if (change_excel_record_if_needed (stream, list, &parsing_record) &&
1296 parsing_record >= list->len) {
1297 /* When reached max number of records, stop loop */
1298 break;
1299 }
1300
1301 /* Reading 2 bytes for cch */
1302 gsf_input_read (stream, 2, tmp_buffer);
1303
1304 /* Reading cch - char count of current string */
1305 cch = read_16bit (tmp_buffer);
1306
1307 /* Read string flags */
1308 read_excel_string_flags (stream,
1309 &is_high_byte,
1310 &c_run,
1311 &cb_ext_rst);
1312
1313 /* RECORD may have been changed here, but it is managed when reading the
1314 * string contents */
1315
1316
1317 /* NOTE: In order to avoid reading unnecessary bytes, limit it based
1318 * on the number of bytes remaining */
1319 chunk_size = MIN (cch, *p_bytes_remaining);
1320
1321 /* If High Byte, chunk size *2 as stream is in UTF-16 */
1322 if (is_high_byte) {
1323 chunk_size *= 2;
1324 }
1325
1326 /* If the new chunk size is longer than our reused buffer,
1327 * make the buffer bigger */
1328 if (chunk_size > buffer_size) {
1329 buffer = g_realloc (buffer, chunk_size);
1330 buffer_size = chunk_size;
1331 }
1332
1333 /* Read the chunk! NOTE that it may be split in several records... */
1334 if (!read_excel_string (stream, buffer, chunk_size, list, &parsing_record)) {
1335 break;
1336 }
1337
1338 /* Read whole stream in one operation */
1339 msoffice_convert_and_normalize_chunk (buffer,
1340 chunk_size,
1341 !is_high_byte,
1342 p_bytes_remaining,
1343 p_content);
1344
1345 /* Formatting string */
1346 if (c_run > 0) {
1347 /* rgRun (variable): An optional array of
1348 * FormatRun structures that specifies the
1349 * formatting for each ext run. The number of
1350 * elements in the array is cRun. MUST exist
1351 * if and only if fRichSt is 0x1.
1352 *
1353 * Note: As defined in MSDN, a FormatRun structure has a size
1354 * of 4 bytes, so the size of this rgRun variable is really
1355 * (4*cRun) bytes.
1356 * http://msdn.microsoft.com/en-us/library/dd921712.aspx
1357 *
1358 * Skiping this as it will not be useful in
1359 * our case.
1360 */
1361 gsf_input_seek (stream, 4 * c_run, G_SEEK_CUR);
1362 /* Note that we may be now out of the current record after having
1363 * done this seek operation. */
1364 }
1365
1366 /* ExtString */
1367 if (cb_ext_rst > 0) {
1368 /* Again its not so clear may be it will not
1369 * useful in our case.
1370 */
1371 gsf_input_seek (stream, cb_ext_rst, G_SEEK_CUR);
1372 /* Note that we may be now out of the current record after having
1373 * done this seek operation. */
1374 }
1375
1376 /* Go to next chunk */
1377 i++;
1378 }
1379 }
1380
1381 /**
1382 * @brief Extract excel content from specified infile
1383 * @param infile file to read summary from
1384 * @param n_words number of max words to extract
1385 * @param n_bytes max number of bytes to extract
1386 * @param is_encrypted
1387 * @Notes :- About SST record
1388 *
1389 * This record specifies string constants.
1390 * [MS-XLS] — v20090708
1391 * Excel Binary File Format (.xls) Structure Specification
1392 * Copyright © 2009 Microsoft Corporation.
1393 * Release: Wednesday, July 8, 2009
1394 *
1395 * Each string constant in this record has one or more references in
1396 * the workbook, with the goal of improving performance in opening and
1397 * saving the file. The LabelSst record specifies how to make a
1398 * reference to a string in this record.
1399 * 1 2 3
1400 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1401 * cstTotal
1402 * cstUnique
1403 * rgb (variable)
1404 * ...
1405 * cstTotal (4 bytes): A signed integer that specifies the total
1406 * number of references in the workbook to the strings in the shared
1407 * string table. MUST be greater than or equal to 0.
1408 *
1409 * cstUnique (4 bytes): A signed integer that specifies the number of
1410 * unique strings in the shared string table. MUST be greater than or
1411 * equal to 0.
1412 *
1413 * rgb (variable): An array of XLUnicodeRichExtendedString structures.
1414 * Records in this array are unique.
1415 */
1416 static gchar*
1417 extract_excel_content (GsfInfile *infile,
1418 gsize n_bytes,
1419 gboolean *is_encrypted)
1420 {
1421 ExcelBiffHeader header1;
1422 GString *content = NULL;
1423 GsfInput *stream;
1424 guint saved_offset;
1425 gsize n_bytes_remaining = n_bytes;
1426
1427 /* If no content requested, return */
1428 if (n_bytes == 0) {
1429 return NULL;
1430 }
1431
1432 stream = gsf_infile_child_by_name (infile, "Workbook");
1433
1434 if (!stream) {
1435 return NULL;
1436 }
1437
1438 /* Read until we reach eof or any of our limits reached */
1439 while (n_bytes_remaining > 0 &&
1440 !gsf_input_eof (stream)) {
1441 guint8 tmp_buffer[4] = { 0 };
1442
1443 /* Reading 4 bytes to read header */
1444 gsf_input_read (stream, 4, tmp_buffer);
1445 header1.id = read_16bit (tmp_buffer);
1446 header1.length = read_16bit (tmp_buffer + 2);
1447
1448 /* g_debug ("id: %d , length %d", header.id, header.length); */
1449
1450 /* We are interested only in SST record */
1451 if (header1.id == RECORD_TYPE_SST) {
1452 ExcelExtendedStringRecord record;
1453 ExcelBiffHeader header2;
1454 GArray *list;
1455 guint length = 0;
1456
1457 /* Saving length and offset so that will
1458 * return to saved once we are done!!
1459 */
1460 length = header1.length;
1461 saved_offset = gsf_input_tell (stream);
1462
1463 /* Saving ExtendendString Record offset and
1464 * length.
1465 */
1466 record.offset = gsf_input_tell (stream);
1467 record.length = length;
1468
1469 /* g_debug ("record.offset: %u record.length:%d", */
1470 /* record.offset, record.length); */
1471
1472 /* Allocation new array of ExtendendString Record */
1473 list = g_array_new (TRUE, TRUE, sizeof (ExcelExtendedStringRecord));
1474
1475 if (!list) {
1476 break;
1477 }
1478
1479 g_array_append_val (list, record);
1480
1481 /* Reading to parse continue record.
1482 *
1483 * Note: we are justing parsing notrequired
1484 * to read data so passing null data
1485 */
1486 gsf_input_seek (stream, length, G_SEEK_CUR);
1487
1488 /* Reading & Assigning biff header 4 bytes */
1489 gsf_input_read (stream, 4, tmp_buffer);
1490
1491 header2.id = read_16bit (tmp_buffer);
1492 header2.length = read_16bit (tmp_buffer + 2);
1493
1494 /* g_debug ("bf id :%d length:%d", header2.id, header2.length); */
1495 /* g_debug ("offset: %u", (guint) gsf_input_tell (stream)); */
1496
1497 while (header2.id == RECORD_TYPE_CONTINUE) {
1498 /* Assigning to linkedlist we will use
1499 * it to read data
1500 */
1501 record.offset = gsf_input_tell (stream);
1502 record.length = header2.length;
1503 g_array_append_val (list, record);
1504
1505 /* g_debug ("record.offset: %u record.length:%d", */
1506 /* record.offset, record.length); */
1507
1508 /* Then parse the data from the stream */
1509 gsf_input_seek (stream, header2.length, G_SEEK_CUR);
1510
1511 /* Reading and assigning biff header */
1512 gsf_input_read (stream, 4, tmp_buffer);
1513 header2.id = read_16bit (tmp_buffer);
1514 header2.length = read_16bit (tmp_buffer + 2);
1515
1516 /* g_debug ("bf id :%d length:%d", header2.id, header2.length); */
1517 };
1518
1519 /* Read extended string */
1520 xls_get_extended_record_string (stream,
1521 list,
1522 &n_bytes_remaining,
1523 &content);
1524
1525 g_array_unref (list);
1526
1527 /* Restoring the old_offset */
1528 gsf_input_seek (stream, saved_offset, G_SEEK_SET);
1529 break;
1530 }
1531
1532 /* Moving stream pointer to record length */
1533 if (gsf_input_seek (stream, header1.length, G_SEEK_CUR)) {
1534 break;
1535 }
1536 }
1537
1538 g_object_unref (stream);
1539
1540 g_debug ("Bytes extracted: %" G_GSIZE_FORMAT,
1541 n_bytes - n_bytes_remaining);
1542
1543 return content ? g_string_free (content, FALSE) : NULL;
1544 }
1545
1546 /**
1547 * @brief Extract summary OLE stream from specified uri
1548 * @param metadata where to store summary
1549 * @param infile file to read summary from
1550 * @param uri uri of the file
1551 */
1552 static gboolean
1553 extract_summary (TrackerSparqlBuilder *metadata,
1554 GsfInfile *infile,
1555 const gchar *uri)
1556 {
1557 GsfInput *stream;
1558
1559 tracker_sparql_builder_predicate (metadata, "a");
1560 tracker_sparql_builder_object (metadata, "nfo:PaginatedTextDocument");
1561
1562 stream = gsf_infile_child_by_name (infile, "\05SummaryInformation");
1563
1564 if (stream) {
1565 GsfDocMetaData *md;
1566 MetadataInfo info;
1567 GError *error = NULL;
1568
1569 md = gsf_doc_meta_data_new ();
1570 error = gsf_doc_meta_data_read_from_msole (md, stream);
1571
1572 if (error) {
1573 g_warning ("Could not extract summary information, %s",
1574 error->message ? error->message : "no error given");
1575
1576 g_error_free (error);
1577 g_object_unref (md);
1578 g_object_unref (stream);
1579 gsf_shutdown ();
1580
1581 return FALSE;
1582 }
1583
1584 info.metadata = metadata;
1585 info.uri = uri;
1586
1587 gsf_doc_meta_data_foreach (md, summary_metadata_cb, &info);
1588
1589 g_object_unref (md);
1590 g_object_unref (stream);
1591 }
1592
1593 stream = gsf_infile_child_by_name (infile, "\05DocumentSummaryInformation");
1594
1595 if (stream) {
1596 GsfDocMetaData *md;
1597 MetadataInfo info;
1598 GError *error = NULL;
1599
1600 md = gsf_doc_meta_data_new ();
1601
1602 error = gsf_doc_meta_data_read_from_msole (md, stream);
1603 if (error) {
1604 g_warning ("Could not extract document summary information, %s",
1605 error->message ? error->message : "no error given");
1606
1607 g_error_free (error);
1608 g_object_unref (md);
1609 g_object_unref (stream);
1610 gsf_shutdown ();
1611
1612 return FALSE;
1613 }
1614
1615 info.metadata = metadata;
1616 info.uri = uri;
1617
1618 gsf_doc_meta_data_foreach (md, document_metadata_cb, &info);
1619
1620 g_object_unref (md);
1621 g_object_unref (stream);
1622 }
1623
1624 return TRUE;
1625 }
1626
1627 /**
1628 * @brief Extract data from generic office files
1629 *
1630 * At the moment only extracts document summary from summary OLE stream.
1631 * @param uri URI of the file to extract data
1632 * @param metadata where to store extracted data to
1633 */
1634 G_MODULE_EXPORT gboolean
1635 tracker_extract_get_metadata (TrackerExtractInfo *info)
1636 {
1637 TrackerSparqlBuilder *metadata;
1638 TrackerConfig *config;
1639 GsfInfile *infile = NULL;
1640 gchar *content = NULL, *uri;
1641 gboolean is_encrypted = FALSE;
1642 const gchar *mime_used;
1643 gsize max_bytes;
1644 GFile *file;
1645 gchar *filename;
1646 FILE *mfile;
1647
1648 gsf_init ();
1649
1650 metadata = tracker_extract_info_get_metadata_builder (info);
1651 mime_used = tracker_extract_info_get_mimetype (info);
1652
1653 file = tracker_extract_info_get_file (info);
1654 uri = g_file_get_uri (file);
1655
1656 filename = g_filename_from_uri (uri, NULL, NULL);
1657
1658 mfile = tracker_file_open (filename);
1659 g_free (filename);
1660
1661 if (!mfile) {
1662 g_warning ("Can't open file from uri '%s': %s",
1663 uri, g_strerror (errno));
1664 g_free (uri);
1665 return FALSE;
1666 }
1667
1668 infile = open_file (uri, mfile);
1669 if (!infile) {
1670 gsf_shutdown ();
1671 g_free (uri);
1672 if (mfile) {
1673 tracker_file_close (mfile, FALSE);
1674 }
1675 return FALSE;
1676 }
1677
1678 /* Extracting summary */
1679 extract_summary (metadata, infile, uri);
1680
1681 /* Set max bytes to read from content */
1682 config = tracker_main_get_config ();
1683 max_bytes = tracker_config_get_max_bytes (config);
1684
1685 if (g_ascii_strcasecmp (mime_used, "application/msword") == 0) {
1686 /* Word file */
1687 content = extract_msword_content (infile, max_bytes, &is_encrypted);
1688 } else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-powerpoint") == 0) {
1689 /* PowerPoint file */
1690 tracker_sparql_builder_predicate (metadata, "a");
1691 tracker_sparql_builder_object (metadata, "nfo:Presentation");
1692
1693 content = extract_powerpoint_content (infile, max_bytes, &is_encrypted);
1694 } else if (g_ascii_strcasecmp (mime_used, "application/vnd.ms-excel") == 0) {
1695 /* Excel File */
1696 tracker_sparql_builder_predicate (metadata, "a");
1697 tracker_sparql_builder_object (metadata, "nfo:Spreadsheet");
1698
1699 content = extract_excel_content (infile, max_bytes, &is_encrypted);
1700 } else {
1701 g_message ("Mime type was not recognised:'%s'", mime_used);
1702 }
1703
1704 if (content) {
1705 tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
1706 tracker_sparql_builder_object_unvalidated (metadata, content);
1707 g_free (content);
1708 }
1709
1710 if (is_encrypted) {
1711 tracker_sparql_builder_predicate (metadata, "nfo:isContentEncrypted");
1712 tracker_sparql_builder_object_boolean (metadata, TRUE);
1713 }
1714
1715 g_object_unref (infile);
1716 g_free (uri);
1717 gsf_shutdown ();
1718 if (mfile) {
1719 tracker_file_close (mfile, FALSE);
1720 }
1721
1722 return TRUE;
1723 }