No issues found
1 /*
2 * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
18 */
19
20 #include "config.h"
21
22 #include <string.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25
26 #include <glib.h>
27 #include <gio/gio.h>
28
29 #include <libtracker-extract/tracker-extract.h>
30
31 #include "tracker-read.h"
32
33 /* Size of the buffer to use when reading, in bytes */
34 #define BUFFER_SIZE 65535
35
36 static gchar *
37 get_string_from_guessed_encoding (const gchar *str,
38 gsize str_len,
39 gsize *utf8_len)
40 {
41 const gchar *current = NULL;
42
43 /* If we have embedded NULs try UTF-16 directly */
44 if (memchr (str, '\0', str_len))
45 current = "UTF-16";
46 /* If locale charset is UTF-8, try with windows-1252.
47 * NOTE: g_get_charset() returns TRUE if locale charset is UTF-8 */
48 else if (g_get_charset (¤t))
49 current = "windows-1252";
50
51 while (current) {
52 gchar *utf8_str;
53 gsize bytes_read = 0;
54 gsize bytes_written = 0;
55
56 utf8_str = g_convert (str,
57 str_len,
58 "UTF-8",
59 current,
60 &bytes_read,
61 &bytes_written,
62 NULL);
63 if (utf8_str &&
64 str_len == bytes_read) {
65 g_debug ("Converted %" G_GSIZE_FORMAT " bytes in '%s' codeset "
66 "to %" G_GSIZE_FORMAT " bytes in UTF-8",
67 bytes_read,
68 current,
69 bytes_written);
70 *utf8_len = bytes_written;
71 return utf8_str;
72 }
73 g_free (utf8_str);
74
75 g_debug ("Text not in '%s' encoding", current);
76
77 if (!strcmp (current, "windows-1252") ||
78 !strcmp (current, "UTF-16"))
79 /* If we tried windows-1252 or UTF-16, don't try anything else */
80 current = NULL;
81 else
82 /* If we tried a locale encoding and didn't work, retry with
83 * windows-1252 */
84 current = "windows-1252";
85 }
86
87 return NULL;
88 }
89
90 /* Returns %TRUE if read operation should continue, %FALSE otherwise */
91 static gboolean
92 process_chunk (const gchar *read_bytes,
93 gsize read_size,
94 gsize buffer_size,
95 gsize *remaining_size,
96 GString **s)
97 {
98 /* If no more bytes to read, halt loop */
99 if (read_size == 0) {
100 return FALSE;
101 }
102
103 /* First of all, check if this is the first time we
104 * have tried to read the stream up to the BUFFER_SIZE
105 * limit. Then make sure that we read the maximum size
106 * of the buffer. If we don't do this, there is the
107 * case where we read 10 bytes in and it is just one
108 * line with no '\n'. Once we have confirmed this we
109 * check that the buffer has a '\n' to make sure the
110 * file is worth indexing. Similarly if the file has
111 * <= 3 bytes then we drop it.
112 *
113 * NOTE: We may have non-UTF8 content read (say,
114 * UTF-16LE), so we can't rely on methods which assume
115 * NUL-terminated strings, as g_strstr_len().
116 */
117 if (*s == NULL) {
118 if (read_size <= 3) {
119 g_debug (" File has less than 3 characters in it, "
120 "not indexing file");
121 return FALSE;
122 }
123
124 if (read_size == buffer_size) {
125 const gchar *i;
126 gboolean eol_found = FALSE;
127
128 i = read_bytes;
129 while (i != &read_bytes[read_size - 1]) {
130 if (*i == '\n') {
131 eol_found = TRUE;
132 break;
133 }
134 i++;
135 }
136
137 if (!eol_found) {
138 g_debug (" No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, "
139 "not indexing file",
140 read_size);
141 return FALSE;
142 }
143 }
144 }
145
146 /* Update remaining bytes */
147 *remaining_size -= read_size;
148
149 g_debug (" Read "
150 "%" G_GSSIZE_FORMAT " bytes from file, %" G_GSIZE_FORMAT " "
151 "bytes remaining until configured threshold is reached",
152 read_size,
153 *remaining_size);
154
155 /* Append non-NIL terminated bytes */
156 *s = (*s ?
157 g_string_append_len (*s, read_bytes, read_size) :
158 g_string_new_len (read_bytes, read_size));
159
160 return TRUE;
161 }
162
163 static gchar *
164 process_whole_string (GString *s)
165 {
166 gchar *utf8 = NULL;
167 gsize utf8_len = 0;
168 gsize n_valid_utf8_bytes = 0;
169
170 /* Support also UTF-16 encoded text files, as the ones generated in
171 * Windows OS. We will only accept text files in UTF-16 which come
172 * with a proper BOM. */
173 if (s->len > 2) {
174 GError *error = NULL;
175
176 if (memcmp (s->str, "\xFF\xFE", 2) == 0) {
177 g_debug ("String comes in UTF-16LE, converting");
178 utf8 = g_convert (&(s->str[2]),
179 s->len - 2,
180 "UTF-8",
181 "UTF-16LE",
182 NULL,
183 &utf8_len,
184 &error);
185
186 } else if (memcmp (s->str, "\xFE\xFF", 2) == 0) {
187 g_debug ("String comes in UTF-16BE, converting");
188 utf8 = g_convert (&(s->str[2]),
189 s->len - 2,
190 "UTF-8",
191 "UTF-16BE",
192 NULL,
193 &utf8_len,
194 &error);
195 }
196
197 if (error) {
198 g_warning ("Couldn't convert string from UTF-16 to UTF-8...: %s",
199 error->message);
200 g_error_free (error);
201 g_string_free (s, TRUE);
202 return NULL;
203 }
204 }
205
206 if (!utf8) {
207 utf8_len = s->len;
208 utf8 = g_string_free (s, FALSE);
209 }
210
211 /* Get number of valid UTF-8 bytes found */
212 tracker_text_validate_utf8 (utf8,
213 utf8_len,
214 NULL,
215 &n_valid_utf8_bytes);
216
217 /* A valid UTF-8 file will be that where all read bytes are valid,
218 * with a margin of 3 bytes for the last UTF-8 character which might
219 * have been cut. */
220 if (utf8_len - n_valid_utf8_bytes > 3) {
221 gchar *from_guessed_str;
222 gsize from_guessed_str_len;
223
224 /* If not UTF-8, try to get contents in guessed encoding
225 * (returns valid UTF-8) */
226 from_guessed_str = get_string_from_guessed_encoding (utf8,
227 utf8_len,
228 &from_guessed_str_len);
229 g_free (utf8);
230 if (!from_guessed_str)
231 return NULL;
232 utf8 = from_guessed_str;
233 utf8_len = from_guessed_str_len;
234 } else if (n_valid_utf8_bytes < utf8_len) {
235 g_debug (" Truncating to last valid UTF-8 character "
236 "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
237 n_valid_utf8_bytes,
238 utf8_len);
239 utf8[n_valid_utf8_bytes] = '\0';
240 utf8_len = n_valid_utf8_bytes;
241 }
242
243 if (utf8_len < 1) {
244 g_free (utf8);
245 return NULL;
246 }
247
248 return utf8;
249 }
250
251 /**
252 * tracker_read_text_from_stream:
253 * @stream: input stream to read from
254 * @max_bytes: max number of bytes to read from @stream
255 *
256 * Reads up to @max_bytes from @stream, and validates the read text as proper
257 * UTF-8.
258 *
259 * If the input text is not UTF-8 it will also try to decode it based on the
260 * current locale, or windows-1252, or UTF-16.
261 *
262 * Returns: newly-allocated NUL-terminated UTF-8 string with the read text.
263 **/
264 gchar *
265 tracker_read_text_from_stream (GInputStream *stream,
266 gsize max_bytes)
267 {
268 GString *s = NULL;
269 gsize n_bytes_remaining = max_bytes;
270
271 g_return_val_if_fail (stream, NULL);
272 g_return_val_if_fail (max_bytes > 0, NULL);
273
274 /* Reading in chunks of BUFFER_SIZE
275 * Loop is halted whenever one of this conditions is met:
276 * a) Read bytes reached the maximum allowed (max_bytes)
277 * b) No more bytes to read
278 * c) Error reading
279 * d) Stream has less than 3 bytes
280 * e) Stream has a single line of BUFFER_SIZE bytes with no EOL
281 */
282 while (n_bytes_remaining > 0) {
283 gchar buf[BUFFER_SIZE];
284 GError *error = NULL;
285 gsize n_bytes_read;
286
287 /* Read bytes from stream */
288 if (!g_input_stream_read_all (stream,
289 buf,
290 MIN (BUFFER_SIZE, n_bytes_remaining),
291 &n_bytes_read,
292 NULL,
293 &error)) {
294 g_message ("Error reading from stream: '%s'",
295 error->message);
296 g_error_free (error);
297 break;
298 }
299
300 /* Process read bytes, and halt loop if needed */
301 if (!process_chunk (buf,
302 n_bytes_read,
303 BUFFER_SIZE,
304 &n_bytes_remaining,
305 &s)) {
306 break;
307 }
308 }
309
310 /* Validate UTF-8 if something was read, and return it */
311 return s ? process_whole_string (s) : NULL;
312 }
313
314
315 /**
316 * tracker_read_text_from_fd:
317 * @fd: input fd to read from
318 * @max_bytes: max number of bytes to read from @fd
319 *
320 * Reads up to @max_bytes from @fd, and validates the read text as proper
321 * UTF-8. Will also properly close the FD when finishes.
322 *
323 * If the input text is not UTF-8 it will also try to decode it based on the
324 * current locale, or windows-1252, or UTF-16.
325 *
326 * Returns: newly-allocated NUL-terminated UTF-8 string with the read text.
327 **/
328 gchar *
329 tracker_read_text_from_fd (gint fd,
330 gsize max_bytes)
331 {
332 FILE *fz;
333 GString *s = NULL;
334 gsize n_bytes_remaining = max_bytes;
335
336 g_return_val_if_fail (max_bytes > 0, NULL);
337
338 if ((fz = fdopen (fd, "r")) == NULL) {
339 g_warning ("Cannot read from FD... could not extract text");
340 close (fd);
341 return NULL;
342 }
343
344 /* Reading in chunks of BUFFER_SIZE
345 * Loop is halted whenever one of this conditions is met:
346 * a) Read bytes reached the maximum allowed (max_bytes)
347 * b) No more bytes to read
348 * c) Error reading
349 * d) Stream has less than 3 bytes
350 * e) Stream has a single line of BUFFER_SIZE bytes with no EOL
351 */
352 while (n_bytes_remaining > 0) {
353 gchar buf[BUFFER_SIZE];
354 gsize n_bytes_read;
355
356 /* Read bytes */
357 n_bytes_read = fread (buf,
358 1,
359 MIN (BUFFER_SIZE, n_bytes_remaining),
360 fz);
361
362 /* Process read bytes, and halt loop if needed */
363 if (!process_chunk (buf,
364 n_bytes_read,
365 BUFFER_SIZE,
366 &n_bytes_remaining,
367 &s)) {
368 break;
369 }
370 }
371
372 /* Close the file here */
373 #ifdef HAVE_POSIX_FADVISE
374 posix_fadvise (fd, 0, 0, POSIX_FADV_DONTNEED);
375 #endif /* HAVE_POSIX_FADVISE */
376 fclose (fz);
377
378 /* Validate UTF-8 if something was read, and return it */
379 return s ? process_whole_string (s) : NULL;
380 }