1 /*
  2  * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
  3  *
  4  * This library is free software; you can redistribute it and/or
  5  * modify it under the terms of the GNU Lesser General Public
  6  * License as published by the Free Software Foundation; either
  7  * version 2.1 of the License, or (at your option) any later version.
  8  *
  9  * This library is distributed in the hope that it will be useful,
 10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 12  * Lesser General Public License for more details.
 13  *
 14  * You should have received a copy of the GNU Lesser General Public
 15  * License along with this library; if not, write to the
 16  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 17  * Boston, MA  02110-1301, USA.
 18  */
 19 
 20 #include "config.h"
 21 
 22 #include <string.h>
 23 #include <unistd.h>
 24 #include <fcntl.h>
 25 
 26 #include <glib.h>
 27 #include <gio/gio.h>
 28 
 29 #include <libtracker-extract/tracker-extract.h>
 30 
 31 #include "tracker-read.h"
 32 
 33 /* Size of the buffer to use when reading, in bytes */
 34 #define BUFFER_SIZE 65535
 35 
 36 static gchar *
 37 get_string_from_guessed_encoding (const gchar *str,
 38                                   gsize        str_len,
 39                                   gsize       *utf8_len)
 40 {
 41 	const gchar *current = NULL;
 42 
 43 	/* If we have embedded NULs try UTF-16 directly */
 44 	if (memchr (str, '\0', str_len))
 45 		current = "UTF-16";
 46 	/* If locale charset is UTF-8, try with windows-1252.
 47 	 * NOTE: g_get_charset() returns TRUE if locale charset is UTF-8 */
 48 	else if (g_get_charset (&current))
 49 		current = "windows-1252";
 50 
 51 	while (current) {
 52 		gchar *utf8_str;
 53 		gsize bytes_read = 0;
 54 		gsize bytes_written = 0;
 55 
 56 		utf8_str = g_convert (str,
 57 		                      str_len,
 58 		                      "UTF-8",
 59 		                      current,
 60 		                      &bytes_read,
 61 		                      &bytes_written,
 62 		                      NULL);
 63 		if (utf8_str &&
 64 		    str_len == bytes_read) {
 65 			g_debug ("Converted %" G_GSIZE_FORMAT " bytes in '%s' codeset "
 66 			         "to %" G_GSIZE_FORMAT " bytes in UTF-8",
 67 			         bytes_read,
 68 			         current,
 69 			         bytes_written);
 70 			*utf8_len = bytes_written;
 71 			return utf8_str;
 72 		}
 73 		g_free (utf8_str);
 74 
 75 		g_debug ("Text not in '%s' encoding", current);
 76 
 77 		if (!strcmp (current, "windows-1252") ||
 78 		    !strcmp (current, "UTF-16"))
 79 			/* If we tried windows-1252 or UTF-16, don't try anything else */
 80 			current = NULL;
 81 		else
 82 			/* If we tried a locale encoding and didn't work, retry with
 83 			 * windows-1252 */
 84 			current = "windows-1252";
 85 	}
 86 
 87 	return NULL;
 88 }
 89 
 90 /* Returns %TRUE if read operation should continue, %FALSE otherwise */
 91 static gboolean
 92 process_chunk (const gchar  *read_bytes,
 93                gsize         read_size,
 94                gsize         buffer_size,
 95                gsize        *remaining_size,
 96                GString     **s)
 97 {
 98 	/* If no more bytes to read, halt loop */
 99 	if (read_size == 0) {
100 		return FALSE;
101 	}
102 
103 	/* First of all, check if this is the first time we
104 	 * have tried to read the stream up to the BUFFER_SIZE
105 	 * limit. Then make sure that we read the maximum size
106 	 * of the buffer. If we don't do this, there is the
107 	 * case where we read 10 bytes in and it is just one
108 	 * line with no '\n'. Once we have confirmed this we
109 	 * check that the buffer has a '\n' to make sure the
110 	 * file is worth indexing. Similarly if the file has
111 	 * <= 3 bytes then we drop it.
112 	 *
113 	 * NOTE: We may have non-UTF8 content read (say,
114 	 * UTF-16LE), so we can't rely on methods which assume
115 	 * NUL-terminated strings, as g_strstr_len().
116 	 */
117 	if (*s == NULL) {
118 		if (read_size <= 3) {
119 			g_debug ("  File has less than 3 characters in it, "
120 			         "not indexing file");
121 			return FALSE;
122 		}
123 
124 		if (read_size == buffer_size) {
125 			const gchar *i;
126 			gboolean eol_found = FALSE;
127 
128 			i = read_bytes;
129 			while (i != &read_bytes[read_size - 1]) {
130 				if (*i == '\n') {
131 					eol_found = TRUE;
132 					break;
133 				}
134 				i++;
135 			}
136 
137 			if (!eol_found) {
138 				g_debug ("  No '\\n' in the first %" G_GSSIZE_FORMAT " bytes, "
139 				         "not indexing file",
140 				         read_size);
141 				return FALSE;
142 			}
143 		}
144 	}
145 
146 	/* Update remaining bytes */
147 	*remaining_size -= read_size;
148 
149 	g_debug ("  Read "
150 	         "%" G_GSSIZE_FORMAT " bytes from file, %" G_GSIZE_FORMAT " "
151 	         "bytes remaining until configured threshold is reached",
152 	         read_size,
153 	         *remaining_size);
154 
155 	/* Append non-NIL terminated bytes */
156 	*s = (*s ?
157 	      g_string_append_len (*s, read_bytes, read_size) :
158 	      g_string_new_len (read_bytes, read_size));
159 
160 	return TRUE;
161 }
162 
163 static gchar *
164 process_whole_string (GString  *s)
165 {
166 	gchar *utf8 = NULL;
167 	gsize  utf8_len = 0;
168 	gsize n_valid_utf8_bytes = 0;
169 
170 	/* Support also UTF-16 encoded text files, as the ones generated in
171 	 * Windows OS. We will only accept text files in UTF-16 which come
172 	 * with a proper BOM. */
173 	if (s->len > 2) {
174 		GError *error = NULL;
175 
176 		if (memcmp (s->str, "\xFF\xFE", 2) == 0) {
177 			g_debug ("String comes in UTF-16LE, converting");
178 			utf8 = g_convert (&(s->str[2]),
179 			                  s->len - 2,
180 			                  "UTF-8",
181 			                  "UTF-16LE",
182 			                  NULL,
183 			                  &utf8_len,
184 			                  &error);
185 
186 		} else if (memcmp (s->str, "\xFE\xFF", 2) == 0) {
187 			g_debug ("String comes in UTF-16BE, converting");
188 			utf8 = g_convert (&(s->str[2]),
189 			                  s->len - 2,
190 			                  "UTF-8",
191 			                  "UTF-16BE",
192 			                  NULL,
193 			                  &utf8_len,
194 			                  &error);
195 		}
196 
197 		if (error) {
198 			g_warning ("Couldn't convert string from UTF-16 to UTF-8...: %s",
199 			           error->message);
200 			g_error_free (error);
201 			g_string_free (s, TRUE);
202 			return NULL;
203 		}
204 	}
205 
206 	if (!utf8) {
207 		utf8_len = s->len;
208 		utf8 = g_string_free (s, FALSE);
209 	}
210 
211 	/* Get number of valid UTF-8 bytes found */
212 	tracker_text_validate_utf8 (utf8,
213 	                            utf8_len,
214 	                            NULL,
215 	                            &n_valid_utf8_bytes);
216 
217 	/* A valid UTF-8 file will be that where all read bytes are valid,
218 	 *  with a margin of 3 bytes for the last UTF-8 character which might
219 	 *  have been cut. */
220 	if (utf8_len - n_valid_utf8_bytes > 3) {
221 		gchar *from_guessed_str;
222 		gsize  from_guessed_str_len;
223 
224 		/* If not UTF-8, try to get contents in guessed encoding
225 		 *  (returns valid UTF-8) */
226 		from_guessed_str = get_string_from_guessed_encoding (utf8,
227 		                                                     utf8_len,
228 		                                                     &from_guessed_str_len);
229 		g_free (utf8);
230 		if (!from_guessed_str)
231 			return NULL;
232 		utf8 = from_guessed_str;
233 		utf8_len = from_guessed_str_len;
234 	} else if (n_valid_utf8_bytes < utf8_len) {
235 		g_debug ("  Truncating to last valid UTF-8 character "
236 		         "(%" G_GSSIZE_FORMAT "/%" G_GSSIZE_FORMAT " bytes)",
237 		         n_valid_utf8_bytes,
238 		         utf8_len);
239 		utf8[n_valid_utf8_bytes] = '\0';
240 		utf8_len = n_valid_utf8_bytes;
241 	}
242 
243 	if (utf8_len < 1) {
244 		g_free (utf8);
245 		return NULL;
246 	}
247 
248 	return utf8;
249 }
250 
251 /**
252  * tracker_read_text_from_stream:
253  * @stream: input stream to read from
254  * @max_bytes: max number of bytes to read from @stream
255  *
256  * Reads up to @max_bytes from @stream, and validates the read text as proper
257  *  UTF-8.
258  *
259  * If the input text is not UTF-8 it will also try to decode it based on the
260  * current locale, or windows-1252, or UTF-16.
261  *
262  * Returns: newly-allocated NUL-terminated UTF-8 string with the read text.
263  **/
264 gchar *
265 tracker_read_text_from_stream (GInputStream *stream,
266                                gsize         max_bytes)
267 {
268 	GString *s = NULL;
269 	gsize n_bytes_remaining = max_bytes;
270 
271 	g_return_val_if_fail (stream, NULL);
272 	g_return_val_if_fail (max_bytes > 0, NULL);
273 
274 	/* Reading in chunks of BUFFER_SIZE
275 	 *   Loop is halted whenever one of this conditions is met:
276 	 *     a) Read bytes reached the maximum allowed (max_bytes)
277 	 *     b) No more bytes to read
278 	 *     c) Error reading
279 	 *     d) Stream has less than 3 bytes
280 	 *     e) Stream has a single line of BUFFER_SIZE bytes with no EOL
281 	 */
282 	while (n_bytes_remaining > 0) {
283 		gchar buf[BUFFER_SIZE];
284 		GError *error = NULL;
285 		gsize n_bytes_read;
286 
287 		/* Read bytes from stream */
288 		if (!g_input_stream_read_all (stream,
289 		                              buf,
290 		                              MIN (BUFFER_SIZE, n_bytes_remaining),
291 		                              &n_bytes_read,
292 		                              NULL,
293 		                              &error)) {
294 			g_message ("Error reading from stream: '%s'",
295 			           error->message);
296 			g_error_free (error);
297 			break;
298 		}
299 
300 		/* Process read bytes, and halt loop if needed */
301 		if (!process_chunk (buf,
302 		                    n_bytes_read,
303 		                    BUFFER_SIZE,
304 		                    &n_bytes_remaining,
305 		                    &s)) {
306 			break;
307 		}
308 	}
309 
310 	/* Validate UTF-8 if something was read, and return it */
311 	return s ? process_whole_string (s) : NULL;
312 }
313 
314 
315 /**
316  * tracker_read_text_from_fd:
317  * @fd: input fd to read from
318  * @max_bytes: max number of bytes to read from @fd
319  *
320  * Reads up to @max_bytes from @fd, and validates the read text as proper
321  *  UTF-8. Will also properly close the FD when finishes.
322  *
323  * If the input text is not UTF-8 it will also try to decode it based on the
324  * current locale, or windows-1252, or UTF-16.
325  *
326  * Returns: newly-allocated NUL-terminated UTF-8 string with the read text.
327  **/
328 gchar *
329 tracker_read_text_from_fd (gint  fd,
330                            gsize max_bytes)
331 {
332 	FILE *fz;
333 	GString *s = NULL;
334 	gsize n_bytes_remaining = max_bytes;
335 
336 	g_return_val_if_fail (max_bytes > 0, NULL);
337 
338 	if ((fz = fdopen (fd, "r")) == NULL) {
339 		g_warning ("Cannot read from FD... could not extract text");
340 		close (fd);
341 		return NULL;
342 	}
343 
344 	/* Reading in chunks of BUFFER_SIZE
345 	 *   Loop is halted whenever one of this conditions is met:
346 	 *     a) Read bytes reached the maximum allowed (max_bytes)
347 	 *     b) No more bytes to read
348 	 *     c) Error reading
349 	 *     d) Stream has less than 3 bytes
350 	 *     e) Stream has a single line of BUFFER_SIZE bytes with no EOL
351 	 */
352 	while (n_bytes_remaining > 0) {
353 		gchar buf[BUFFER_SIZE];
354 		gsize n_bytes_read;
355 
356 		/* Read bytes */
357 		n_bytes_read = fread (buf,
358 		                      1,
359 		                      MIN (BUFFER_SIZE, n_bytes_remaining),
360 		                      fz);
361 
362 		/* Process read bytes, and halt loop if needed */
363 		if (!process_chunk (buf,
364 		                    n_bytes_read,
365 		                    BUFFER_SIZE,
366 		                    &n_bytes_remaining,
367 		                    &s)) {
368 			break;
369 		}
370 	}
371 
372 	/* Close the file here */
373 #ifdef HAVE_POSIX_FADVISE
374 	posix_fadvise (fd, 0, 0, POSIX_FADV_DONTNEED);
375 #endif /* HAVE_POSIX_FADVISE */
376 	fclose (fz);
377 
378 	/* Validate UTF-8 if something was read, and return it */
379 	return s ? process_whole_string (s) : NULL;
380 }
tracker-0.16.2/src/tracker-extract/tracker-read.c