tracker-0.16.2/src/tracker-extract/tracker-extract-html.c

Location Tool Test ID Function Issue
tracker-extract-html.c:81:4 gcc pointer-sign lookup_attribute pointer targets in return differ in signedness
tracker-extract-html.c:114:5 gcc pointer-sign parser_start_element pointer targets in passing argument 2 of 'tracker_sparql_builder_object_unvalidated' differ in signedness
tracker-extract-html.c:199:3 gcc pointer-sign parser_characters pointer targets in passing argument 2 of 'g_string_append' differ in signedness
  1 /*
  2  * Copyright (C) 2007, Jason Kivlighn <jkivlighn@gmail.com>
  3  * Copyright (C) 2008-2009, Nokia <ivan.frade@nokia.com>
  4  *
  5  * This library is free software; you can redistribute it and/or
  6  * modify it under the terms of the GNU Lesser General Public
  7  * License as published by the Free Software Foundation; either
  8  * version 2.1 of the License, or (at your option) any later version.
  9  *
 10  * This library is distributed in the hope that it will be useful,
 11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 13  * Lesser General Public License for more details.
 14  *
 15  * You should have received a copy of the GNU Lesser General Public
 16  * License along with this library; if not, write to the
 17  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 18  * Boston, MA  02110-1301, USA.
 19  */
 20 
 21 #include "config.h"
 22 
 23 #include <string.h>
 24 
 25 #include <glib.h>
 26 
 27 #include <libxml/HTMLparser.h>
 28 #include <libtracker-common/tracker-utils.h>
 29 #include <libtracker-extract/tracker-extract.h>
 30 
 31 #include "tracker-main.h"
 32 
 33 typedef enum {
 34 	READ_TITLE,
 35 	READ_IGNORE
 36 } tag_type;
 37 
 38 typedef struct {
 39 	TrackerSparqlBuilder *metadata;
 40 	tag_type current;
 41 	guint in_body : 1;
 42 	GString *title;
 43 	GString *plain_text;
 44 	guint n_bytes_remaining;
 45 } parser_data;
 46 
 47 static gboolean
 48 has_attribute (const gchar **attrs,
 49                const gchar  *attr,
 50                const gchar  *val)
 51 {
 52 	gint i;
 53 
 54 	if (!attrs || !attr || !val) {
 55 		return FALSE;
 56 	}
 57 
 58 	for (i = 0; attrs[i] && attrs[i + 1]; i += 2) {
 59 		if (g_ascii_strcasecmp (attrs[i], attr) == 0) {
 60 			if (g_ascii_strcasecmp (attrs[i + 1], val) == 0) {
 61 				return TRUE;
 62 			}
 63 		}
 64 	}
 65 
 66 	return FALSE;
 67 }
 68 
 69 static const xmlChar *
 70 lookup_attribute (const gchar **attrs,
 71                   const gchar  *attr)
 72 {
 73 	gint i;
 74 
 75 	if (!attrs || !attr) {
 76 		return NULL;
 77 	}
 78 
 79 	for (i = 0; attrs[i] && attrs[i + 1]; i += 2) {
 80 		if (g_ascii_strcasecmp (attrs[i], attr) == 0) {
 81 			return attrs[i + 1];
pointer targets in return differ in signedness
(emitted by gcc)
82 } 83 } 84 85 return NULL; 86 } 87 88 static void 89 parser_start_element (void *data, 90 const xmlChar *name_, 91 const xmlChar **attrs_) 92 { 93 parser_data *pd = data; 94 const gchar *name = (const gchar*) name_; 95 const gchar **attrs = (const gchar**) attrs_; 96 97 if (!pd || !name) { 98 return; 99 } 100 101 /* Look for RDFa triple describing the license */ 102 if (g_ascii_strcasecmp (name, "a") == 0) { 103 /* This tag is a license. Ignore, however, if it is 104 * referring to another document. 105 */ 106 if (has_attribute (attrs, "rel", "license") && 107 has_attribute (attrs, "about", NULL) == FALSE) { 108 const xmlChar *href; 109 110 href = lookup_attribute (attrs, "href"); 111 112 if (href) { 113 tracker_sparql_builder_predicate (pd->metadata, "nie:license"); 114 tracker_sparql_builder_object_unvalidated (pd->metadata, href);
pointer targets in passing argument 2 of 'tracker_sparql_builder_object_unvalidated' differ in signedness
(emitted by gcc)
115 } 116 } 117 } else if (g_ascii_strcasecmp (name, "title") == 0) { 118 pd->current = READ_TITLE; 119 } else if (g_ascii_strcasecmp (name, "meta") == 0) { 120 if (has_attribute (attrs, "name", "author")) { 121 const xmlChar *author; 122 123 author = lookup_attribute (attrs, "content"); 124 125 if (author) { 126 tracker_sparql_builder_predicate (pd->metadata, "nco:creator"); 127 tracker_sparql_builder_object_blank_open (pd->metadata); 128 tracker_sparql_builder_predicate (pd->metadata, "a"); 129 tracker_sparql_builder_object (pd->metadata, "nco:Contact"); 130 tracker_sparql_builder_predicate (pd->metadata, "nco:fullname"); 131 tracker_sparql_builder_object_unvalidated (pd->metadata, author); 132 tracker_sparql_builder_object_blank_close (pd->metadata); 133 } 134 } 135 136 if (has_attribute (attrs, "name", "description")) { 137 const xmlChar *desc; 138 139 desc = lookup_attribute (attrs,"content"); 140 141 if (desc) { 142 tracker_sparql_builder_predicate (pd->metadata, "nie:description"); 143 tracker_sparql_builder_object_unvalidated (pd->metadata, desc); 144 } 145 } 146 147 if (has_attribute (attrs, "name", "keywords")) { 148 const xmlChar* content = lookup_attribute (attrs, "content"); 149 150 if (content) { 151 gchar **keywords; 152 gint i; 153 154 keywords = g_strsplit (content, ",", -1); 155 if (keywords) { 156 for (i = 0; keywords[i] != NULL; i++) { 157 if (!keywords[i] || keywords[i] == '\0') { 158 continue; 159 } 160 161 tracker_sparql_builder_predicate (pd->metadata, "nie:keyword"); 162 tracker_sparql_builder_object_unvalidated (pd->metadata, g_strstrip (keywords[i])); 163 } 164 165 g_strfreev (keywords); 166 } 167 } 168 } 169 } else if (g_ascii_strcasecmp (name, "body") == 0) { 170 pd->in_body = TRUE; 171 } else if (g_ascii_strcasecmp (name, "script") == 0) { 172 /* Ignore javascript and such */ 173 pd->current = READ_IGNORE; 174 } 175 } 176 177 static void 178 parser_end_element (void *data, 179 const xmlChar *name_) 180 { 181 parser_data *pd = data; 182 const gchar *name = (const gchar*) name_; 183 184 if (g_ascii_strcasecmp (name, "title") == 0 || 185 g_ascii_strcasecmp (name, "script") == 0) { 186 pd->current = -1; 187 } 188 } 189 190 static void 191 parser_characters (void *data, 192 const xmlChar *ch, 193 int len) 194 { 195 parser_data *pd = data; 196 197 switch (pd->current) { 198 case READ_TITLE: 199 g_string_append (pd->title, ch);
pointer targets in passing argument 2 of 'g_string_append' differ in signedness
(emitted by gcc)
200 break; 201 case READ_IGNORE: 202 break; 203 default: 204 if (pd->in_body && pd->n_bytes_remaining > 0) { 205 gsize text_len; 206 207 text_len = strlen (ch); 208 209 if (tracker_text_validate_utf8 (ch, 210 (pd->n_bytes_remaining < text_len ? 211 pd->n_bytes_remaining : 212 text_len), 213 &pd->plain_text, 214 NULL)) { 215 /* In the case of HTML, each string arriving this 216 * callback is independent to any other previous 217 * string, so need to add an explicit whitespace 218 * separator */ 219 g_string_append_c (pd->plain_text, ' '); 220 } 221 222 if (pd->n_bytes_remaining > text_len) { 223 pd->n_bytes_remaining -= text_len; 224 } else { 225 pd->n_bytes_remaining = 0; 226 } 227 } 228 break; 229 } 230 } 231 232 G_MODULE_EXPORT gboolean 233 tracker_extract_get_metadata (TrackerExtractInfo *info) 234 { 235 TrackerSparqlBuilder *metadata; 236 GFile *file; 237 TrackerConfig *config; 238 htmlDocPtr doc; 239 parser_data pd; 240 gchar *filename; 241 xmlSAXHandler handler = { 242 NULL, /* internalSubset */ 243 NULL, /* isStandalone */ 244 NULL, /* hasInternalSubset */ 245 NULL, /* hasExternalSubset */ 246 NULL, /* resolveEntity */ 247 NULL, /* getEntity */ 248 NULL, /* entityDecl */ 249 NULL, /* notationDecl */ 250 NULL, /* attributeDecl */ 251 NULL, /* elementDecl */ 252 NULL, /* unparsedEntityDecl */ 253 NULL, /* setDocumentLocator */ 254 NULL, /* startDocument */ 255 NULL, /* endDocument */ 256 parser_start_element, /* startElement */ 257 parser_end_element, /* endElement */ 258 NULL, /* reference */ 259 parser_characters, /* characters */ 260 NULL, /* ignorableWhitespace */ 261 NULL, /* processingInstruction */ 262 NULL, /* comment */ 263 NULL, /* xmlParserWarning */ 264 NULL, /* xmlParserError */ 265 NULL, /* xmlParserError */ 266 NULL, /* getParameterEntity */ 267 NULL, /* cdataBlock */ 268 NULL, /* externalSubset */ 269 1, /* initialized */ 270 NULL, /* private */ 271 NULL, /* startElementNsSAX2Func */ 272 NULL, /* endElementNsSAX2Func */ 273 NULL /* xmlStructuredErrorFunc */ 274 }; 275 276 metadata = tracker_extract_info_get_metadata_builder (info); 277 file = tracker_extract_info_get_file (info); 278 279 tracker_sparql_builder_predicate (metadata, "a"); 280 tracker_sparql_builder_object (metadata, "nfo:HtmlDocument"); 281 282 pd.metadata = metadata; 283 pd.current = -1; 284 pd.in_body = FALSE; 285 pd.plain_text = g_string_new (NULL); 286 pd.title = g_string_new (NULL); 287 288 config = tracker_main_get_config (); 289 pd.n_bytes_remaining = tracker_config_get_max_bytes (config); 290 291 filename = g_file_get_path (file); 292 doc = htmlSAXParseFile (filename, NULL, &handler, &pd); 293 g_free (filename); 294 295 if (doc) { 296 xmlFreeDoc (doc); 297 } 298 299 g_strstrip (pd.plain_text->str); 300 g_strstrip (pd.title->str); 301 302 if (pd.title->str && 303 *pd.title->str != '\0') { 304 tracker_sparql_builder_predicate (metadata, "nie:title"); 305 tracker_sparql_builder_object_unvalidated (metadata, pd.title->str); 306 } 307 308 if (pd.plain_text->str && 309 *pd.plain_text->str != '\0') { 310 tracker_sparql_builder_predicate (metadata, "nie:plainTextContent"); 311 tracker_sparql_builder_object_unvalidated (metadata, pd.plain_text->str); 312 } 313 314 g_string_free (pd.plain_text, TRUE); 315 g_string_free (pd.title, TRUE); 316 317 return TRUE; 318 }