1 /*
2 * Copyright (C) 2007, Jason Kivlighn <jkivlighn@gmail.com>
3 * Copyright (C) 2008-2009, Nokia <ivan.frade@nokia.com>
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the
17 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 */
20
21 #include "config.h"
22
23 #include <string.h>
24
25 #include <glib.h>
26
27 #include <libxml/HTMLparser.h>
28 #include <libtracker-common/tracker-utils.h>
29 #include <libtracker-extract/tracker-extract.h>
30
31 #include "tracker-main.h"
32
/* What the character-data callback should do with the text it receives.
 *
 * READ_TITLE:  text belongs to a <title> element and is collected as the
 *              document title.
 * READ_IGNORE: text belongs to a <script> element and is discarded.
 *
 * Any other value (the parser uses -1) means "no special element is open".
 */
typedef enum {
	READ_TITLE  = 0,
	READ_IGNORE = 1
} tag_type;
37
38 typedef struct {
39 TrackerSparqlBuilder *metadata;
40 tag_type current;
41 guint in_body : 1;
42 GString *title;
43 GString *plain_text;
44 guint n_bytes_remaining;
45 } parser_data;
46
47 static gboolean
48 has_attribute (const gchar **attrs,
49 const gchar *attr,
50 const gchar *val)
51 {
52 gint i;
53
54 if (!attrs || !attr || !val) {
55 return FALSE;
56 }
57
58 for (i = 0; attrs[i] && attrs[i + 1]; i += 2) {
59 if (g_ascii_strcasecmp (attrs[i], attr) == 0) {
60 if (g_ascii_strcasecmp (attrs[i + 1], val) == 0) {
61 return TRUE;
62 }
63 }
64 }
65
66 return FALSE;
67 }
68
69 static const xmlChar *
70 lookup_attribute (const gchar **attrs,
71 const gchar *attr)
72 {
73 gint i;
74
75 if (!attrs || !attr) {
76 return NULL;
77 }
78
79 for (i = 0; attrs[i] && attrs[i + 1]; i += 2) {
80 if (g_ascii_strcasecmp (attrs[i], attr) == 0) {
81 return attrs[i + 1];
pointer targets in return differ in signedness
(emitted by gcc)
82 }
83 }
84
85 return NULL;
86 }
87
88 static void
89 parser_start_element (void *data,
90 const xmlChar *name_,
91 const xmlChar **attrs_)
92 {
93 parser_data *pd = data;
94 const gchar *name = (const gchar*) name_;
95 const gchar **attrs = (const gchar**) attrs_;
96
97 if (!pd || !name) {
98 return;
99 }
100
101 /* Look for RDFa triple describing the license */
102 if (g_ascii_strcasecmp (name, "a") == 0) {
103 /* This tag is a license. Ignore, however, if it is
104 * referring to another document.
105 */
106 if (has_attribute (attrs, "rel", "license") &&
107 has_attribute (attrs, "about", NULL) == FALSE) {
108 const xmlChar *href;
109
110 href = lookup_attribute (attrs, "href");
111
112 if (href) {
113 tracker_sparql_builder_predicate (pd->metadata, "nie:license");
114 tracker_sparql_builder_object_unvalidated (pd->metadata, href);
pointer targets in passing argument 2 of 'tracker_sparql_builder_object_unvalidated' differ in signedness
(emitted by gcc)
115 }
116 }
117 } else if (g_ascii_strcasecmp (name, "title") == 0) {
118 pd->current = READ_TITLE;
119 } else if (g_ascii_strcasecmp (name, "meta") == 0) {
120 if (has_attribute (attrs, "name", "author")) {
121 const xmlChar *author;
122
123 author = lookup_attribute (attrs, "content");
124
125 if (author) {
126 tracker_sparql_builder_predicate (pd->metadata, "nco:creator");
127 tracker_sparql_builder_object_blank_open (pd->metadata);
128 tracker_sparql_builder_predicate (pd->metadata, "a");
129 tracker_sparql_builder_object (pd->metadata, "nco:Contact");
130 tracker_sparql_builder_predicate (pd->metadata, "nco:fullname");
131 tracker_sparql_builder_object_unvalidated (pd->metadata, author);
132 tracker_sparql_builder_object_blank_close (pd->metadata);
133 }
134 }
135
136 if (has_attribute (attrs, "name", "description")) {
137 const xmlChar *desc;
138
139 desc = lookup_attribute (attrs,"content");
140
141 if (desc) {
142 tracker_sparql_builder_predicate (pd->metadata, "nie:description");
143 tracker_sparql_builder_object_unvalidated (pd->metadata, desc);
144 }
145 }
146
147 if (has_attribute (attrs, "name", "keywords")) {
148 const xmlChar* content = lookup_attribute (attrs, "content");
149
150 if (content) {
151 gchar **keywords;
152 gint i;
153
154 keywords = g_strsplit (content, ",", -1);
155 if (keywords) {
156 for (i = 0; keywords[i] != NULL; i++) {
157 if (!keywords[i] || keywords[i] == '\0') {
158 continue;
159 }
160
161 tracker_sparql_builder_predicate (pd->metadata, "nie:keyword");
162 tracker_sparql_builder_object_unvalidated (pd->metadata, g_strstrip (keywords[i]));
163 }
164
165 g_strfreev (keywords);
166 }
167 }
168 }
169 } else if (g_ascii_strcasecmp (name, "body") == 0) {
170 pd->in_body = TRUE;
171 } else if (g_ascii_strcasecmp (name, "script") == 0) {
172 /* Ignore javascript and such */
173 pd->current = READ_IGNORE;
174 }
175 }
176
177 static void
178 parser_end_element (void *data,
179 const xmlChar *name_)
180 {
181 parser_data *pd = data;
182 const gchar *name = (const gchar*) name_;
183
184 if (g_ascii_strcasecmp (name, "title") == 0 ||
185 g_ascii_strcasecmp (name, "script") == 0) {
186 pd->current = -1;
187 }
188 }
189
190 static void
191 parser_characters (void *data,
192 const xmlChar *ch,
193 int len)
194 {
195 parser_data *pd = data;
196
197 switch (pd->current) {
198 case READ_TITLE:
199 g_string_append (pd->title, ch);
pointer targets in passing argument 2 of 'g_string_append' differ in signedness
(emitted by gcc)
200 break;
201 case READ_IGNORE:
202 break;
203 default:
204 if (pd->in_body && pd->n_bytes_remaining > 0) {
205 gsize text_len;
206
207 text_len = strlen (ch);
208
209 if (tracker_text_validate_utf8 (ch,
210 (pd->n_bytes_remaining < text_len ?
211 pd->n_bytes_remaining :
212 text_len),
213 &pd->plain_text,
214 NULL)) {
215 /* In the case of HTML, each string arriving this
216 * callback is independent to any other previous
217 * string, so need to add an explicit whitespace
218 * separator */
219 g_string_append_c (pd->plain_text, ' ');
220 }
221
222 if (pd->n_bytes_remaining > text_len) {
223 pd->n_bytes_remaining -= text_len;
224 } else {
225 pd->n_bytes_remaining = 0;
226 }
227 }
228 break;
229 }
230 }
231
232 G_MODULE_EXPORT gboolean
233 tracker_extract_get_metadata (TrackerExtractInfo *info)
234 {
235 TrackerSparqlBuilder *metadata;
236 GFile *file;
237 TrackerConfig *config;
238 htmlDocPtr doc;
239 parser_data pd;
240 gchar *filename;
241 xmlSAXHandler handler = {
242 NULL, /* internalSubset */
243 NULL, /* isStandalone */
244 NULL, /* hasInternalSubset */
245 NULL, /* hasExternalSubset */
246 NULL, /* resolveEntity */
247 NULL, /* getEntity */
248 NULL, /* entityDecl */
249 NULL, /* notationDecl */
250 NULL, /* attributeDecl */
251 NULL, /* elementDecl */
252 NULL, /* unparsedEntityDecl */
253 NULL, /* setDocumentLocator */
254 NULL, /* startDocument */
255 NULL, /* endDocument */
256 parser_start_element, /* startElement */
257 parser_end_element, /* endElement */
258 NULL, /* reference */
259 parser_characters, /* characters */
260 NULL, /* ignorableWhitespace */
261 NULL, /* processingInstruction */
262 NULL, /* comment */
263 NULL, /* xmlParserWarning */
264 NULL, /* xmlParserError */
265 NULL, /* xmlParserError */
266 NULL, /* getParameterEntity */
267 NULL, /* cdataBlock */
268 NULL, /* externalSubset */
269 1, /* initialized */
270 NULL, /* private */
271 NULL, /* startElementNsSAX2Func */
272 NULL, /* endElementNsSAX2Func */
273 NULL /* xmlStructuredErrorFunc */
274 };
275
276 metadata = tracker_extract_info_get_metadata_builder (info);
277 file = tracker_extract_info_get_file (info);
278
279 tracker_sparql_builder_predicate (metadata, "a");
280 tracker_sparql_builder_object (metadata, "nfo:HtmlDocument");
281
282 pd.metadata = metadata;
283 pd.current = -1;
284 pd.in_body = FALSE;
285 pd.plain_text = g_string_new (NULL);
286 pd.title = g_string_new (NULL);
287
288 config = tracker_main_get_config ();
289 pd.n_bytes_remaining = tracker_config_get_max_bytes (config);
290
291 filename = g_file_get_path (file);
292 doc = htmlSAXParseFile (filename, NULL, &handler, &pd);
293 g_free (filename);
294
295 if (doc) {
296 xmlFreeDoc (doc);
297 }
298
299 g_strstrip (pd.plain_text->str);
300 g_strstrip (pd.title->str);
301
302 if (pd.title->str &&
303 *pd.title->str != '\0') {
304 tracker_sparql_builder_predicate (metadata, "nie:title");
305 tracker_sparql_builder_object_unvalidated (metadata, pd.title->str);
306 }
307
308 if (pd.plain_text->str &&
309 *pd.plain_text->str != '\0') {
310 tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
311 tracker_sparql_builder_object_unvalidated (metadata, pd.plain_text->str);
312 }
313
314 g_string_free (pd.plain_text, TRUE);
315 g_string_free (pd.title, TRUE);
316
317 return TRUE;
318 }