LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
1/***************************************************************************
2 ofx_preproc.cpp
3 -------------------
4 copyright : (C) 2002 by Benoit Grégoire
5 email : benoitg@coeus.ca
6***************************************************************************/
12/***************************************************************************
13 * *
14 * This program is free software; you can redistribute it and/or modify *
15 * it under the terms of the GNU General Public License as published by *
16 * the Free Software Foundation; either version 2 of the License, or *
17 * (at your option) any later version. *
18 * *
19 ***************************************************************************/
#include "../config.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
#include <sstream>
#include <string>
#ifdef _WIN32
# include <io.h> // for _close()
#else
# include <unistd.h> // for close()
#endif
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
37
38#ifdef _WIN32
39# define DIRSEP "\\"
40#else
41# define DIRSEP "/"
42#endif
43
44#ifdef _WIN32
45# include "win32.hh"
46# include <windows.h> // for GetModuleFileName()
47# undef ERROR
48# undef DELETE
49#endif
50
51#define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
52#define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
53
/** The number of different paths to search for DTDs.
 *
 * One extra slot is reserved when the build system supplies
 * MAKEFILE_DTD_PATH; the value must match the number of entries in
 * DTD_SEARCH_PATH below.
 */
#ifdef MAKEFILE_DTD_PATH
const int DTD_SEARCH_PATH_NUM = 4;
#else
const int DTD_SEARCH_PATH_NUM = 3;
#endif

/** The list of paths to search for the DTDs.
 *
 * find_dtd() tries these directories in order, after the context's DTD
 * directory and the OFX_DTD_PATH environment variable.
 *
 * NOTE(review): the "~" entry is handed to std::ifstream unchanged by
 * find_dtd(); no tilde expansion is performed in this file, so it
 * presumably only matches a directory literally named "~" relative to
 * the current working directory -- confirm the intent.
 */
const char *DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM] =
{
#ifdef MAKEFILE_DTD_PATH
  MAKEFILE_DTD_PATH,
#endif
  "/usr/local/share/libofx/dtd",
  "/usr/share/libofx/dtd",
  "~"
};
75
80int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
81{
82 LibofxContext *libofx_context;
83 bool ofx_start = false;
84 bool ofx_end = false;
85 bool file_is_xml = false;
86 bool used_iconv = false;
87 std::ifstream input_file;
88 std::ofstream tmp_file;
89 char *filenames[3];
90 char tmp_filename[256];
91 int tmp_file_fd;
92#ifdef HAVE_ICONV
93 iconv_t conversion_descriptor;
94#endif
95 libofx_context = (LibofxContext*)ctx;
96
97 if (p_filename != NULL && strcmp(p_filename, "") != 0)
98 {
99 message_out(DEBUG, std::string("ofx_proc_file():Opening file: ") + p_filename);
100
101 input_file.open(p_filename);
102 if (!input_file)
103 {
104 message_out(ERROR, "ofx_proc_file():Unable to open the input file " + std::string(p_filename));
105 }
106
107 mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
108
109 message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + std::string(tmp_filename));
110#ifdef _WIN32
111 tmp_file_fd = mkstemp_win32(tmp_filename);
112#else
113 tmp_file_fd = mkstemp(tmp_filename);
114#endif
115 if (tmp_file_fd)
116 {
117 tmp_file.open(tmp_filename);
118 if (!tmp_file)
119 {
120 message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + std::string(tmp_filename));
121 return -1;
122 }
123 }
124 else
125 {
126 message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + std::string(tmp_filename));
127 return -1;
128 }
129
130 if (input_file && tmp_file)
131 {
132 std::size_t header_separator_idx;
133 std::string header_name;
134 std::string header_value;
135 std::string ofx_encoding;
136 std::string ofx_charset;
137 do
138 {
139 std::stringbuf buffer;
140 std::string s_buffer;
141 input_file.get(buffer, '\n');
142 //cout<< "got: \"" << buffer<<"\"\n";
143 s_buffer = buffer.str();
144
145 // Watch out: If input_file is in eof(), any subsequent read or
146 // peek() will fail and we must exit this loop.
147 if (!input_file.eof())
148 {
149 //cout<<"input_file.gcount(): "<<input_file.gcount()<< " s_buffer.size=" << s_buffer.size()<<" sizeof(buffer): "<<sizeof(buffer) << " peek=\"" << int(input_file.peek()) << "\"" <<endl;
150 if (input_file.fail()) // If no characters were extracted above, the failbit is set.
151 {
152 // No characters extracted means that we've reached the newline
153 // delimiter (because we already checked for EOF). We will check
154 // for and remove that newline in the next if-clause, but must
155 // remove the failbit so that peek() will work again.
156 input_file.clear();
157 }
158
159 // Is the next character really the newline?
160 if (input_file.peek() == '\n')
161 {
162 // Yes. Then discard that newline character from the stream
163 input_file.get();
164 }
165 }
166
167 if (ofx_start == false && (s_buffer.find("<?xml") != std::string::npos))
168 {
169 message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
170 file_is_xml = true;
171 }
172
173 std::size_t ofx_start_idx;
174 if (ofx_start == false)
175 {
176 if (
177 (libofx_context->currentFileType() == OFX &&
178 ((ofx_start_idx = s_buffer.find("<OFX>")) != std::string::npos ||
179 (ofx_start_idx = s_buffer.find("<ofx>")) != std::string::npos))
180 ||
181 (libofx_context->currentFileType() == OFC &&
182 ((ofx_start_idx = s_buffer.find("<OFC>")) != std::string::npos ||
183 (ofx_start_idx = s_buffer.find("<ofc>")) != std::string::npos))
184 )
185 {
186 ofx_start = true;
187 if (file_is_xml == false)
188 {
189 s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
190 }
191 message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
192
193 static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
194 if (putenv(sp_charset_fixed) != 0)
195 {
196 message_out(ERROR, "ofx_proc_file(): putenv failed");
197 }
198#define OPENSP_UTF8_WARNING_TEXT "ofx_proc_file(): OpenSP cannot process an UTF-8 XML file without garbling it. Furthermore, on windows the support for UTF-8 encode SGML files is broken. This is worked around by forcing a single byte encoding. If the file is indeed UTF-8, it should pass through unmolested, but you will likely get 'non SGML character number' errors, even though the output is correct."
199 if (file_is_xml == true)
200 {
201 /* Normally the following would be "SP_ENCODING=xml".
202 * Unfortunately, opensp's generic api will garble UTF-8 if this
203 * is set to xml. So we set a single byte encoding that uses most
204 * values to avoid messing up the UTF-8.
205 * Unfortunately this means that non-UTF-8 files will not
206 * get properly translated. We'd need to manually detect the
207 * encoding in the XML header and convert the xml with iconv like
208 * we do for SGML to work around the problem. Most unfortunate. */
209 message_out(WARNING, OPENSP_UTF8_WARNING_TEXT);
210 static char sp_encoding[] = "SP_ENCODING=ms-dos";
211 if (putenv(sp_encoding) != 0)
212 {
213 message_out(ERROR, "ofx_proc_file(): putenv failed");
214 }
215 }
216 else
217 {
218 static char sp_encoding[] = "SP_ENCODING=ms-dos"; // Like the above, force a single byte encoding in every case, we don't want opensp messing up UTF-8
219 if (putenv(sp_encoding) != 0)
220 {
221 message_out(ERROR, "ofx_proc_file(): putenv failed");
222 }
223#ifdef HAVE_ICONV
224 std::string fromcode;
225 std::string tocode;
226 if (ofx_encoding.compare("USASCII") == 0)
227 {
228 if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
229 {
230 //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
231 fromcode = "ISO-8859-1";
232 }
233 else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
234 {
235 //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
236 fromcode = "CP1252";
237 }
238 else if (ofx_charset.compare("NONE") == 0)
239 {
240 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
241 }
242 else
243 {
244 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
245 }
246 }
247 else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
248 {
249 //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
250 fromcode = "UTF-8";
251 message_out(WARNING, OPENSP_UTF8_WARNING_TEXT);
252 }
253 else
254 {
255 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
256 }
257 tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
258 message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
259 conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
260 used_iconv = true;
261#endif
262 }
263 }
264 else
265 {
266 //We are still in the headers
267 if ((header_separator_idx = s_buffer.find(':')) != std::string::npos)
268 {
269 //Header processing
270 header_name.assign(s_buffer.substr(0, header_separator_idx));
271 header_value.assign(s_buffer.substr(header_separator_idx + 1));
272 while ( header_value.length() > 0 &&
273 ( header_value[header_value.length() - 1 ] == '\n' ||
274 header_value[header_value.length() - 1 ] == '\r' ))
275 header_value.erase(header_value.length() - 1);
276 message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
277 if (header_name.compare("ENCODING") == 0)
278 {
279 ofx_encoding.assign(header_value);
280 }
281 if (header_name.compare("CHARSET") == 0)
282 {
283 ofx_charset.assign(header_value);
284 }
285 }
286 }
287 }
288
289 if (file_is_xml == true || (ofx_start == true && ofx_end == false))
290 {
291 if (ofx_start == true)
292 {
293 /* The above test won't help us if the <OFX> tag is on the same line
294 * as the xml header, but as opensp can't be used to parse it anyway
295 * this isn't a great loss for now.
296 */
297 s_buffer = sanitize_proprietary_tags(s_buffer);
298 if (s_buffer.empty())
299 continue;
300 }
301 //cout<< s_buffer<<"\n";
302 if (file_is_xml == false)
303 {
304#ifdef HAVE_ICONV
305 size_t inbytesleft = s_buffer.size();
306 size_t outbytesleft = inbytesleft * 2 - 1;
307 char * iconv_buffer = (char*) malloc (inbytesleft * 2);
308 memset(iconv_buffer, 0, inbytesleft * 2);
309 const char* inchar = s_buffer.c_str();
310 char * outchar = iconv_buffer;
311 int iconv_retval = iconv (conversion_descriptor,
312#ifdef HAVE_ICONV_CONST
313 &inchar,
314#else
315 const_cast<char**>(&inchar),
316#endif
317 &inbytesleft, &outchar, &outbytesleft);
318 if (iconv_retval == -1)
319 {
320 message_out(ERROR, "ofx_proc_file(): Iconv conversion error");
321 }
322 // All validly converted bytes will be copied to the
323 // original buffer
324 s_buffer = std::string(iconv_buffer, outchar - iconv_buffer);
325 free (iconv_buffer);
326#endif
327 }
328 //cout << s_buffer << "\n";
329 tmp_file << s_buffer << std::endl;
330 }
331
332 if (ofx_start == true &&
333 (
334 (libofx_context->currentFileType() == OFX &&
335 ((ofx_start_idx = s_buffer.find("</OFX>")) != std::string::npos ||
336 (ofx_start_idx = s_buffer.find("</ofx>")) != std::string::npos))
337 || (libofx_context->currentFileType() == OFC &&
338 ((ofx_start_idx = s_buffer.find("</OFC>")) != std::string::npos ||
339 (ofx_start_idx = s_buffer.find("</ofc>")) != std::string::npos))
340 )
341 )
342 {
343 ofx_end = true;
344 message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC> has been found");
345 }
346
347 }
348 while (!input_file.eof() && !input_file.bad());
349 }
350 input_file.close();
351 tmp_file.close();
352#ifdef HAVE_ICONV
353 if (used_iconv == true)
354 {
355 iconv_close(conversion_descriptor);
356 }
357#endif
358 char filename_openspdtd[255];
359 char filename_dtd[255];
360 char filename_ofx[255];
361 STRNCPY(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME)); //The opensp sgml dtd file
362 if (libofx_context->currentFileType() == OFX)
363 {
364 STRNCPY(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME)); //The ofx dtd file
365 }
366 else if (libofx_context->currentFileType() == OFC)
367 {
368 STRNCPY(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME)); //The ofc dtd file
369 }
370 else
371 {
372 message_out(ERROR, std::string("ofx_proc_file(): Error unknown file format for the OFX parser"));
373 }
374
375 if ((std::string)filename_dtd != "" && (std::string)filename_openspdtd != "")
376 {
377 strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
378 filenames[0] = filename_openspdtd;
379 filenames[1] = filename_dtd;
380 filenames[2] = filename_ofx;
381 int rv;
382 if (libofx_context->currentFileType() == OFX)
383 {
384 rv = ofx_proc_sgml(libofx_context, 3, filenames);
385 }
386 else if (libofx_context->currentFileType() == OFC)
387 {
388 rv = ofc_proc_sgml(libofx_context, 3, filenames);
389 }
390 else
391 {
392 message_out(ERROR, std::string("ofx_proc_file(): Error unknown file format for the OFX parser"));
393 rv = -1;
394 }
395 if (remove(tmp_filename) != 0)
396 {
397 message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + std::string(tmp_filename));
398 }
399 return rv;
400 }
401 else
402 {
403 message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
404 return -1;
405 }
406 }
407 else
408 {
409 message_out(ERROR, "ofx_proc_file():No input file specified");
410 return -1;
411 }
412 return 0;
413}
414
/* Looks for the next tag (opening or closing) in input_string, scanning
 * from pos_start.
 *
 * On success pos_start is moved to the position of the tag's '<', pos_end to
 * the position just past its '>', and the text between the two brackets is
 * returned (so a closing tag yields a name starting with '/').
 * If the tag is never closed by a '>', pos_end is set to string::npos and the
 * remainder of the string after the '<' is returned.
 * If there is no '<' at all, both positions are set to string::npos and an
 * empty string is returned.
 */
static std::string find_tag_open (std::string& input_string, size_t& pos_start, size_t& pos_end)
{
  const size_t bracket_open = input_string.find ('<', pos_start);
  pos_start = bracket_open;

  if (bracket_open == std::string::npos)
  {
    pos_end = std::string::npos;
    return std::string();
  }

  const size_t name_begin = bracket_open + 1;
  const size_t bracket_close = input_string.find ('>', name_begin);

  if (bracket_close == std::string::npos)
  {
    // Unterminated tag: everything after the '<' counts as the name.
    pos_end = std::string::npos;
    return input_string.substr(name_begin);
  }

  pos_end = bracket_close + 1;
  return input_string.substr(name_begin, bracket_close - name_begin);
}
436
437/* Searches input string for a closing tag matching tag_name starting at pos.
438 * If found pos will be set to the position right after of the closing '>'
439 * If no matching closing tag is found pos will be set to the start of the next
440 * opening or closing tag found.
441 */
442static void find_tag_close (std::string& input_string, std::string& tag_name, size_t& pos)
443{
444 size_t start_idx = input_string.find ("</" + tag_name + ">", pos);
445
446 if (start_idx == std::string::npos)
447 {
448 start_idx = pos;
449 size_t end_idx;
450 std::string new_tag_name = find_tag_open (input_string, start_idx, end_idx);
451 if (!new_tag_name.empty())
452 {
453 message_out(DEBUG, "find_tag_close() fell back to next open tag: " + new_tag_name);
454 // find_tag_open returns the *end* of an opening tag, but in this
455 // case we want its start, so we need to rewind a bit..
456 pos = start_idx;
457 //printf("find_tag_close() returning pos after fallback: %d\n",pos);
458 }
459 else
460 {
461 pos = input_string.length();
462 }
463 }
464 else
465 {
466 pos = start_idx + tag_name.length() + 3;
467 }
468 return;
469}
470
471
483std::string sanitize_proprietary_tags(std::string input_string)
484{
485 size_t last_known_good_pos = 0;
486 size_t open_tag_start_pos = last_known_good_pos;
487 size_t open_tag_end_pos;
488 size_t close_tag_end_pos;
489
490 std::string tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
491 while (!tag_name.empty())
492 {
493 // Determine whether the current tag is proprietary.
494 if ((tag_name.find('.') != std::string::npos) || // tag has a . in the name
495 (tag_name == "CATEGORY")) // Chase bank started setting these in 2017
496 {
497 close_tag_end_pos = open_tag_end_pos;
498 find_tag_close (input_string, tag_name, close_tag_end_pos);
499 size_t tag_size = close_tag_end_pos - open_tag_start_pos;
500 std::string prop_tag = input_string.substr(open_tag_start_pos, tag_size);
501 message_out(INFO, "sanitize_proprietary_tags() removed: " + prop_tag);
502 input_string.erase(open_tag_start_pos, tag_size);
503 last_known_good_pos = open_tag_start_pos;
504 }
505 else
506 {
507 last_known_good_pos = open_tag_end_pos;
508 }
509 tag_name.clear();
510 open_tag_start_pos = last_known_good_pos;
511 if (last_known_good_pos != std::string::npos)
512 tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
513 }
514 return input_string;
515}
516
517
#ifdef _WIN32
/* Derives the DTD directory from the location of the running module.
 *
 * Partial implementation of
 * http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
 *
 * Takes the path returned by GetModuleFileName(), strips the file name,
 * strips one more trailing component if it is "bin" or "lib" (compared
 * case-insensitively) and appends "\share\libofx\dtd".
 * Returns an empty string if GetModuleFileName() fails.
 */
static std::string get_dtd_installation_directory()
{
  char module_path[MAX_PATH];

  if (!GetModuleFileName(NULL, module_path, MAX_PATH))
    return "";

  // Cut off the file name of the module itself.
  char *separator = strrchr(module_path, '\\');
  if (separator != NULL)
    *separator = '\0';

  // Step out of a trailing "bin" or "lib" directory.
  separator = strrchr(module_path, '\\');
  if (separator != NULL)
  {
    const char *last_component = separator + 1;
    if (_stricmp(last_component, "bin") == 0 ||
        _stricmp(last_component, "lib") == 0)
      *separator = '\0';
  }

  return std::string(module_path) + "\\share\\libofx\\dtd";
}
#endif
542
543
557std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
558{
559 std::string dtd_path_filename;
560 char *env_dtd_path;
561
562 dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
563 if (!dtd_path_filename.empty())
564 {
565 dtd_path_filename.append(dtd_filename);
566 std::ifstream dtd_file(dtd_path_filename.c_str());
567 if (dtd_file)
568 {
569 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
570 return dtd_path_filename;
571 }
572 }
573
574#ifdef _WIN32
575 dtd_path_filename = get_dtd_installation_directory();
576 if (!dtd_path_filename.empty())
577 {
578 dtd_path_filename.append(DIRSEP);
579 dtd_path_filename.append(dtd_filename);
580 std::ifstream dtd_file(dtd_path_filename.c_str());
581 if (dtd_file)
582 {
583 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
584 return dtd_path_filename;
585 }
586 }
587#endif
588 /* Search in environment variable OFX_DTD_PATH */
589 env_dtd_path = getenv("OFX_DTD_PATH");
590 if (env_dtd_path)
591 {
592 dtd_path_filename = env_dtd_path;
593 dtd_path_filename.append(DIRSEP);
594 dtd_path_filename.append(dtd_filename);
595 std::ifstream dtd_file(dtd_path_filename.c_str());
596 if (!dtd_file)
597 {
598 message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
599 }
600 else
601 {
602 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
603 return dtd_path_filename;
604 }
605 }
606
607 for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
608 {
609 dtd_path_filename = DTD_SEARCH_PATH[i];
610 dtd_path_filename.append(DIRSEP);
611 dtd_path_filename.append(dtd_filename);
612 std::ifstream dtd_file(dtd_path_filename.c_str());
613 if (!dtd_file)
614 {
615 message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
616 }
617 else
618 {
619 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
620 return dtd_path_filename;
621 }
622 }
623
624 /* Last resort, look in source tree relative path (useful for development) */
625 dtd_path_filename = "";
626 dtd_path_filename.append("..");
627 dtd_path_filename.append(DIRSEP);
628 dtd_path_filename.append("dtd");
629 dtd_path_filename.append(DIRSEP);
630 dtd_path_filename.append(dtd_filename);
631 std::ifstream dtd_file(dtd_path_filename.c_str());
632 if (!dtd_file)
633 {
634 message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
635 }
636 else
637 {
638 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
639 return dtd_path_filename;
640 }
641
642
643 message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
644 return "";
645}
Main header file containing the LibOfx API.
@ OFX
Definition: libofx.h:140
@ OFC
Definition: libofx.h:141
int message_out(OfxMsgType error_type, const std::string message)
Message output function.
Definition: messages.cpp:67
Message IO functionality.
@ DEBUG
Definition: messages.hh:25
@ ERROR
Definition: messages.hh:34
@ INFO
Definition: messages.hh:32
@ WARNING
Definition: messages.hh:33
@ STATUS
Definition: messages.hh:31
int ofc_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofc_sgml.cpp:346
OFX/SGML parsing functionality.
std::string sanitize_proprietary_tags(std::string input_string)
Removes proprietary tags and comments.
const char * DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM]
The list of paths to search for the DTDs.
Definition: ofx_preproc.cpp:66
const int DTD_SEARCH_PATH_NUM
The number of different paths to search for DTDs.
Definition: ofx_preproc.cpp:60
std::string find_dtd(LibofxContextPtr ctx, const std::string &dtd_filename)
Find the appropriate DTD for the file version.
int ofx_proc_file(LibofxContextPtr ctx, const char *p_filename)
File pre-processing of OFX and OFC files.
Definition: ofx_preproc.cpp:80
Preprocessing of the OFX files before parsing.
int ofx_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofx_sgml.cpp:444
OFX/SGML parsing functionality.
Various simple functions for type conversion & al.
void STRNCPY(T &dest, const std::string &src)