LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  ofx_preproc.cpp
3  -------------------
4  copyright : (C) 2002 by Benoit Grégoire
5  email : benoitg@coeus.ca
6 ***************************************************************************/
12 /***************************************************************************
13  * *
14  * This program is free software; you can redistribute it and/or modify *
15  * it under the terms of the GNU General Public License as published by *
16  * the Free Software Foundation; either version 2 of the License, or *
17  * (at your option) any later version. *
18  * *
19  ***************************************************************************/
#include "../config.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
#include <sstream>
#include <string>
#ifdef __WIN32__
# include <io.h>      // for _close()
#else
# include <unistd.h>  // for close()
#endif
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
37 
38 #ifdef __WIN32__
39 # define DIRSEP "\\"
40 #else
41 # define DIRSEP "/"
42 #endif
43 
44 #ifdef __WIN32__
45 # include "win32.hh"
46 # include <windows.h> // for GetModuleFileName()
47 # undef ERROR
48 # undef DELETE
49 #endif
50 
51 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
52 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
53 
54 using namespace std;
/* Number of directories listed in the DTD search path table: one more when
 * the build system supplies MAKEFILE_DTD_PATH. */
const int DTD_SEARCH_PATH_NUM =
#if defined(MAKEFILE_DTD_PATH)
  4;
#else
  3;
#endif
63 
68 {
69 #ifdef MAKEFILE_DTD_PATH
70  MAKEFILE_DTD_PATH,
71 #endif
72  "/usr/local/share/libofx/dtd",
73  "/usr/share/libofx/dtd",
74  "~"
75 };
76 
81 int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
82 {
83  LibofxContext *libofx_context;
84  bool ofx_start = false;
85  bool ofx_end = false;
86  bool file_is_xml = false;
87  bool used_iconv = false;
88  ifstream input_file;
89  ofstream tmp_file;
90  char *filenames[3];
91  char tmp_filename[256];
92  int tmp_file_fd;
93 #ifdef HAVE_ICONV
94  iconv_t conversion_descriptor;
95 #endif
96  libofx_context = (LibofxContext*)ctx;
97 
98  if (p_filename != NULL && strcmp(p_filename, "") != 0)
99  {
100  message_out(DEBUG, string("ofx_proc_file():Opening file: ") + p_filename);
101 
102  input_file.open(p_filename);
103  if (!input_file)
104  {
105  message_out(ERROR, "ofx_proc_file():Unable to open the input file " + string(p_filename));
106  }
107 
108  mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
109 
110  message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + string(tmp_filename));
111 #ifdef __WIN32__
112  tmp_file_fd = mkstemp_win32(tmp_filename);
113 #else
114  tmp_file_fd = mkstemp(tmp_filename);
115 #endif
116  if (tmp_file_fd)
117  {
118  tmp_file.open(tmp_filename);
119  if (!tmp_file)
120  {
121  message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + string(tmp_filename));
122  return -1;
123  }
124  }
125  else
126  {
127  message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + string(tmp_filename));
128  return -1;
129  }
130 
131  if (input_file && tmp_file)
132  {
133  std::size_t header_separator_idx;
134  string header_name;
135  string header_value;
136  string ofx_encoding;
137  string ofx_charset;
138  do
139  {
140  stringbuf buffer;
141  string s_buffer;
142  input_file.get(buffer, '\n');
143  //cout<< "got: \"" << buffer<<"\"\n";
144  s_buffer = buffer.str();
145 
146  // Watch out: If input_file is in eof(), any subsequent read or
147  // peek() will fail and we must exit this loop.
148  if (!input_file.eof())
149  {
150  //cout<<"input_file.gcount(): "<<input_file.gcount()<< " s_buffer.size=" << s_buffer.size()<<" sizeof(buffer): "<<sizeof(buffer) << " peek=\"" << int(input_file.peek()) << "\"" <<endl;
151  if (input_file.fail()) // If no characters were extracted above, the failbit is set.
152  {
153  // No characters extracted means that we've reached the newline
154  // delimiter (because we already checked for EOF). We will check
155  // for and remove that newline in the next if-clause, but must
156  // remove the failbit so that peek() will work again.
157  input_file.clear();
158  }
159 
160  // Is the next character really the newline?
161  if (input_file.peek() == '\n')
162  {
163  // Yes. Then discard that newline character from the stream
164  input_file.get();
165  }
166  }
167 
168  if (ofx_start == false && (s_buffer.find("<?xml") != string::npos))
169  {
170  message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
171  file_is_xml = true;
172  }
173 
174  std::size_t ofx_start_idx;
175  if (ofx_start == false)
176  {
177  if (
178  (libofx_context->currentFileType() == OFX &&
179  ((ofx_start_idx = s_buffer.find("<OFX>")) != string::npos ||
180  (ofx_start_idx = s_buffer.find("<ofx>")) != string::npos))
181  ||
182  (libofx_context->currentFileType() == OFC &&
183  ((ofx_start_idx = s_buffer.find("<OFC>")) != string::npos ||
184  (ofx_start_idx = s_buffer.find("<ofc>")) != string::npos))
185  )
186  {
187  ofx_start = true;
188  if (file_is_xml == false)
189  {
190  s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
191  }
192  message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
193 
194  if (file_is_xml == true)
195  {
196  static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
197  if (putenv(sp_charset_fixed) != 0)
198  {
199  message_out(ERROR, "ofx_proc_file(): putenv failed");
200  }
201  /* Normally the following would be "xml".
202  * Unfortunately, opensp's generic api will garble UTF-8 if this is
203  * set to xml. So we set any single byte encoding to avoid messing
204  * up UTF-8. Unfortunately this means that non-UTF-8 files will not
205  * get properly translated. We'd need to manually detect the
206  * encoding in the XML header and convert the xml with iconv like we
207  * do for SGML to work around the problem. Most unfortunate. */
208  static char sp_encoding[] = "SP_ENCODING=ms-dos";
209  if (putenv(sp_encoding) != 0)
210  {
211  message_out(ERROR, "ofx_proc_file(): putenv failed");
212  }
213  }
214  else
215  {
216  static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
217  if (putenv(sp_charset_fixed) != 0)
218  {
219  message_out(ERROR, "ofx_proc_file(): putenv failed");
220  }
221  static char sp_encoding[] = "SP_ENCODING=ms-dos"; //Any single byte encoding will do, we don't want opensp messing up UTF-8;
222  if (putenv(sp_encoding) != 0)
223  {
224  message_out(ERROR, "ofx_proc_file(): putenv failed");
225  }
226 #ifdef HAVE_ICONV
227  string fromcode;
228  string tocode;
229  if (ofx_encoding.compare("USASCII") == 0)
230  {
231  if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
232  {
233  //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
234  fromcode = "ISO-8859-1";
235  }
236  else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
237  {
238  //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
239  fromcode = "CP1252";
240  }
241  else if (ofx_charset.compare("NONE") == 0)
242  {
243  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
244  }
245  else
246  {
247  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
248  }
249  }
250  else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
251  {
252  //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
253  fromcode = "UTF-8";
254  }
255  else
256  {
257  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
258  }
259  tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
260  message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
261  conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
262  used_iconv = true;
263 #endif
264  }
265  }
266  else
267  {
268  //We are still in the headers
269  if ((header_separator_idx = s_buffer.find(':')) != string::npos)
270  {
271  //Header processing
272  header_name.assign(s_buffer.substr(0, header_separator_idx));
273  header_value.assign(s_buffer.substr(header_separator_idx + 1));
274  while ( header_value[header_value.length() - 1 ] == '\n' ||
275  header_value[header_value.length() - 1 ] == '\r' )
276  header_value.erase(header_value.length() - 1);
277  message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
278  if (header_name.compare("ENCODING") == 0)
279  {
280  ofx_encoding.assign(header_value);
281  }
282  if (header_name.compare("CHARSET") == 0)
283  {
284  ofx_charset.assign(header_value);
285  }
286  }
287  }
288  }
289 
290  if (file_is_xml == true || (ofx_start == true && ofx_end == false))
291  {
292  if (ofx_start == true)
293  {
294  /* The above test won't help us if the <OFX> tag is on the same line
295  * as the xml header, but as opensp can't be used to parse it anyway
296  * this isn't a great loss for now.
297  */
298  s_buffer = sanitize_proprietary_tags(s_buffer);
299  if (s_buffer.empty())
300  continue;
301  }
302  //cout<< s_buffer<<"\n";
303  if (file_is_xml == false)
304  {
305 #ifdef HAVE_ICONV
306  size_t inbytesleft = s_buffer.size();
307  size_t outbytesleft = inbytesleft * 2 - 1;
308  char * iconv_buffer = (char*) malloc (inbytesleft * 2);
309  memset(iconv_buffer, 0, inbytesleft * 2);
310  const char* inchar = s_buffer.c_str();
311  char * outchar = iconv_buffer;
312  int iconv_retval = iconv (conversion_descriptor,
313  const_cast<char**>(&inchar), &inbytesleft,
314  &outchar, &outbytesleft);
315  if (iconv_retval == -1)
316  {
317  message_out(ERROR, "ofx_proc_file(): Iconv conversion error");
318  }
319  // All validly converted bytes will be copied to the
320  // original buffer
321  s_buffer = std::string(iconv_buffer, outchar - iconv_buffer);
322  free (iconv_buffer);
323 #endif
324  }
325  //cout << s_buffer << "\n";
326  tmp_file << s_buffer << endl;
327  }
328 
329  if (ofx_start == true &&
330  (
331  (libofx_context->currentFileType() == OFX &&
332  ((ofx_start_idx = s_buffer.find("</OFX>")) != string::npos ||
333  (ofx_start_idx = s_buffer.find("</ofx>")) != string::npos))
334  || (libofx_context->currentFileType() == OFC &&
335  ((ofx_start_idx = s_buffer.find("</OFC>")) != string::npos ||
336  (ofx_start_idx = s_buffer.find("</ofc>")) != string::npos))
337  )
338  )
339  {
340  ofx_end = true;
341  message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC> has been found");
342  }
343 
344  }
345  while (!input_file.eof() && !input_file.bad());
346  }
347  input_file.close();
348  tmp_file.close();
349 #ifdef HAVE_ICONV
350  if (used_iconv == true)
351  {
352  iconv_close(conversion_descriptor);
353  }
354 #endif
355  char filename_openspdtd[255];
356  char filename_dtd[255];
357  char filename_ofx[255];
358  STRNCPY(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME)); //The opensp sgml dtd file
359  if (libofx_context->currentFileType() == OFX)
360  {
361  STRNCPY(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME)); //The ofx dtd file
362  }
363  else if (libofx_context->currentFileType() == OFC)
364  {
365  STRNCPY(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME)); //The ofc dtd file
366  }
367  else
368  {
369  message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
370  }
371 
372  if ((string)filename_dtd != "" && (string)filename_openspdtd != "")
373  {
374  strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
375  filenames[0] = filename_openspdtd;
376  filenames[1] = filename_dtd;
377  filenames[2] = filename_ofx;
378  int rv;
379  if (libofx_context->currentFileType() == OFX)
380  {
381  rv = ofx_proc_sgml(libofx_context, 3, filenames);
382  }
383  else if (libofx_context->currentFileType() == OFC)
384  {
385  rv = ofc_proc_sgml(libofx_context, 3, filenames);
386  }
387  else
388  {
389  message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
390  rv = -1;
391  }
392  if (remove(tmp_filename) != 0)
393  {
394  message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + string(tmp_filename));
395  }
396  return rv;
397  }
398  else
399  {
400  message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
401  return -1;
402  }
403  }
404  else
405  {
406  message_out(ERROR, "ofx_proc_file():No input file specified");
407  return -1;
408  }
409  return 0;
410 }
411 
/* Looks for the next tag (opening or closing) in input_string, scanning from
 * pos_start.
 *
 * On success the text between '<' and '>' is returned, pos_start is set to
 * the position of the '<' and pos_end to the position just past the '>'.
 * If the tag has no closing '>', pos_end is set to string::npos and
 * everything after the '<' is returned.
 * If no '<' is found, both positions are set to string::npos and an empty
 * string is returned.
 */
static std::string find_tag_open (std::string& input_string, size_t& pos_start, size_t& pos_end)
{
  const size_t lt_idx = input_string.find ('<', pos_start);
  pos_start = lt_idx;
  if (lt_idx == std::string::npos)
  {
    pos_end = std::string::npos;
    return std::string();
  }

  const size_t name_idx = lt_idx + 1;
  const size_t gt_idx = input_string.find ('>', name_idx);
  if (gt_idx == std::string::npos)
  {
    // Unterminated tag: hand back the remainder of the string.
    pos_end = std::string::npos;
    return input_string.substr(name_idx);
  }

  pos_end = gt_idx + 1;
  return input_string.substr(name_idx, gt_idx - name_idx);
}
433 
434 /* Searches input string for a closing tag matching tag_name starting at pos.
435  * If found pos will be set to the position right after of the closing '>'
436  * If no matching closing tag is found pos will be set to the start of the next
437  * opening or closing tag found.
438  */
439 static void find_tag_close (string& input_string, string& tag_name, size_t& pos)
440 {
441  size_t start_idx = input_string.find ("</" + tag_name + ">", pos);
442 
443  if (start_idx == string::npos)
444  {
445  start_idx = pos;
446  size_t end_idx;
447  string new_tag_name = find_tag_open (input_string, start_idx, end_idx);
448  if (!new_tag_name.empty())
449  {
450  message_out(DEBUG, "find_tag_close() fell back to next open tag: " + new_tag_name);
451  // find_tag_open returns the *end* of an opening tag, but in this
452  // case we want its start, so we need to rewind a bit..
453  pos = start_idx;
454  //printf("find_tag_close() returning pos after fallback: %d\n",pos);
455  }
456  else
457  {
458  pos = input_string.length();
459  }
460  }
461  else
462  {
463  pos = start_idx + tag_name.length() + 3;
464  }
465  return;
466 }
467 
468 
480 string sanitize_proprietary_tags(string input_string)
481 {
482  size_t last_known_good_pos = 0;
483  size_t open_tag_start_pos = last_known_good_pos;
484  size_t open_tag_end_pos;
485  size_t close_tag_end_pos;
486 
487  string tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
488  while (!tag_name.empty())
489  {
490  // Determine whether the current tag is proprietary.
491  if ((tag_name.find('.') != string::npos) || // tag has a . in the name
492  (tag_name == "CATEGORY")) // Chase bank started setting these in 2017
493  {
494  close_tag_end_pos = open_tag_end_pos;
495  find_tag_close (input_string, tag_name, close_tag_end_pos);
496  size_t tag_size = close_tag_end_pos - open_tag_start_pos;
497  string prop_tag = input_string.substr(open_tag_start_pos, tag_size);
498  message_out(INFO, "sanitize_proprietary_tags() removed: " + prop_tag);
499  input_string.erase(open_tag_start_pos, tag_size);
500  last_known_good_pos = open_tag_start_pos;
501  }
502  else
503  {
504  last_known_good_pos = open_tag_end_pos;
505  }
506  tag_name.clear();
507  open_tag_start_pos = last_known_good_pos;
508  if (last_known_good_pos != string::npos)
509  tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
510  }
511  return input_string;
512 }
513 
514 
515 #ifdef __WIN32__
516 static std::string get_dtd_installation_directory()
517 {
518  // Partial implementation of
519  // http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
520  char ch_fn[MAX_PATH], *p;
521  std::string str_fn;
522 
523  if (!GetModuleFileName(NULL, ch_fn, MAX_PATH)) return "";
524 
525  if ((p = strrchr(ch_fn, '\\')) != NULL)
526  * p = '\0';
527 
528  p = strrchr(ch_fn, '\\');
529  if (p && (_stricmp(p + 1, "bin") == 0 ||
530  _stricmp(p + 1, "lib") == 0))
531  *p = '\0';
532 
533  str_fn = ch_fn;
534  str_fn += "\\share\\libofx\\dtd";
535 
536  return str_fn;
537 }
538 #endif
539 
540 
553 std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
554 {
555  string dtd_path_filename;
556  char *env_dtd_path;
557 
558  dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
559  if (!dtd_path_filename.empty())
560  {
561  dtd_path_filename.append(dtd_filename);
562  ifstream dtd_file(dtd_path_filename.c_str());
563  if (dtd_file)
564  {
565  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
566  return dtd_path_filename;
567  }
568  }
569 
570 #ifdef __WIN32__
571  dtd_path_filename = get_dtd_installation_directory();
572  if (!dtd_path_filename.empty())
573  {
574  dtd_path_filename.append(DIRSEP);
575  dtd_path_filename.append(dtd_filename);
576  ifstream dtd_file(dtd_path_filename.c_str());
577  if (dtd_file)
578  {
579  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
580  return dtd_path_filename;
581  }
582  }
583 #endif
584  /* Search in environement variable OFX_DTD_PATH */
585  env_dtd_path = getenv("OFX_DTD_PATH");
586  if (env_dtd_path)
587  {
588  dtd_path_filename.append(env_dtd_path);
589  dtd_path_filename.append(DIRSEP);
590  dtd_path_filename.append(dtd_filename);
591  ifstream dtd_file(dtd_path_filename.c_str());
592  if (!dtd_file)
593  {
594  message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
595  }
596  else
597  {
598  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
599  return dtd_path_filename;
600  }
601  }
602 
603  for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
604  {
605  dtd_path_filename = DTD_SEARCH_PATH[i];
606  dtd_path_filename.append(DIRSEP);
607  dtd_path_filename.append(dtd_filename);
608  ifstream dtd_file(dtd_path_filename.c_str());
609  if (!dtd_file)
610  {
611  message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
612  }
613  else
614  {
615  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
616  return dtd_path_filename;
617  }
618  }
619 
620  /* Last resort, look in source tree relative path (useful for development) */
621  dtd_path_filename = "";
622  dtd_path_filename.append("..");
623  dtd_path_filename.append(DIRSEP);
624  dtd_path_filename.append("dtd");
625  dtd_path_filename.append(DIRSEP);
626  dtd_path_filename.append(dtd_filename);
627  ifstream dtd_file(dtd_path_filename.c_str());
628  if (!dtd_file)
629  {
630  message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
631  }
632  else
633  {
634  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
635  return dtd_path_filename;
636  }
637 
638 
639  message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
640  return "";
641 }
Main header file containing the LibOfx API.
@ OFX
Definition: libofx.h:129
@ OFC
Definition: libofx.h:130
int message_out(OfxMsgType error_type, const string message)
Message output function.
Definition: messages.cpp:61
Message IO functionality.
@ DEBUG
Definition: messages.hh:25
@ ERROR
Definition: messages.hh:34
@ INFO
Definition: messages.hh:32
@ STATUS
Definition: messages.hh:31
int ofc_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofc_sgml.cpp:353
OFX/SGML parsing functionality.
const char * DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM]
The list of paths to search for the DTDs.
Definition: ofx_preproc.cpp:67
const int DTD_SEARCH_PATH_NUM
The number of different paths to search for DTDs.
Definition: ofx_preproc.cpp:61
string sanitize_proprietary_tags(string input_string)
Removes proprietary tags and comments.
std::string find_dtd(LibofxContextPtr ctx, const std::string &dtd_filename)
Find the appropriate DTD for the file version.
int ofx_proc_file(LibofxContextPtr ctx, const char *p_filename)
File pre-processing of OFX AND for OFC files.
Definition: ofx_preproc.cpp:81
Preprocessing of the OFX files before parsing.
int ofx_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofx_sgml.cpp:434
OFX/SGML parsing functionality.
Various simple functions for type conversion & al.
void STRNCPY(T &dest, const std::string &src)