GnuCash  5.6-150-g038405b370+
gnc-tokenizer-csv.cpp
1 /********************************************************************\
2  * gnc-tokenizer-csv.cpp - takes a csv file and converts it into a *
3  * two-dimensional vector of strings (table)*
4  * *
5  * Copyright (C) 2015 Geert Janssens <geert@kobaltwit.be> *
6  * *
7  * This program is free software; you can redistribute it and/or *
8  * modify it under the terms of the GNU General Public License as *
9  * published by the Free Software Foundation; either version 2 of *
10  * the License, or (at your option) any later version. *
11  * *
12  * This program is distributed in the hope that it will be useful, *
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15  * GNU General Public License for more details. *
16  * *
17  * You should have received a copy of the GNU General Public License*
18  * along with this program; if not, contact: *
19  * *
20  * Free Software Foundation Voice: +1-617-542-5942 *
21  * 51 Franklin Street, Fifth Floor Fax: +1-617-542-2652 *
22  * Boston, MA 02110-1301, USA gnu@gnu.org *
23 \********************************************************************/
24 
25 #include "gnc-tokenizer-csv.hpp"
26 
27 #include <iostream>
28 #include <fstream> // fstream
29 #include <vector>
30 #include <string>
31 #include <algorithm> // copy
32 #include <iterator> // ostream_operator
33 
34 #include <boost/tokenizer.hpp>
35 #include <boost/locale.hpp>
36 #include <boost/algorithm/string.hpp>
37 
38 #include <glib/gi18n.h>
39 
40 void
41 GncCsvTokenizer::set_separators(const std::string& separators)
42 {
43  m_sep_str = separators;
44 }
45 
46 
47 int GncCsvTokenizer::tokenize()
48 {
49  using Tokenizer = boost::tokenizer< boost::escaped_list_separator<char>>;
50 
51  boost::escaped_list_separator<char> sep("\\", m_sep_str, "\"");
52 
53  StrVec vec;
54  std::string line;
55  std::string buffer;
56 
57  bool inside_quotes(false);
58  size_t last_quote(0);
59 
60  m_tokenized_contents.clear();
61  std::istringstream in_stream(m_utf8_contents);
62 
63  try
64  {
65  while (std::getline (in_stream, buffer))
66  {
67  // --- deal with line breaks in quoted strings
68  buffer = boost::trim_copy (buffer); // Removes trailing newline and spaces
69  last_quote = buffer.find_first_of('"');
70  while (last_quote != std::string::npos)
71  {
72  if (last_quote == 0) // Test separately because last_quote - 1 would be out of range
73  inside_quotes = !inside_quotes;
74  else if (buffer[ last_quote - 1 ] != '\\')
75  inside_quotes = !inside_quotes;
76 
77  last_quote = buffer.find_first_of('"',last_quote+1);
78  }
79 
80  line.append(buffer);
81  if (inside_quotes)
82  {
83  line.append(" ");
84  continue;
85  }
86  // ---
87 
88  // Deal with backslashes that are not meant to be escapes
89  // The boost::tokenizer with escaped_list_separator as we use
90  // it would choke on this.
91  auto bs_pos = line.find ('\\');
92  while (bs_pos != std::string::npos)
93  {
94  if ((bs_pos == line.size()) || // got trailing single backslash
95  (line.find_first_of ("\"\\n", bs_pos + 1) != bs_pos + 1)) // backslash is not part of known escapes \\, \" or \n
96  line = line.substr(0, bs_pos) + "\\\\" + line.substr(bs_pos + 1);
97  bs_pos += 2;
98  bs_pos = line.find ('\\', bs_pos);
99  }
100 
101  // Deal with repeated " ("") in strings.
102  // This is commonly used as escape mechanism for double quotes in csv files.
103  // However boost just eats them.
104  bs_pos = line.find ("\"\"");
105  while (bs_pos != std::string::npos)
106  {
107  // Only make changes in case the double quotes are part of a larger field
108  // In other words a field which only contains two double quotes represent an
109  // empty field. We don't need to touch those.
110  // The way to determine whether the double quotes represent an empty string
111  // is by checking whether the character in front or after are either
112  // a field separator or the beginning or end of of the string.
113  if (!(((bs_pos == 0) || // quotes are at start of line
114  (m_sep_str.find (line[bs_pos-1]) != std::string::npos)) // quotes preceded by field separator
115  &&
116  ((bs_pos + 2 >= line.length()) || // quotes are at end of line
117  (m_sep_str.find (line[bs_pos+2]) != std::string::npos)))) // quotes followed by field separator
118  // Only make changes in case the double quotes are not an empty field
119  line.replace (bs_pos, 2, "\\\"");
120  bs_pos = line.find ("\"\"", bs_pos + 2);
121  }
122 
123  Tokenizer tok(line, sep);
124  vec.assign(tok.begin(),tok.end());
125  m_tokenized_contents.push_back(vec);
126  line.clear();
127  }
128  }
129  catch (boost::escaped_list_error &e)
130  {
131  throw (std::range_error N_("There was an error parsing the file."));
132  }
133 
134  return 0;
135 }
Class to convert a csv file into a vector of string vectors.