From f76d1cde0a33131bf187d00385b8cfde7f45bbc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Fri, 8 Aug 2014 14:02:36 +0200 Subject: Initial import of fast-cpp-csv-parser version 0.0+git20140808~0a259084edad-1 --- csv.h | 1069 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1069 insertions(+) create mode 100644 csv.h (limited to 'csv.h') diff --git a/csv.h b/csv.h new file mode 100644 index 0000000..4ff5a53 --- /dev/null +++ b/csv.h @@ -0,0 +1,1069 @@ +// Copyright: (2012-2014) Ben Strasser +// License: BSD-3 +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +//2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +//3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#ifndef CSV_H +#define CSV_H + +#include +#include +#include +#include +#include +#include +#include +#ifndef CSV_IO_NO_THREAD +#include +#endif +#include +#include + +namespace io{ + //////////////////////////////////////////////////////////////////////////// + // LineReader // + //////////////////////////////////////////////////////////////////////////// + + namespace error{ + struct base : std::exception{ + virtual void format_error_message()const = 0; + + const char*what()const throw(){ + format_error_message(); + return error_message_buffer; + } + + mutable char error_message_buffer[256]; + }; + + const int max_file_name_length = 255; + + struct with_file_name{ + with_file_name(){ + std::memset(file_name, 0, max_file_name_length+1); + } + + void set_file_name(const char*file_name){ + std::strncpy(this->file_name, file_name, max_file_name_length); + this->file_name[max_file_name_length] = '\0'; + } + + char file_name[max_file_name_length+1]; + }; + + struct with_file_line{ + with_file_line(){ + file_line = -1; + } + + void set_file_line(int file_line){ + this->file_line = file_line; + } + + int file_line; + }; + + struct with_errno{ + with_errno(){ + errno = 0; + } + + void set_errno(int errno_value){ + this->errno_value = errno_value; + } + + int errno_value; + }; + + struct can_not_open_file : + base, + with_file_name, + with_errno{ + void format_error_message()const{ + if(errno_value != 0) + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Can not open file \"%s\" because \"%s\"." + , file_name, std::strerror(errno_value)); + else + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Can not open file \"%s\"." + , file_name); + } + }; + + struct line_length_limit_exceeded : + base, + with_file_name, + with_file_line{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Line number %d in file \"%s\" exceeds the maximum length of 2^24-1." + , file_line, file_name); + } + }; + } + + class LineReader{ + private: + static const int block_len = 1<<24; + #ifndef CSV_IO_NO_THREAD + std::futurebytes_read; + #endif + FILE*file; + char*buffer; + int data_begin; + int data_end; + + char file_name[error::max_file_name_length+1]; + unsigned file_line; + + void open_file(const char*file_name){ + // We open the file in binary mode as it makes no difference under *nix + // and under Windows we handle \r\n newlines ourself. + file = std::fopen(file_name, "rb"); + if(file == 0){ + int x = errno; // store errno as soon as possible, doing it after constructor call can fail. + error::can_not_open_file err; + err.set_errno(x); + err.set_file_name(file_name); + throw err; + } + } + + void init(){ + file_line = 0; + + // Tell the std library that we want to do the buffering ourself. + std::setvbuf(file, 0, _IONBF, 0); + + try{ + buffer = new char[3*block_len]; + }catch(...){ + std::fclose(file); + throw; + } + + data_begin = 0; + data_end = std::fread(buffer, 1, 2*block_len, file); + + // Ignore UTF-8 BOM + if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') + data_begin = 3; + + #ifndef CSV_IO_NO_THREAD + if(data_end == 2*block_len){ + bytes_read = std::async(std::launch::async, [=]()->int{ + return std::fread(buffer + 2*block_len, 1, block_len, file); + }); + } + #endif + } + + public: + LineReader() = delete; + LineReader(const LineReader&) = delete; + LineReader&operator=(const LineReader&) = delete; + + LineReader(const char*file_name, FILE*file): + file(file){ + set_file_name(file_name); + init(); + } + + LineReader(const std::string&file_name, FILE*file): + file(file){ + set_file_name(file_name.c_str()); + init(); + } + + explicit LineReader(const char*file_name){ + set_file_name(file_name); + open_file(file_name); + init(); + } + + explicit LineReader(const std::string&file_name){ + set_file_name(file_name.c_str()); + open_file(file_name.c_str()); + init(); + } + + void set_file_name(const std::string&file_name){ + set_file_name(file_name.c_str()); + } + + void set_file_name(const char*file_name){ + strncpy(this->file_name, file_name, error::max_file_name_length); + this->file_name[error::max_file_name_length] = '\0'; + } + + const char*get_truncated_file_name()const{ + return file_name; + } + + void set_file_line(unsigned file_line){ + this->file_line = file_line; + } + + unsigned get_file_line()const{ + return file_line; + } + + char*next_line(){ + if(data_begin == data_end) + return 0; + + ++file_line; + + assert(data_begin < data_end); + assert(data_end <= block_len*2); + + if(data_begin >= block_len){ + std::memcpy(buffer, buffer+block_len, block_len); + data_begin -= block_len; + data_end -= block_len; + #ifndef CSV_IO_NO_THREAD + if(bytes_read.valid()) + #endif + { + #ifndef CSV_IO_NO_THREAD + data_end += bytes_read.get(); + #else + data_end += std::fread(buffer + 2*block_len, 1, block_len, file); + #endif + std::memcpy(buffer+block_len, buffer+2*block_len, block_len); + + #ifndef CSV_IO_NO_THREAD + bytes_read = std::async(std::launch::async, [=]()->int{ + return std::fread(buffer + 2*block_len, 1, block_len, file); + }); + #endif + } + } + + int line_end = data_begin; + while(buffer[line_end] != '\n' && line_end != data_end){ + ++line_end; + } + + if(line_end - data_begin + 1 > block_len){ + error::line_length_limit_exceeded err; + err.set_file_name(file_name); + err.set_file_line(file_line); + throw err; + } + + if(buffer[line_end] == '\n'){ + buffer[line_end] = '\0'; + }else{ + // some files are missing the newline at the end of the + // last line + ++data_end; + buffer[line_end] = '\0'; + } + + // handle windows \r\n-line breaks + if(line_end != data_begin && buffer[line_end-1] == '\r') + buffer[line_end-1] = '\0'; + + char*ret = buffer + data_begin; + data_begin = line_end+1; + return ret; + } + + ~LineReader(){ + #ifndef CSV_IO_NO_THREAD + // GCC needs this or it will crash. + if(bytes_read.valid()) + bytes_read.get(); + #endif + + delete[] buffer; + std::fclose(file); + } + }; + + //////////////////////////////////////////////////////////////////////////// + // CSV // + //////////////////////////////////////////////////////////////////////////// + + namespace error{ + const int max_column_name_length = 63; + struct with_column_name{ + with_column_name(){ + std::memset(column_name, 0, max_column_name_length+1); + } + + void set_column_name(const char*column_name){ + std::strncpy(this->column_name, column_name, max_column_name_length); + this->column_name[max_column_name_length] = '\0'; + } + + char column_name[max_column_name_length+1]; + }; + + + const int max_column_content_length = 63; + + struct with_column_content{ + with_column_content(){ + std::memset(column_content, 0, max_column_content_length+1); + } + + void set_column_content(const char*column_content){ + std::strncpy(this->column_content, column_content, max_column_content_length); + this->column_content[max_column_content_length] = '\0'; + } + + char column_content[max_column_content_length+1]; + }; + + + struct extra_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Extra column \"%s\" in header of file \"%s\"." + , column_name, file_name); + } + }; + + struct missing_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Missing column \"%s\" in header of file \"%s\"." + , column_name, file_name); + } + }; + + struct duplicated_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Duplicated column \"%s\" in header of file \"%s\"." + , column_name, file_name); + } + }; + + struct header_missing : + base, + with_file_name{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Header missing in file \"%s\"." + , file_name); + } + }; + + struct too_few_columns : + base, + with_file_name, + with_file_line{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Too few columns in line %d in file \"%s\"." + , file_line, file_name); + } + }; + + struct too_many_columns : + base, + with_file_name, + with_file_line{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Too many columns in line %d in file \"%s\"." + , file_line, file_name); + } + }; + + struct escaped_string_not_closed : + base, + with_file_name, + with_file_line{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Escaped string was not closed in line %d in file \"%s\"." + , file_line, file_name); + } + }; + + struct integer_must_be_positive : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The integer \"%s\" must be positive or 0 in column \"%s\" in file \"%s\" in line \"%d\"." + , column_content, column_name, file_name, file_line); + } + }; + + struct no_digit : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The integer \"%s\" contains an invalid digit in column \"%s\" in file \"%s\" in line \"%d\"." + , column_content, column_name, file_name, file_line); + } + }; + + struct integer_overflow : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The integer \"%s\" overflows in column \"%s\" in file \"%s\" in line \"%d\"." + , column_content, column_name, file_name, file_line); + } + }; + + struct integer_underflow : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The integer \"%s\" underflows in column \"%s\" in file \"%s\" in line \"%d\"." + , column_content, column_name, file_name, file_line); + } + }; + + struct invalid_single_character : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The content \"%s\" of column \"%s\" in file \"%s\" in line \"%d\" is not a single character." + , column_content, column_name, file_name, file_line); + } + }; + } + + typedef unsigned ignore_column; + static const ignore_column ignore_no_column = 0; + static const ignore_column ignore_extra_column = 1; + static const ignore_column ignore_missing_column = 2; + + template + struct trim_chars{ + private: + constexpr static bool is_trim_char(char c){ + return false; + } + + template + constexpr static bool is_trim_char(char c, char trim_char, OtherTrimChars...other_trim_chars){ + return c == trim_char || is_trim_char(c, other_trim_chars...); + } + + public: + static void trim(char*&str_begin, char*&str_end){ + while(is_trim_char(*str_begin, trim_char_list...) && str_begin != str_end) + ++str_begin; + while(is_trim_char(*(str_end-1), trim_char_list...) && str_begin != str_end) + --str_end; + *str_end = '\0'; + } + }; + + + struct no_comment{ + static bool is_comment(const char*line){ + return false; + } + }; + + template + struct single_line_comment{ + private: + constexpr static bool is_comment_start_char(char c){ + return false; + } + + template + constexpr static bool is_comment_start_char(char c, char comment_start_char, OtherCommentStartChars...other_comment_start_chars){ + return c == comment_start_char || is_comment_start_char(c, other_comment_start_chars...); + } + + public: + + static bool is_comment(const char*line){ + return is_comment_start_char(*line, comment_start_char_list...); + } + }; + + struct empty_line_comment{ + static bool is_comment(const char*line){ + if(*line == '\0') + return true; + while(*line == ' ' || *line == '\t'){ + ++line; + if(*line == 0) + return true; + } + return false; + } + }; + + template + struct single_and_empty_line_comment{ + static bool is_comment(const char*line){ + return single_line_comment::is_comment(line) || empty_line_comment::is_comment(line); + } + }; + + template + struct no_quote_escape{ + static const char*find_next_column_end(const char*col_begin){ + while(*col_begin != sep && *col_begin != '\0') + ++col_begin; + return col_begin; + } + + static void unescape(char*&col_begin, char*&col_end){ + + } + }; + + template + struct double_quote_escape{ + static const char*find_next_column_end(const char*col_begin){ + while(*col_begin != sep && *col_begin != '\0') + if(*col_begin != quote) + ++col_begin; + else{ + do{ + ++col_begin; + while(*col_begin != quote){ + if(*col_begin == '\0') + throw error::escaped_string_not_closed(); + ++col_begin; + } + ++col_begin; + }while(*col_begin == quote); + } + return col_begin; + } + + static void unescape(char*&col_begin, char*&col_end){ + if(col_end - col_begin >= 2){ + if(*col_begin == quote && *(col_end-1) == quote){ + ++col_begin; + --col_end; + char*out = col_begin; + for(char*in = col_begin; in!=col_end; ++in){ + if(*in == quote && *(in+1) == quote){ + continue; + } + *out = *in; + ++out; + } + col_end = out; + *col_end = '\0'; + } + } + + } + }; + + struct throw_on_overflow{ + template + static void on_overflow(T&){ + throw error::integer_overflow(); + } + + template + static void on_underflow(T&){ + throw error::integer_underflow(); + } + }; + + struct ignore_overflow{ + template + static void on_overflow(T&){} + + template + static void on_underflow(T&){} + }; + + struct set_to_max_on_overflow{ + template + static void on_overflow(T&x){ + x = std::numeric_limits::max(); + } + + template + static void on_underflow(T&x){ + x = std::numeric_limits::min(); + } + }; + + + namespace detail{ + template + void chop_next_column( + char*&line, char*&col_begin, char*&col_end + ){ + assert(line != nullptr); + + col_begin = line; + // the col_begin + (... - col_begin) removes the constness + col_end = col_begin + (quote_policy::find_next_column_end(col_begin) - col_begin); + + if(*col_end == '\0'){ + line = nullptr; + }else{ + *col_end = '\0'; + line = col_end + 1; + } + } + + template + void parse_line( + char*line, + char**sorted_col, + const std::vector&col_order + ){ + for(std::size_t i=0; i(line, col_begin, col_end); + + if(col_order[i] != -1){ + trim_policy::trim(col_begin, col_end); + quote_policy::unescape(col_begin, col_end); + + sorted_col[col_order[i]] = col_begin; + } + } + if(line != nullptr) + throw io::error::too_many_columns(); + } + + template + void parse_header_line( + char*line, + std::vector&col_order, + const std::string*col_name, + ignore_column ignore_policy + ){ + col_order.clear(); + + bool found[column_count]; + std::fill(found, found + column_count, false); + while(line){ + char*col_begin,*col_end; + chop_next_column(line, col_begin, col_end); + + trim_policy::trim(col_begin, col_end); + quote_policy::unescape(col_begin, col_end); + + for(unsigned i=0; i + void parse(char*col, char &x){ + if(!*col) + throw error::invalid_single_character(); + x = *col; + ++col; + if(*col) + throw error::invalid_single_character(); + } + + template + void parse(char*col, std::string&x){ + x = col; + } + + template + void parse(char*col, const char*&x){ + x = col; + } + + template + void parse(char*col, char*&x){ + x = col; + } + + template + void parse_unsigned_integer(const char*col, T&x){ + x = 0; + while(*col != '\0'){ + if('0' <= *col && *col <= '9'){ + T y = *col - '0'; + if(x > (std::numeric_limits::max()-y)/10){ + overflow_policy::on_overflow(x); + return; + } + x = 10*x+y; + }else + throw error::no_digit(); + ++col; + } + } + + templatevoid parse(char*col, unsigned char &x) + {parse_unsigned_integer(col, x);} + templatevoid parse(char*col, unsigned short &x) + {parse_unsigned_integer(col, x);} + templatevoid parse(char*col, unsigned int &x) + {parse_unsigned_integer(col, x);} + templatevoid parse(char*col, unsigned long &x) + {parse_unsigned_integer(col, x);} + templatevoid parse(char*col, unsigned long long &x) + {parse_unsigned_integer(col, x);} + + template + void parse_signed_integer(const char*col, T&x){ + if(*col == '-'){ + ++col; + + x = 0; + while(*col != '\0'){ + if('0' <= *col && *col <= '9'){ + T y = *col - '0'; + if(x < (std::numeric_limits::min()+y)/10){ + overflow_policy::on_underflow(x); + return; + } + x = 10*x-y; + }else + throw error::no_digit(); + ++col; + } + return; + }else if(*col == '+') + ++col; + parse_unsigned_integer(col, x); + } + + templatevoid parse(char*col, signed char &x) + {parse_signed_integer(col, x);} + templatevoid parse(char*col, signed short &x) + {parse_signed_integer(col, x);} + templatevoid parse(char*col, signed int &x) + {parse_signed_integer(col, x);} + templatevoid parse(char*col, signed long &x) + {parse_signed_integer(col, x);} + templatevoid parse(char*col, signed long long &x) + {parse_signed_integer(col, x);} + + template + void parse_float(const char*col, T&x){ + bool is_neg = false; + if(*col == '-'){ + is_neg = true; + ++col; + }else if(*col == '+') + ++col; + + x = 0; + while('0' <= *col && *col <= '9'){ + int y = *col - '0'; + x *= 10; + x += y; + ++col; + } + + if(*col == '.'|| *col == ','){ + ++col; + T pos = 1; + while('0' <= *col && *col <= '9'){ + pos /= 10; + int y = *col - '0'; + ++col; + x += y*pos; + } + } + + if(*col == 'e' || *col == 'E'){ + ++col; + int e; + + parse_signed_integer(col, e); + + if(e != 0){ + T base; + if(e < 0){ + base = 0.1; + e = -e; + }else{ + base = 10; + } + + while(e != 1){ + if((e & 1) == 0){ + base = base*base; + e >>= 1; + }else{ + x *= base; + --e; + } + } + x *= base; + } + }else{ + if(*col != '\0') + throw error::no_digit(); + } + + if(is_neg) + x = -x; + } + + template void parse(char*col, float&x) { parse_float(col, x); } + template void parse(char*col, double&x) { parse_float(col, x); } + template void parse(char*col, long double&x) { parse_float(col, x); } + + template + void parse(char*col, T&x){ + // GCC evalutes "false" when reading the template and + // "sizeof(T)!=sizeof(T)" only when instantiating it. This is why + // this strange construct is used. + static_assert(sizeof(T)!=sizeof(T), + "Can not parse this type. Only buildin integrals, floats, char, char*, const char* and std::string are supported"); + } + + } + + template, + class quote_policy = no_quote_escape<','>, + class overflow_policy = throw_on_overflow, + class comment_policy = no_comment + > + class CSVReader{ + private: + LineReader in; + + char*(row[column_count]); + std::string column_names[column_count]; + + std::vectorcol_order; + + template + void set_column_names(std::string s, ColNames...cols){ + column_names[column_count-sizeof...(ColNames)-1] = std::move(s); + set_column_names(std::forward(cols)...); + } + + void set_column_names(){} + + + public: + CSVReader() = delete; + CSVReader(const CSVReader&) = delete; + CSVReader&operator=(const CSVReader&); + + template + explicit CSVReader(Args...args):in(std::forward(args)...){ + std::fill(row, row+column_count, nullptr); + col_order.resize(column_count); + for(unsigned i=0; i + void read_header(ignore_column ignore_policy, ColNames...cols){ + static_assert(sizeof...(ColNames)>=column_count, "not enough column names specified"); + static_assert(sizeof...(ColNames)<=column_count, "too many column names specified"); + try{ + set_column_names(std::forward(cols)...); + + char*line; + do{ + line = in.next_line(); + if(!line) + throw error::header_missing(); + }while(comment_policy::is_comment(line)); + + detail::parse_header_line + + (line, col_order, column_names, ignore_policy); + }catch(error::with_file_name&err){ + err.set_file_name(in.get_truncated_file_name()); + throw; + } + } + + template + void set_header(ColNames...cols){ + static_assert(sizeof...(ColNames)>=column_count, + "not enough column names specified"); + static_assert(sizeof...(ColNames)<=column_count, + "too many column names specified"); + set_column_names(std::forward(cols)...); + std::fill(row, row+column_count, nullptr); + col_order.resize(column_count); + for(unsigned i=0; i + void parse_helper(std::size_t r, T&t, ColType&...cols){ + if(row[r]){ + try{ + try{ + row[r] = row[r]; + io::detail::parse(row[r], t); + }catch(error::with_column_content&err){ + err.set_column_content(row[r]); + throw; + } + }catch(error::with_column_name&err){ + err.set_column_name(column_names[r].c_str()); + throw; + } + } + parse_helper(r+1, cols...); + } + + + public: + template + bool read_row(ColType& ...cols){ + static_assert(sizeof...(ColType)>=column_count, + "not enough columns specified"); + static_assert(sizeof...(ColType)<=column_count, + "too many columns specified"); + try{ + try{ + + char*line; + do{ + line = in.next_line(); + if(!line) + return false; + }while(comment_policy::is_comment(line)); + + detail::parse_line + (line, row, col_order); + + parse_helper(0, cols...); + }catch(error::with_file_name&err){ + err.set_file_name(in.get_truncated_file_name()); + throw; + } + }catch(error::with_file_line&err){ + err.set_file_line(in.get_file_line()); + throw; + } + + return true; + } + }; +} +#endif + -- cgit v1.2.3