diff --git a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp index 1e5d7029a..d970f5889 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/sniffer/type_detection.cpp @@ -318,6 +318,7 @@ void CSVSniffer::SniffTypes(DataChunk &data_chunk, CSVStateMachine &state_machin unordered_map> &info_sql_types_candidates, idx_t start_idx_detection) { const idx_t chunk_size = data_chunk.size(); + HasType has_type; for (idx_t col_idx = 0; col_idx < data_chunk.ColumnCount(); col_idx++) { auto &cur_vector = data_chunk.data[col_idx]; D_ASSERT(cur_vector.GetVectorType() == VectorType::FLAT_VECTOR); @@ -339,8 +340,8 @@ void CSVSniffer::SniffTypes(DataChunk &data_chunk, CSVStateMachine &state_machin // If Value is not Null, Has a numeric date format, and the current investigated candidate is // either a timestamp or a date if (null_mask.RowIsValid(row_idx) && StartsWithNumericDate(separator, vector_data[row_idx]) && - (col_type_candidates.back().id() == LogicalTypeId::TIMESTAMP || - col_type_candidates.back().id() == LogicalTypeId::DATE)) { + ((col_type_candidates.back().id() == LogicalTypeId::TIMESTAMP && !has_type.timestamp) || + (col_type_candidates.back().id() == LogicalTypeId::DATE && !has_type.date))) { DetectDateAndTimeStampFormats(state_machine, sql_type, separator, vector_data[row_idx]); } // try cast from string to sql_type @@ -364,6 +365,12 @@ void CSVSniffer::SniffTypes(DataChunk &data_chunk, CSVStateMachine &state_machin col_type_candidates.pop_back(); } } + if (col_type_candidates.back().id() == LogicalTypeId::DATE) { + has_type.date = true; + } + if (col_type_candidates.back().id() == LogicalTypeId::TIMESTAMP) { + has_type.timestamp = true; + } } } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index a3fa5f3ed..382ba725f 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "1-dev4041" +#define DUCKDB_PATCH_VERSION "1-dev4052" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 0 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.0.1-dev4041" +#define DUCKDB_VERSION "v1.0.1-dev4052" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "ad7df1eabc" +#define DUCKDB_SOURCE_ID "f5ab7c167e" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/common/operator/double_cast_operator.hpp b/src/duckdb/src/include/duckdb/common/operator/double_cast_operator.hpp index af87eaafa..733a95d44 100644 --- a/src/duckdb/src/include/duckdb/common/operator/double_cast_operator.hpp +++ b/src/duckdb/src/include/duckdb/common/operator/double_cast_operator.hpp @@ -10,6 +10,7 @@ #include "duckdb.h" #include "fast_float/fast_float.h" +#include "duckdb/common/string_util.hpp" namespace duckdb { template @@ -37,7 +38,7 @@ static bool TryDoubleCast(const char *buf, idx_t len, T &result, bool strict, ch } } auto endptr = buf + len; - auto parse_result = duckdb_fast_float::from_chars(buf, buf + len, result, decimal_separator); + auto parse_result = duckdb_fast_float::from_chars(buf, buf + len, result, strict, decimal_separator); if (parse_result.ec != std::errc()) { return false; } diff --git a/src/duckdb/src/include/duckdb/common/operator/integer_cast_operator.hpp b/src/duckdb/src/include/duckdb/common/operator/integer_cast_operator.hpp index decec991f..8bf694c42 100644 --- a/src/duckdb/src/include/duckdb/common/operator/integer_cast_operator.hpp +++ b/src/duckdb/src/include/duckdb/common/operator/integer_cast_operator.hpp @@ -302,7 +302,7 @@ static bool IntegerCastLoop(const char *buf, idx_t len, T &result, bool strict) return false; } - if (pos != len && buf[pos] == '_') { + if (pos != len && buf[pos] == '_' && !strict) { // Skip one underscore if it is not the last character and followed by a digit pos++; if (pos == len || !StringUtil::CharacterIsDigit(buf[pos])) { diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_sniffer.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_sniffer.hpp index 83e4cf27c..45ed78f7d 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_sniffer.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_sniffer.hpp @@ -20,6 +20,7 @@ struct DateTimestampSniffing { bool initialized = false; bool had_match = false; vector format; + idx_t initial_size; }; //! Struct to store the result of the Sniffer struct SnifferResult { @@ -107,6 +108,12 @@ struct SetColumns { } }; +//! Struct used to know if we have a date or timestamp type already identified in this CSV File +struct HasType { + bool date = false; + bool timestamp = false; +}; + //! Sniffer that detects Header, Dialect and Types of CSV Files class CSVSniffer { public: diff --git a/src/duckdb/third_party/fast_float/fast_float/fast_float.h b/src/duckdb/third_party/fast_float/fast_float/fast_float.h index b5e048289..0ce7d21ad 100644 --- a/src/duckdb/third_party/fast_float/fast_float/fast_float.h +++ b/src/duckdb/third_party/fast_float/fast_float/fast_float.h @@ -75,7 +75,7 @@ struct from_chars_result { */ template from_chars_result from_chars(const char *first, const char *last, - T &value, + T &value, bool strict=false, const char decimal_separator = '.', chars_format fmt = chars_format::general) noexcept; @@ -504,7 +504,7 @@ struct parsed_number_string { // Assuming that you use no more than 19 digits, this will // parse an ASCII string. fastfloat_really_inline -parsed_number_string parse_number_string(const char *p, const char *pend, const char decimal_separator, chars_format fmt) noexcept { +parsed_number_string parse_number_string(const char *p, const char *pend, const char decimal_separator, chars_format fmt, bool strict) noexcept { parsed_number_string answer; answer.valid = false; answer.too_many_digits = false; @@ -530,6 +530,10 @@ parsed_number_string parse_number_string(const char *p, const char *pend, const uint64_t(*p - '0'); // might overflow, we will handle the overflow later ++p; if(p != pend && *p == '_') { + if (strict) { + answer.valid = false; + return answer; + } // skip 1 underscore if it is not the last character and followed by a digit ++p; if(p == pend || !is_integer(*p)) { @@ -565,6 +569,10 @@ parsed_number_string parse_number_string(const char *p, const char *pend, const i = i * 10 + digit; // in rare cases, this will overflow, but that's ok if(p != pend && *p == '_') { + if (strict) { + answer.valid = false; + return answer; + } // skip 1 underscore if it is not the last character and followed by a digit ++p; ++skipped_underscores; @@ -611,6 +619,10 @@ parsed_number_string parse_number_string(const char *p, const char *pend, const ++p; if(p != pend && *p == '_') { + if (strict) { + answer.valid = false; + return answer; + } // skip 1 underscore if it is not the last character and followed by a digit ++p; if(p == pend || !is_integer(*p)) { @@ -659,6 +671,10 @@ parsed_number_string parse_number_string(const char *p, const char *pend, const ++p; if(p != pend && *p == '_') { + if (strict) { + answer.valid = false; + return answer; + } // skip 1 underscore if it is not the last character and followed by a digit ++p; if(p == pend || !is_integer(*p)) { @@ -2445,7 +2461,7 @@ fastfloat_really_inline void to_float(bool negative, adjusted_mantissa am, T &va template from_chars_result from_chars(const char *first, const char *last, - T &value, const char decimal_separator, chars_format fmt + T &value, bool strict, const char decimal_separator, chars_format fmt /*= chars_format::general*/) noexcept { static_assert (std::is_same::value || std::is_same::value, "only float and double are supported"); @@ -2456,7 +2472,7 @@ from_chars_result from_chars(const char *first, const char *last, answer.ptr = first; return answer; } - parsed_number_string pns = parse_number_string(first, last, decimal_separator, fmt); + parsed_number_string pns = parse_number_string(first, last, decimal_separator, fmt, strict); if (!pns.valid) { return detail::parse_infnan(first, last, value); }