From 0ae87d354d7e3598270845fc1fd59200ba1057bc Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Fri, 15 Aug 2014 18:46:19 +0200 Subject: [PATCH 01/10] Fixes unicode filepath handling on windows Use shortnames for all ANSI file operations --- file.cpp | 29 +++++++++++++++++++++++++++-- utf8_string.cpp | 41 ++++++++++++++++++++++++++++++++++++++++- utf8_string.hpp | 6 ++++++ 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/file.cpp b/file.cpp index b583643770..5b086547ae 100644 --- a/file.cpp +++ b/file.cpp @@ -17,8 +17,13 @@ #include #include "file.hpp" #include "context.hpp" +#include "utf8_string.hpp" #include "sass2scss/sass2scss.h" +#ifdef _WIN32 +#include +#endif + namespace Sass { namespace File { using namespace std; @@ -222,8 +227,28 @@ namespace Sass { char* read_file(string path) { struct stat st; - if (stat(path.c_str(), &st) == -1 || S_ISDIR(st.st_mode)) return 0; - ifstream file(path.c_str(), ios::in | ios::binary | ios::ate); + + // short path for windows + string spath = path; + +#ifdef _WIN32 + // resolve to short path for ansi compatibility + // do file operations with the short path string + const wchar_t* wpath = UTF_8::convert_to_utf16(path).c_str(); + size_t length = GetShortPathNameW(wpath, NULL, 0); + if (length > 0) { + wchar_t* buffer = new wchar_t[length]; + // result is without terminating null character + size_t result = GetShortPathNameW(wpath, buffer, length); + // check for success and expected result + if (result > 0 && length == result + 1) { + spath = UTF_8::convert_from_utf16(buffer); + } + delete [] buffer; + } +#endif + if (stat(spath.c_str(), &st) == -1 || S_ISDIR(st.st_mode)) return 0; + ifstream file(spath.c_str(), ios::in | ios::binary | ios::ate); string extension; if (path.length() > 5) { extension = path.substr(path.length() - 5, 5); diff --git a/utf8_string.cpp b/utf8_string.cpp index b1f96403ae..2aba30fcea 100644 --- a/utf8_string.cpp +++ b/utf8_string.cpp @@ -2,9 +2,14 @@ #define SASS_UTF8_STRING #include +#include #include #include +#ifdef _WIN32 +#include +#endif + namespace Sass { namespace UTF_8 { using std::string; @@ -101,7 +106,7 @@ namespace Sass { if (index > 0 && index <= signed_len) { // positive and within string length return index-1; - } + } else if (index > signed_len) { // positive and past string length return len; @@ -119,6 +124,40 @@ namespace Sass { } } + #ifdef _WIN32 + + using std::wstring; + + // function to convert utf16le to utf8 + string convert_from_utf16(const wstring& wstr) + { + string convertedString; + int requiredSize = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, 0, 0, 0, 0); + if(requiredSize > 0) + { + std::vector buffer(requiredSize); + WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, &buffer[0], requiredSize, 0, 0); + convertedString.assign(buffer.begin(), buffer.end() - 1); + } + return convertedString; + } + + // function to convert utf8 to utf16le + wstring convert_to_utf16(const string& str) + { + wstring convertedString; + int requiredSize = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, 0, 0); + if(requiredSize > 0) + { + std::vector buffer(requiredSize); + MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, &buffer[0], requiredSize); + convertedString.assign(buffer.begin(), buffer.end() - 1); + } + return convertedString; + } + + #endif + } } diff --git a/utf8_string.hpp b/utf8_string.hpp index 189f61345f..0f46d541a8 100644 --- a/utf8_string.hpp +++ b/utf8_string.hpp @@ -29,6 +29,12 @@ namespace Sass { // function that will return a normalized index, given a crazy one size_t normalize_index(int index, size_t len); + #ifdef _WIN32 + // functions to handle unicode paths on windows + string convert_from_utf16(const wstring& wstr); + wstring convert_to_utf16(const string& str); + #endif + } } From ef184682c65ec3c2b9fdb7f0e4cd88dbe8e3c67d Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Fri, 15 Aug 2014 23:51:37 +0200 Subject: [PATCH 02/10] Fixes utf8 error in str-index function Finding the correct code-point and increasing plus one is the correct way, although it also works by just "skipping" the first code point byte (which is what c_index + 1 does). --- functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/functions.cpp b/functions.cpp index 4ec406b9c7..69b104ee5c 100644 --- a/functions.cpp +++ b/functions.cpp @@ -768,7 +768,7 @@ namespace Sass { if(c_index == string::npos) { return new (ctx.mem) Null(path, position); } - size_t index = UTF_8::code_point_count(str, 0, c_index + 1); + size_t index = UTF_8::code_point_count(str, 0, c_index) + 1; return new (ctx.mem) Number(path, position, index); } From 18ecebd80cc30644659ddde12664ccdd6ae76555 Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Sat, 16 Aug 2014 00:00:23 +0200 Subject: [PATCH 03/10] Replaces native utf8 handling with UTF8-CPP library Remove the hassle to write and maintain code to handle Unicode. Use this light-weight (header-only) library to handle all Unicode related stuff. Can handle invalid Unicode strings correctly and more gracefully. http://utfcpp.sourceforge.net/ --- functions.cpp | 59 +++++++++ utf8.h | 34 +++++ utf8/checked.h | 327 ++++++++++++++++++++++++++++++++++++++++++++++ utf8/core.h | 329 +++++++++++++++++++++++++++++++++++++++++++++++ utf8/unchecked.h | 228 ++++++++++++++++++++++++++++++++ utf8_string.cpp | 119 +++++------------ 6 files changed, 1006 insertions(+), 90 deletions(-) create mode 100644 utf8.h create mode 100644 utf8/checked.h create mode 100644 utf8/core.h create mode 100644 utf8/unchecked.h diff --git a/functions.cpp b/functions.cpp index 69b104ee5c..f91a07ca05 100644 --- a/functions.cpp +++ b/functions.cpp @@ -9,6 +9,7 @@ #include "eval.hpp" #include "util.hpp" #include "utf8_string.hpp" +#include "utf8.h" #include #include @@ -696,6 +697,7 @@ namespace Sass { Signature str_length_sig = "str-length($string)"; BUILT_IN(str_length) { + try { String_Constant* s = ARG("$string", String_Constant); string str = s->value(); size_t length_of_s = str.size(); @@ -709,11 +711,26 @@ namespace Sass { size_t len = UTF_8::code_point_count(str, i, length_of_s); return new (ctx.mem) Number(path, position, len); + + } + catch (utf8::invalid_code_point) { + string msg("utf8::invalid_code_point"); + error(msg, path, position, backtrace); + } + catch (utf8::not_enough_room) { + string msg("utf8::not_enough_room"); + error(msg, path, position, backtrace); + } + catch (utf8::invalid_utf8) { + string msg("utf8::invalid_utf8"); + error(msg, path, position, backtrace); + } } Signature str_insert_sig = "str-insert($string, $insert, $index)"; BUILT_IN(str_insert) { + try { String_Constant* s = ARG("$string", String_Constant); string str = s->value(); char quotemark = s->quote_mark(); @@ -752,11 +769,25 @@ namespace Sass { return new (ctx.mem) String_Constant(path, position, str); + } + catch (utf8::invalid_code_point) { + string msg("utf8::invalid_code_point"); + error(msg, path, position, backtrace); + } + catch (utf8::not_enough_room) { + string msg("utf8::not_enough_room"); + error(msg, path, position, backtrace); + } + catch (utf8::invalid_utf8) { + string msg("utf8::invalid_utf8"); + error(msg, path, position, backtrace); + } } Signature str_index_sig = "str-index($string, $substring)"; BUILT_IN(str_index) { + try { String_Constant* s = ARG("$string", String_Constant); String_Constant* t = ARG("$substring", String_Constant); string str = s->value(); @@ -771,11 +802,26 @@ namespace Sass { size_t index = UTF_8::code_point_count(str, 0, c_index) + 1; return new (ctx.mem) Number(path, position, index); + + } + catch (utf8::invalid_code_point) { + string msg("utf8::invalid_code_point"); + error(msg, path, position, backtrace); + } + catch (utf8::not_enough_room) { + string msg("utf8::not_enough_room"); + error(msg, path, position, backtrace); + } + catch (utf8::invalid_utf8) { + string msg("utf8::invalid_utf8"); + error(msg, path, position, backtrace); + } } Signature str_slice_sig = "str-slice($string, $start-at, $end-at:-1)"; BUILT_IN(str_slice) { + try { String_Constant* s = ARG("$string", String_Constant); Number* n = ARG("$start-at", Number); Number* m = ARG("$end-at", Number); @@ -800,6 +846,19 @@ namespace Sass { return new (ctx.mem) String_Constant(path, position, newstr); + } + catch (utf8::invalid_code_point) { + string msg("utf8::invalid_code_point"); + error(msg, path, position, backtrace); + } + catch (utf8::not_enough_room) { + string msg("utf8::not_enough_room"); + error(msg, path, position, backtrace); + } + catch (utf8::invalid_utf8) { + string msg("utf8::invalid_utf8"); + error(msg, path, position, backtrace); + } } Signature to_upper_case_sig = "to-upper-case($string)"; diff --git a/utf8.h b/utf8.h new file mode 100644 index 0000000000..82b13f59f9 --- /dev/null +++ b/utf8.h @@ -0,0 +1,34 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/utf8/checked.h b/utf8/checked.h new file mode 100644 index 0000000000..1331155138 --- /dev/null +++ b/utf8/checked.h @@ -0,0 +1,327 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + uint32_t cp; + public: + invalid_code_point(uint32_t cp) : cp(cp) {} + virtual const char* what() const throw() { return "Invalid code point"; } + uint32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + uint8_t u8; + public: + invalid_utf8 (uint8_t u) : u8(u) {} + virtual const char* what() const throw() { return "Invalid UTF-8"; } + uint8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + uint16_t u16; + public: + invalid_utf16 (uint16_t u) : u16(u) {} + virtual const char* what() const throw() { return "Invalid UTF-16"; } + uint16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const throw() { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + throw not_enough_room(); + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + template + uint32_t next(octet_iterator& it, octet_iterator end) + { + uint32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(*it); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + uint32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template + uint32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + /// Deprecated in versions that include "prior" + template + uint32_t previous(octet_iterator& it, octet_iterator pass_start) + { + octet_iterator end = it; + while (utf8::internal::is_trail(*(--it))) + if (it == pass_start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return utf8::next(temp, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + for (distance_type i = 0; i < n; ++i) + utf8::next(it, end); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start != end) { + uint32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start != end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& range_start, + const octet_iterator& range_end) : + it(octet_it), range_start(range_start), range_end(range_end) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#endif //header guard + + diff --git a/utf8/core.h b/utf8/core.h new file mode 100644 index 0000000000..693d388c07 --- /dev/null +++ b/utf8/core.h @@ -0,0 +1,329 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include + +namespace utf8 +{ + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers + // You may need to change them to match your system. + // These typedefs have the same names as ones from cstdint, or boost/cstdint + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + +// Helper code - not intended to be directly called by the library users. May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; + + // Maximum valid value for a Unicode code point + const uint32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline uint8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + template + inline uint16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + template + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + template + inline bool is_lead_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } + + template + inline bool is_trail_surrogate(u16 cp) + { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_code_point_valid(u32 cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + template + inline typename std::iterator_traits::difference_type + sequence_length(octet_iterator lead_it) + { + uint8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + template + inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template + utf_error increase_safely(octet_iterator& it, octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + return UTF8_OK; + } + + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + template + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + uint32_t cp = 0; + // Determine the sequence length based on the lead octet + typedef typename std::iterator_traits::difference_type octet_difference_type; + const octet_difference_type length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. + code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + uint32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + //Deprecated in release 2.3 + template + inline bool is_bom (octet_iterator it) + { + return ( + (utf8::internal::mask8(*it++)) == bom[0] && + (utf8::internal::mask8(*it++)) == bom[1] && + (utf8::internal::mask8(*it)) == bom[2] + ); + } +} // namespace utf8 + +#endif // header guard + + diff --git a/utf8/unchecked.h b/utf8/unchecked.h new file mode 100644 index 0000000000..cb2427166b --- /dev/null +++ b/utf8/unchecked.h @@ -0,0 +1,228 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + uint32_t next(octet_iterator& it) + { + uint32_t cp = utf8::internal::mask8(*it); + typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); + switch (length) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + + template + uint32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template + uint32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) + template + inline uint32_t previous(octet_iterator& it) + { + return utf8::unchecked::prior(it); + } + + template + void advance (octet_iterator& it, distance_type n) + { + for (distance_type i = 0; i < n; ++i) + utf8::unchecked::next(it); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + public: + iterator () {} + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + + +#endif // header guard + diff --git a/utf8_string.cpp b/utf8_string.cpp index 2aba30fcea..544c7669bc 100644 --- a/utf8_string.cpp +++ b/utf8_string.cpp @@ -6,9 +6,7 @@ #include #include -#ifdef _WIN32 -#include -#endif +#include "utf8.h" namespace Sass { namespace UTF_8 { @@ -26,76 +24,30 @@ namespace Sass { // function that will count the number of code points (utf-8 characters) from the given beginning to the given end size_t code_point_count(const string& str, size_t start, size_t end) { - size_t len = 0; - size_t i = start; - - while (i < end) { - unsigned char c = static_cast(str[i]); - if (c < 128) { - // it's a single-byte character - ++len; - ++i; - } - // it's a multi byte sequence and presumably it's a leading byte - else { - ++i; // go to the next byte - // see if it's still part of the sequence - while ((i < end) && ((static_cast(str[i]) & 0xC0) == 0x80)) { - ++i; - } - // when it's not [aka a new leading byte], increment and move on - ++len; - } - } - return len; + return utf8::distance(str.begin() + start, str.begin() + end); } size_t code_point_count(const string& str) { - return code_point_count(str, 0, str.length()); + return utf8::distance(str.begin(), str.end()); } - // function that will return the byte offset of a code point in a + // function that will return the byte offset at a code point position size_t code_point_offset_to_byte_offset(const string& str, size_t offset) { - size_t i = 0; - size_t len = 0; - - while (len < offset) { - unsigned char c = static_cast(str[i]); - if (c < 128) { - // it's a single-byte character - ++len; - ++i; - } - // it's a multi byte sequence and presumably it's a leading byte - else { - ++i; // go to the next byte - // see if it's still part of the sequence - while ((i < str.length()) && ((static_cast(str[i]) & 0xC0) == 0x80)) { - ++i; - } - // when it's not [aka a new leading byte], increment and move on - ++len; - } - } - return i; + string::const_iterator it = str.begin(); + utf8::advance(it, offset, str.end()); + return distance(str.begin(), it); } - // function that returns number of bytes in a character in a string + // function that returns number of bytes in a character at offset size_t length_of_code_point_at(const string& str, size_t pos) { - unsigned char c = static_cast(str[pos]); - size_t i = 0; - if(c < 128) { - return 1; - } else { - ++i; // go to the next byte - ++pos; - // see if it's still part of the sequence - while ((i < str.length()) && ((static_cast(str[pos]) & 0xC0) == 0x80)) { - ++i; - ++pos; - } - } - return i; + // get iterator from string and forward by offset + string::const_iterator stop = str.begin() + pos; + // check if beyond boundary + if (stop == str.end()) return 0; + // advance by one code point + utf8::advance(stop, 1, str.end()); + // calculate poset for code point + return stop - str.begin() - pos; } // function that will return a normalized index, given a crazy one @@ -124,40 +76,27 @@ namespace Sass { } } - #ifdef _WIN32 - + // utf16 functions using std::wstring; - // function to convert utf16le to utf8 - string convert_from_utf16(const wstring& wstr) + // convert from utf16/wide string to utf8 string + string convert_from_utf16(const wstring& utf16) { - string convertedString; - int requiredSize = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, 0, 0, 0, 0); - if(requiredSize > 0) - { - std::vector buffer(requiredSize); - WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, &buffer[0], requiredSize, 0, 0); - convertedString.assign(buffer.begin(), buffer.end() - 1); - } - return convertedString; + string utf8; + utf8::utf16to8(utf16.begin(), utf16.end(), + back_inserter(utf8)); + return utf8; } - // function to convert utf8 to utf16le - wstring convert_to_utf16(const string& str) + // convert from utf8 string to utf16/wide string + wstring convert_to_utf16(const string& utf8) { - wstring convertedString; - int requiredSize = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, 0, 0); - if(requiredSize > 0) - { - std::vector buffer(requiredSize); - MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, &buffer[0], requiredSize); - convertedString.assign(buffer.begin(), buffer.end() - 1); - } - return convertedString; + wstring utf16; + utf8::utf8to16(utf8.begin(), utf8.end(), + back_inserter(utf16)); + return utf16; } - #endif - } } From 8d76b0fa4f6abdfa22a25d442b70fd299181f78d Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Sat, 16 Aug 2014 00:02:29 +0200 Subject: [PATCH 04/10] Indents code in try blocks --- functions.cpp | 162 +++++++++++++++++++++++++------------------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/functions.cpp b/functions.cpp index f91a07ca05..3f9154bbf8 100644 --- a/functions.cpp +++ b/functions.cpp @@ -698,19 +698,19 @@ namespace Sass { BUILT_IN(str_length) { try { - String_Constant* s = ARG("$string", String_Constant); - string str = s->value(); - size_t length_of_s = str.size(); - size_t i = 0; - - if (s->is_quoted()) { - ++i; - --length_of_s; - } + String_Constant* s = ARG("$string", String_Constant); + string str = s->value(); + size_t length_of_s = str.size(); + size_t i = 0; + + if (s->is_quoted()) { + ++i; + --length_of_s; + } - size_t len = UTF_8::code_point_count(str, i, length_of_s); + size_t len = UTF_8::code_point_count(str, i, length_of_s); - return new (ctx.mem) Number(path, position, len); + return new (ctx.mem) Number(path, position, len); } catch (utf8::invalid_code_point) { @@ -731,43 +731,43 @@ namespace Sass { BUILT_IN(str_insert) { try { - String_Constant* s = ARG("$string", String_Constant); - string str = s->value(); - char quotemark = s->quote_mark(); - str = unquote(str); - String_Constant* i = ARG("$insert", String_Constant); - string ins = i->value(); - ins = unquote(ins); - Number* ind = ARG("$index", Number); - double index = ind->value(); - size_t len = UTF_8::code_point_count(str, 0, str.size()); - - if (index > 0 && index <= len) { - // positive and within string length - str.insert(UTF_8::code_point_offset_to_byte_offset(str, index-1), ins); - } - else if (index > len) { - // positive and past string length - str += ins; - } - else if (index == 0) { - str = ins + str; - } - else if (std::abs(index) <= len) { - // negative and within string length - index += len + 1; - str.insert(UTF_8::code_point_offset_to_byte_offset(str, index), ins); - } - else { - // negative and past string length - str = ins + str; - } + String_Constant* s = ARG("$string", String_Constant); + string str = s->value(); + char quotemark = s->quote_mark(); + str = unquote(str); + String_Constant* i = ARG("$insert", String_Constant); + string ins = i->value(); + ins = unquote(ins); + Number* ind = ARG("$index", Number); + double index = ind->value(); + size_t len = UTF_8::code_point_count(str, 0, str.size()); + + if (index > 0 && index <= len) { + // positive and within string length + str.insert(UTF_8::code_point_offset_to_byte_offset(str, index-1), ins); + } + else if (index > len) { + // positive and past string length + str += ins; + } + else if (index == 0) { + str = ins + str; + } + else if (std::abs(index) <= len) { + // negative and within string length + index += len + 1; + str.insert(UTF_8::code_point_offset_to_byte_offset(str, index), ins); + } + else { + // negative and past string length + str = ins + str; + } - if (quotemark) { - str = quote(str, quotemark); - } + if (quotemark) { + str = quote(str, quotemark); + } - return new (ctx.mem) String_Constant(path, position, str); + return new (ctx.mem) String_Constant(path, position, str); } catch (utf8::invalid_code_point) { @@ -788,20 +788,20 @@ namespace Sass { BUILT_IN(str_index) { try { - String_Constant* s = ARG("$string", String_Constant); - String_Constant* t = ARG("$substring", String_Constant); - string str = s->value(); - str = unquote(str); - string substr = t->value(); - substr = unquote(substr); - - size_t c_index = str.find(substr); - if(c_index == string::npos) { - return new (ctx.mem) Null(path, position); - } - size_t index = UTF_8::code_point_count(str, 0, c_index) + 1; + String_Constant* s = ARG("$string", String_Constant); + String_Constant* t = ARG("$substring", String_Constant); + string str = s->value(); + str = unquote(str); + string substr = t->value(); + substr = unquote(substr); + + size_t c_index = str.find(substr); + if(c_index == string::npos) { + return new (ctx.mem) Null(path, position); + } + size_t index = UTF_8::code_point_count(str, 0, c_index) + 1; - return new (ctx.mem) Number(path, position, index); + return new (ctx.mem) Number(path, position, index); } catch (utf8::invalid_code_point) { @@ -822,29 +822,29 @@ namespace Sass { BUILT_IN(str_slice) { try { - String_Constant* s = ARG("$string", String_Constant); - Number* n = ARG("$start-at", Number); - Number* m = ARG("$end-at", Number); - - string str = s->value(); - char quotemark = s->quote_mark(); - str = unquote(str); - - // normalize into 0-based indices - size_t start = UTF_8::code_point_offset_to_byte_offset(str, UTF_8::normalize_index(n->value(), UTF_8::code_point_count(str))); - size_t end = UTF_8::code_point_offset_to_byte_offset(str, UTF_8::normalize_index(m->value(), UTF_8::code_point_count(str))); - - string newstr; - if(start - end == 0) { - newstr = str.substr(start, end - start); - } else { - newstr = str.substr(start, end - start + UTF_8::length_of_code_point_at(str, end)); - } - if(quotemark) { - newstr = quote(newstr, quotemark); - } + String_Constant* s = ARG("$string", String_Constant); + Number* n = ARG("$start-at", Number); + Number* m = ARG("$end-at", Number); + + string str = s->value(); + char quotemark = s->quote_mark(); + str = unquote(str); + + // normalize into 0-based indices + size_t start = UTF_8::code_point_offset_to_byte_offset(str, UTF_8::normalize_index(n->value(), UTF_8::code_point_count(str))); + size_t end = UTF_8::code_point_offset_to_byte_offset(str, UTF_8::normalize_index(m->value(), UTF_8::code_point_count(str))); + + string newstr; + if(start - end == 0) { + newstr = str.substr(start, end - start); + } else { + newstr = str.substr(start, end - start + UTF_8::length_of_code_point_at(str, end)); + } + if(quotemark) { + newstr = quote(newstr, quotemark); + } - return new (ctx.mem) String_Constant(path, position, newstr); + return new (ctx.mem) String_Constant(path, position, newstr); } catch (utf8::invalid_code_point) { From c9aa5411825e70800e612ccadf752dc9c3d31de9 Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Sat, 16 Aug 2014 00:23:07 +0200 Subject: [PATCH 05/10] Renames some utf8 helper functions The names were (IMO) not very intuitive. I tried to apply some meaningful conventions. --- functions.cpp | 10 +++++----- utf8_string.cpp | 25 ++++++++++--------------- utf8_string.hpp | 17 ++++++----------- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/functions.cpp b/functions.cpp index 3f9154bbf8..89c05362b5 100644 --- a/functions.cpp +++ b/functions.cpp @@ -744,7 +744,7 @@ namespace Sass { if (index > 0 && index <= len) { // positive and within string length - str.insert(UTF_8::code_point_offset_to_byte_offset(str, index-1), ins); + str.insert(UTF_8::offset_at_position(str, index - 1), ins); } else if (index > len) { // positive and past string length @@ -756,7 +756,7 @@ namespace Sass { else if (std::abs(index) <= len) { // negative and within string length index += len + 1; - str.insert(UTF_8::code_point_offset_to_byte_offset(str, index), ins); + str.insert(UTF_8::offset_at_position(str, index), ins); } else { // negative and past string length @@ -831,14 +831,14 @@ namespace Sass { str = unquote(str); // normalize into 0-based indices - size_t start = UTF_8::code_point_offset_to_byte_offset(str, UTF_8::normalize_index(n->value(), UTF_8::code_point_count(str))); - size_t end = UTF_8::code_point_offset_to_byte_offset(str, UTF_8::normalize_index(m->value(), UTF_8::code_point_count(str))); + size_t start = UTF_8::offset_at_position(str, UTF_8::normalize_index(n->value(), UTF_8::code_point_count(str))); + size_t end = UTF_8::offset_at_position(str, UTF_8::normalize_index(m->value(), UTF_8::code_point_count(str))); string newstr; if(start - end == 0) { newstr = str.substr(start, end - start); } else { - newstr = str.substr(start, end - start + UTF_8::length_of_code_point_at(str, end)); + newstr = str.substr(start, end - start + UTF_8::code_point_size_at_offset(str, end)); } if(quotemark) { newstr = quote(newstr, quotemark); diff --git a/utf8_string.cpp b/utf8_string.cpp index 544c7669bc..1ede66ac32 100644 --- a/utf8_string.cpp +++ b/utf8_string.cpp @@ -11,16 +11,11 @@ namespace Sass { namespace UTF_8 { using std::string; - // class utf8_string { - // string s_; - // public: - // utf8_string(const string &s): s_(s) {} - // utf8_string(const char* c): s_(string(c)) {} - // char operator[](size_t i); - // size_t length(); - // size_t byte_to_char(size_t i); - // }; + // naming conventions: + // offset: raw byte offset (0 based) + // position: code point offset (0 based) + // index: code point offset (1 based or negative) // function that will count the number of code points (utf-8 characters) from the given beginning to the given end size_t code_point_count(const string& str, size_t start, size_t end) { @@ -32,22 +27,22 @@ namespace Sass { } // function that will return the byte offset at a code point position - size_t code_point_offset_to_byte_offset(const string& str, size_t offset) { + size_t offset_at_position(const string& str, size_t position) { string::const_iterator it = str.begin(); - utf8::advance(it, offset, str.end()); + utf8::advance(it, position, str.end()); return distance(str.begin(), it); } // function that returns number of bytes in a character at offset - size_t length_of_code_point_at(const string& str, size_t pos) { + size_t code_point_size_at_offset(const string& str, size_t offset) { // get iterator from string and forward by offset - string::const_iterator stop = str.begin() + pos; + string::const_iterator stop = str.begin() + offset; // check if beyond boundary if (stop == str.end()) return 0; // advance by one code point utf8::advance(stop, 1, str.end()); - // calculate poset for code point - return stop - str.begin() - pos; + // calculate offset for code point + return stop - str.begin() - offset; } // function that will return a normalized index, given a crazy one diff --git a/utf8_string.hpp b/utf8_string.hpp index 0f46d541a8..89493927cc 100644 --- a/utf8_string.hpp +++ b/utf8_string.hpp @@ -5,26 +5,21 @@ namespace Sass { namespace UTF_8 { - // class utf8_string { - // string s_; - // public: - // utf8_string(const string &s): s_(s) {} - // utf8_string(const char* c): s_(string(c)) {} - // char operator[](size_t i); - // size_t length(); - // size_t byte_to_char(size_t i); - // }; + // naming conventions: + // offset: raw byte offset (0 based) + // position: code point offset (0 based) + // index: code point offset (1 based or negative) // function that will count the number of code points (utf-8 characters) from the beginning to the given end size_t code_point_count(const string& str, size_t start, size_t end); size_t code_point_count(const string& str); // function that will return the byte offset of a code point in a - size_t code_point_offset_to_byte_offset(const string& str, size_t offset); + size_t offset_at_position(const string& str, size_t position); // function that returns number of bytes in a character in a string - size_t length_of_code_point_at(const string& str, size_t pos); + size_t code_point_size_at_offset(const string& str, size_t offset); // function that will return a normalized index, given a crazy one size_t normalize_index(int index, size_t len); From 52086850e6f02fe308c67dca8a5f44a6f9f734ee Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Sat, 16 Aug 2014 01:54:49 +0200 Subject: [PATCH 06/10] Uses native windows API for file IO with Unicode paths Shortnames may become obsolete or can be disabled. This implementation should be more future proof! --- file.cpp | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/file.cpp b/file.cpp index 5b086547ae..95cebcefbe 100644 --- a/file.cpp +++ b/file.cpp @@ -228,31 +228,23 @@ namespace Sass { { struct stat st; - // short path for windows - string spath = path; - #ifdef _WIN32 - // resolve to short path for ansi compatibility - // do file operations with the short path string + BYTE* pBuffer; + DWORD dwBytes; + // windows unicode filepaths are encoded in utf16 const wchar_t* wpath = UTF_8::convert_to_utf16(path).c_str(); - size_t length = GetShortPathNameW(wpath, NULL, 0); - if (length > 0) { - wchar_t* buffer = new wchar_t[length]; - // result is without terminating null character - size_t result = GetShortPathNameW(wpath, buffer, length); - // check for success and expected result - if (result > 0 && length == result + 1) { - spath = UTF_8::convert_from_utf16(buffer); - } - delete [] buffer; - } -#endif - if (stat(spath.c_str(), &st) == -1 || S_ISDIR(st.st_mode)) return 0; - ifstream file(spath.c_str(), ios::in | ios::binary | ios::ate); - string extension; - if (path.length() > 5) { - extension = path.substr(path.length() - 5, 5); - } + HANDLE hFile = CreateFileW(wpath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL); + if (hFile == INVALID_HANDLE_VALUE) return 0; + DWORD dwFileLength = GetFileSize(hFile, NULL); + if (dwFileLength == INVALID_FILE_SIZE) return 0; + pBuffer = new BYTE[dwFileLength + 1]; + ReadFile(hFile, pBuffer, dwFileLength, &dwBytes, NULL); + pBuffer[dwFileLength] = '\0'; + // just convert from unsigned char* + char* contents = (char*) pBuffer; +#else + if (stat(path.c_str(), &st) == -1 || S_ISDIR(st.st_mode)) return 0; + ifstream file(path.c_str(), ios::in | ios::binary | ios::ate); char* contents = 0; if (file.is_open()) { size_t size = file.tellg(); @@ -262,6 +254,11 @@ namespace Sass { contents[size] = '\0'; file.close(); } +#endif + string extension; + if (path.length() > 5) { + extension = path.substr(path.length() - 5, 5); + } for(size_t i=0; i Date: Sat, 16 Aug 2014 02:17:06 +0200 Subject: [PATCH 07/10] Reorganizes around try/catch to avoid clang warnings Calling error in catch will throw, but clang doesn't know! --- functions.cpp | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/functions.cpp b/functions.cpp index 89c05362b5..5a731a48dd 100644 --- a/functions.cpp +++ b/functions.cpp @@ -697,6 +697,7 @@ namespace Sass { Signature str_length_sig = "str-length($string)"; BUILT_IN(str_length) { + size_t len; try { String_Constant* s = ARG("$string", String_Constant); string str = s->value(); @@ -708,9 +709,7 @@ namespace Sass { --length_of_s; } - size_t len = UTF_8::code_point_count(str, i, length_of_s); - - return new (ctx.mem) Number(path, position, len); + len = UTF_8::code_point_count(str, i, length_of_s); } catch (utf8::invalid_code_point) { @@ -725,14 +724,16 @@ namespace Sass { string msg("utf8::invalid_utf8"); error(msg, path, position, backtrace); } + return new (ctx.mem) Number(path, position, len); } Signature str_insert_sig = "str-insert($string, $insert, $index)"; BUILT_IN(str_insert) { + string str; try { String_Constant* s = ARG("$string", String_Constant); - string str = s->value(); + str = s->value(); char quotemark = s->quote_mark(); str = unquote(str); String_Constant* i = ARG("$insert", String_Constant); @@ -766,9 +767,6 @@ namespace Sass { if (quotemark) { str = quote(str, quotemark); } - - return new (ctx.mem) String_Constant(path, position, str); - } catch (utf8::invalid_code_point) { string msg("utf8::invalid_code_point"); @@ -782,11 +780,13 @@ namespace Sass { string msg("utf8::invalid_utf8"); error(msg, path, position, backtrace); } + return new (ctx.mem) String_Constant(path, position, str); } Signature str_index_sig = "str-index($string, $substring)"; BUILT_IN(str_index) { + size_t index; try { String_Constant* s = ARG("$string", String_Constant); String_Constant* t = ARG("$substring", String_Constant); @@ -799,10 +799,7 @@ namespace Sass { if(c_index == string::npos) { return new (ctx.mem) Null(path, position); } - size_t index = UTF_8::code_point_count(str, 0, c_index) + 1; - - return new (ctx.mem) Number(path, position, index); - + index = UTF_8::code_point_count(str, 0, c_index) + 1; } catch (utf8::invalid_code_point) { string msg("utf8::invalid_code_point"); @@ -816,11 +813,14 @@ namespace Sass { string msg("utf8::invalid_utf8"); error(msg, path, position, backtrace); } + // return something even even we had an error + return new (ctx.mem) Number(path, position, index); } Signature str_slice_sig = "str-slice($string, $start-at, $end-at:-1)"; BUILT_IN(str_slice) { + string newstr; try { String_Constant* s = ARG("$string", String_Constant); Number* n = ARG("$start-at", Number); @@ -834,7 +834,6 @@ namespace Sass { size_t start = UTF_8::offset_at_position(str, UTF_8::normalize_index(n->value(), UTF_8::code_point_count(str))); size_t end = UTF_8::offset_at_position(str, UTF_8::normalize_index(m->value(), UTF_8::code_point_count(str))); - string newstr; if(start - end == 0) { newstr = str.substr(start, end - start); } else { @@ -843,9 +842,6 @@ namespace Sass { if(quotemark) { newstr = quote(newstr, quotemark); } - - return new (ctx.mem) String_Constant(path, position, newstr); - } catch (utf8::invalid_code_point) { string msg("utf8::invalid_code_point"); @@ -859,6 +855,7 @@ namespace Sass { string msg("utf8::invalid_utf8"); error(msg, path, position, backtrace); } + return new (ctx.mem) String_Constant(path, position, newstr); } Signature to_upper_case_sig = "to-upper-case($string)"; From 0d5549f1d17b426e8ec06808637c69ffec359757 Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Sat, 16 Aug 2014 02:36:17 +0200 Subject: [PATCH 08/10] Adds performance optimization to UTF16 converters --- utf8_string.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utf8_string.cpp b/utf8_string.cpp index 1ede66ac32..4166bc9480 100644 --- a/utf8_string.cpp +++ b/utf8_string.cpp @@ -78,6 +78,8 @@ namespace Sass { string convert_from_utf16(const wstring& utf16) { string utf8; + // pre-allocate expected memory + utf8.reserve(sizeof(utf16)/2); utf8::utf16to8(utf16.begin(), utf16.end(), back_inserter(utf8)); return utf8; @@ -87,6 +89,8 @@ namespace Sass { wstring convert_to_utf16(const string& utf8) { wstring utf16; + // pre-allocate expected memory + utf16.reserve(code_point_count(utf8)*2); utf8::utf8to16(utf8.begin(), utf8.end(), back_inserter(utf16)); return utf16; From 7f9045fa77fad0cdf7313f7bc7e05d02f3828af2 Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Thu, 4 Sep 2014 00:38:38 +0200 Subject: [PATCH 09/10] Updates sass2scss to latest tagged version (0.9.1) --- sass2scss | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sass2scss b/sass2scss index a35b799f49..0276e2033b 160000 --- a/sass2scss +++ b/sass2scss @@ -1 +1 @@ -Subproject commit a35b799f49d7d335ddc2b765b13d7d5db1d1247f +Subproject commit 0276e2033b9b1810f4ebd0f174bcf63bcbe6819e From 1c87aeedd475abf2b39bd0642d7457200964219d Mon Sep 17 00:00:00 2001 From: Marcel Greter Date: Wed, 24 Sep 2014 14:10:05 +0200 Subject: [PATCH 10/10] Fixes file handle closing on file reading Forgotten to close the handle in windows implementation. --- file.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/file.cpp b/file.cpp index 95cebcefbe..f55ad0963a 100644 --- a/file.cpp +++ b/file.cpp @@ -240,6 +240,7 @@ namespace Sass { pBuffer = new BYTE[dwFileLength + 1]; ReadFile(hFile, pBuffer, dwFileLength, &dwBytes, NULL); pBuffer[dwFileLength] = '\0'; + CloseHandle(hFile); // just convert from unsigned char* char* contents = (char*) pBuffer; #else