diff --git a/file.cpp b/file.cpp index b583643770..f55ad0963a 100644 --- a/file.cpp +++ b/file.cpp @@ -17,8 +17,13 @@ #include #include "file.hpp" #include "context.hpp" +#include "utf8_string.hpp" #include "sass2scss/sass2scss.h" +#ifdef _WIN32 +#include +#endif + namespace Sass { namespace File { using namespace std; @@ -222,12 +227,25 @@ namespace Sass { char* read_file(string path) { struct stat st; + +#ifdef _WIN32 + BYTE* pBuffer; + DWORD dwBytes; + // windows unicode filepaths are encoded in utf16 + const wchar_t* wpath = UTF_8::convert_to_utf16(path).c_str(); + HANDLE hFile = CreateFileW(wpath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL); + if (hFile == INVALID_HANDLE_VALUE) return 0; + DWORD dwFileLength = GetFileSize(hFile, NULL); + if (dwFileLength == INVALID_FILE_SIZE) return 0; + pBuffer = new BYTE[dwFileLength + 1]; + ReadFile(hFile, pBuffer, dwFileLength, &dwBytes, NULL); + pBuffer[dwFileLength] = '\0'; + CloseHandle(hFile); + // just convert from unsigned char* + char* contents = (char*) pBuffer; +#else if (stat(path.c_str(), &st) == -1 || S_ISDIR(st.st_mode)) return 0; ifstream file(path.c_str(), ios::in | ios::binary | ios::ate); - string extension; - if (path.length() > 5) { - extension = path.substr(path.length() - 5, 5); - } char* contents = 0; if (file.is_open()) { size_t size = file.tellg(); @@ -237,6 +255,11 @@ namespace Sass { contents[size] = '\0'; file.close(); } +#endif + string extension; + if (path.length() > 5) { + extension = path.substr(path.length() - 5, 5); + } for(size_t i=0; i #include @@ -696,110 +697,165 @@ namespace Sass { Signature str_length_sig = "str-length($string)"; BUILT_IN(str_length) { - String_Constant* s = ARG("$string", String_Constant); - string str = s->value(); - size_t length_of_s = str.size(); - size_t i = 0; - - if (s->is_quoted()) { - ++i; - --length_of_s; - } + size_t len; + try { + String_Constant* s = ARG("$string", String_Constant); + string str = s->value(); + size_t length_of_s = str.size(); + size_t i = 0; + + if (s->is_quoted()) { + ++i; + --length_of_s; + } - size_t len = UTF_8::code_point_count(str, i, length_of_s); + len = UTF_8::code_point_count(str, i, length_of_s); + } + catch (utf8::invalid_code_point) { + string msg("utf8::invalid_code_point"); + error(msg, path, position, backtrace); + } + catch (utf8::not_enough_room) { + string msg("utf8::not_enough_room"); + error(msg, path, position, backtrace); + } + catch (utf8::invalid_utf8) { + string msg("utf8::invalid_utf8"); + error(msg, path, position, backtrace); + } return new (ctx.mem) Number(path, position, len); } Signature str_insert_sig = "str-insert($string, $insert, $index)"; BUILT_IN(str_insert) { - String_Constant* s = ARG("$string", String_Constant); - string str = s->value(); - char quotemark = s->quote_mark(); - str = unquote(str); - String_Constant* i = ARG("$insert", String_Constant); - string ins = i->value(); - ins = unquote(ins); - Number* ind = ARG("$index", Number); - double index = ind->value(); - size_t len = UTF_8::code_point_count(str, 0, str.size()); - - if (index > 0 && index <= len) { - // positive and within string length - str.insert(UTF_8::code_point_offset_to_byte_offset(str, index-1), ins); - } - else if (index > len) { - // positive and past string length - str += ins; - } - else if (index == 0) { - str = ins + str; - } - else if (std::abs(index) <= len) { - // negative and within string length - index += len + 1; - str.insert(UTF_8::code_point_offset_to_byte_offset(str, index), ins); + string str; + try { + String_Constant* s = ARG("$string", String_Constant); + str = s->value(); + char quotemark = s->quote_mark(); + str = unquote(str); + String_Constant* i = ARG("$insert", String_Constant); + string ins = i->value(); + ins = unquote(ins); + Number* ind = ARG("$index", Number); + double index = ind->value(); + size_t len = UTF_8::code_point_count(str, 0, str.size()); + + if (index > 0 && index <= len) { + // positive and within string length + str.insert(UTF_8::offset_at_position(str, index - 1), ins); + } + else if (index > len) { + // positive and past string length + str += ins; + } + else if (index == 0) { + str = ins + str; + } + else if (std::abs(index) <= len) { + // negative and within string length + index += len + 1; + str.insert(UTF_8::offset_at_position(str, index), ins); + } + else { + // negative and past string length + str = ins + str; + } + + if (quotemark) { + str = quote(str, quotemark); + } } - else { - // negative and past string length - str = ins + str; + catch (utf8::invalid_code_point) { + string msg("utf8::invalid_code_point"); + error(msg, path, position, backtrace); } - - if (quotemark) { - str = quote(str, quotemark); + catch (utf8::not_enough_room) { + string msg("utf8::not_enough_room"); + error(msg, path, position, backtrace); + } + catch (utf8::invalid_utf8) { + string msg("utf8::invalid_utf8"); + error(msg, path, position, backtrace); } - return new (ctx.mem) String_Constant(path, position, str); - } Signature str_index_sig = "str-index($string, $substring)"; BUILT_IN(str_index) { - String_Constant* s = ARG("$string", String_Constant); - String_Constant* t = ARG("$substring", String_Constant); - string str = s->value(); - str = unquote(str); - string substr = t->value(); - substr = unquote(substr); - - size_t c_index = str.find(substr); - if(c_index == string::npos) { - return new (ctx.mem) Null(path, position); + size_t index; + try { + String_Constant* s = ARG("$string", String_Constant); + String_Constant* t = ARG("$substring", String_Constant); + string str = s->value(); + str = unquote(str); + string substr = t->value(); + substr = unquote(substr); + + size_t c_index = str.find(substr); + if(c_index == string::npos) { + return new (ctx.mem) Null(path, position); + } + index = UTF_8::code_point_count(str, 0, c_index) + 1; } - size_t index = UTF_8::code_point_count(str, 0, c_index + 1); - + catch (utf8::invalid_code_point) { + string msg("utf8::invalid_code_point"); + error(msg, path, position, backtrace); + } + catch (utf8::not_enough_room) { + string msg("utf8::not_enough_room"); + error(msg, path, position, backtrace); + } + catch (utf8::invalid_utf8) { + string msg("utf8::invalid_utf8"); + error(msg, path, position, backtrace); + } + // return something even even we had an error return new (ctx.mem) Number(path, position, index); } Signature str_slice_sig = "str-slice($string, $start-at, $end-at:-1)"; BUILT_IN(str_slice) { - String_Constant* s = ARG("$string", String_Constant); - Number* n = ARG("$start-at", Number); - Number* m = ARG("$end-at", Number); - - string str = s->value(); - char quotemark = s->quote_mark(); - str = unquote(str); - - // normalize into 0-based indices - size_t start = UTF_8::code_point_offset_to_byte_offset(str, UTF_8::normalize_index(n->value(), UTF_8::code_point_count(str))); - size_t end = UTF_8::code_point_offset_to_byte_offset(str, UTF_8::normalize_index(m->value(), UTF_8::code_point_count(str))); - string newstr; - if(start - end == 0) { - newstr = str.substr(start, end - start); - } else { - newstr = str.substr(start, end - start + UTF_8::length_of_code_point_at(str, end)); + try { + String_Constant* s = ARG("$string", String_Constant); + Number* n = ARG("$start-at", Number); + Number* m = ARG("$end-at", Number); + + string str = s->value(); + char quotemark = s->quote_mark(); + str = unquote(str); + + // normalize into 0-based indices + size_t start = UTF_8::offset_at_position(str, UTF_8::normalize_index(n->value(), UTF_8::code_point_count(str))); + size_t end = UTF_8::offset_at_position(str, UTF_8::normalize_index(m->value(), UTF_8::code_point_count(str))); + + if(start - end == 0) { + newstr = str.substr(start, end - start); + } else { + newstr = str.substr(start, end - start + UTF_8::code_point_size_at_offset(str, end)); + } + if(quotemark) { + newstr = quote(newstr, quotemark); + } } - if(quotemark) { - newstr = quote(newstr, quotemark); + catch (utf8::invalid_code_point) { + string msg("utf8::invalid_code_point"); + error(msg, path, position, backtrace); + } + catch (utf8::not_enough_room) { + string msg("utf8::not_enough_room"); + error(msg, path, position, backtrace); + } + catch (utf8::invalid_utf8) { + string msg("utf8::invalid_utf8"); + error(msg, path, position, backtrace); } - return new (ctx.mem) String_Constant(path, position, newstr); - } Signature to_upper_case_sig = "to-upper-case($string)"; diff --git a/sass2scss b/sass2scss index a35b799f49..0276e2033b 160000 --- a/sass2scss +++ b/sass2scss @@ -1 +1 @@ -Subproject commit a35b799f49d7d335ddc2b765b13d7d5db1d1247f +Subproject commit 0276e2033b9b1810f4ebd0f174bcf63bcbe6819e diff --git a/utf8.h b/utf8.h new file mode 100644 index 0000000000..82b13f59f9 --- /dev/null +++ b/utf8.h @@ -0,0 +1,34 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/utf8/checked.h b/utf8/checked.h new file mode 100644 index 0000000000..1331155138 --- /dev/null +++ b/utf8/checked.h @@ -0,0 +1,327 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + uint32_t cp; + public: + invalid_code_point(uint32_t cp) : cp(cp) {} + virtual const char* what() const throw() { return "Invalid code point"; } + uint32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + uint8_t u8; + public: + invalid_utf8 (uint8_t u) : u8(u) {} + virtual const char* what() const throw() { return "Invalid UTF-8"; } + uint8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + uint16_t u16; + public: + invalid_utf16 (uint16_t u) : u16(u) {} + virtual const char* what() const throw() { return "Invalid UTF-16"; } + uint16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const throw() { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + throw not_enough_room(); + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + template + uint32_t next(octet_iterator& it, octet_iterator end) + { + uint32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(*it); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template + uint32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template + uint32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + /// Deprecated in versions that include "prior" + template + uint32_t previous(octet_iterator& it, octet_iterator pass_start) + { + octet_iterator end = it; + while (utf8::internal::is_trail(*(--it))) + if (it == pass_start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return utf8::next(temp, end); + } + + template + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + for (distance_type i = 0; i < n; ++i) + utf8::next(it, end); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast(trail_surrogate)); + } + else + throw invalid_utf16(static_cast(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start != end) { + uint32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start != end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& range_start, + const octet_iterator& range_end) : + it(octet_it), range_start(range_start), range_end(range_end) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#endif //header guard + + diff --git a/utf8/core.h b/utf8/core.h new file mode 100644 index 0000000000..693d388c07 --- /dev/null +++ b/utf8/core.h @@ -0,0 +1,329 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include + +namespace utf8 +{ + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers + // You may need to change them to match your system. + // These typedefs have the same names as ones from cstdint, or boost/cstdint + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + +// Helper code - not intended to be directly called by the library users. May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; + + // Maximum valid value for a Unicode code point + const uint32_t CODE_POINT_MAX = 0x0010ffffu; + + template + inline uint8_t mask8(octet_type oc) + { + return static_cast(0xff & oc); + } + template + inline uint16_t mask16(u16_type oc) + { + return static_cast(0xffff & oc); + } + template + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + template + inline bool is_lead_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } + + template + inline bool is_trail_surrogate(u16 cp) + { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template + inline bool is_code_point_valid(u32 cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + template + inline typename std::iterator_traits::difference_type + sequence_length(octet_iterator lead_it) + { + uint8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + template + inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template + utf_error increase_safely(octet_iterator& it, octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + return UTF8_OK; + } + + template + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + template + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + uint32_t cp = 0; + // Determine the sequence length based on the lead octet + typedef typename std::iterator_traits::difference_type octet_difference_type; + const octet_difference_type length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. + code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + uint32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + + template + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + template + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + template + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + //Deprecated in release 2.3 + template + inline bool is_bom (octet_iterator it) + { + return ( + (utf8::internal::mask8(*it++)) == bom[0] && + (utf8::internal::mask8(*it++)) == bom[1] && + (utf8::internal::mask8(*it)) == bom[2] + ); + } +} // namespace utf8 + +#endif // header guard + + diff --git a/utf8/unchecked.h b/utf8/unchecked.h new file mode 100644 index 0000000000..cb2427166b --- /dev/null +++ b/utf8/unchecked.h @@ -0,0 +1,228 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (cp < 0x80) // one octet + *(result++) = static_cast(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast((cp >> 6) | 0xc0); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast((cp >> 12) | 0xe0); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast((cp >> 18) | 0xf0); + *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast((cp & 0x3f) | 0x80); + } + return result; + } + + template + uint32_t next(octet_iterator& it) + { + uint32_t cp = utf8::internal::mask8(*it); + typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); + switch (length) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + + template + uint32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template + uint32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) + template + inline uint32_t previous(octet_iterator& it) + { + return utf8::unchecked::prior(it); + } + + template + void advance (octet_iterator& it, distance_type n) + { + for (distance_type i = 0; i < n; ++i) + utf8::unchecked::next(it); + } + + template + typename std::iterator_traits::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast(cp); + } + return result; + } + + template + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template + class iterator : public std::iterator { + octet_iterator it; + public: + iterator () {} + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + + +#endif // header guard + diff --git a/utf8_string.cpp b/utf8_string.cpp index b1f96403ae..4166bc9480 100644 --- a/utf8_string.cpp +++ b/utf8_string.cpp @@ -2,95 +2,47 @@ #define SASS_UTF8_STRING #include +#include #include #include +#include "utf8.h" + namespace Sass { namespace UTF_8 { using std::string; - // class utf8_string { - // string s_; - // public: - // utf8_string(const string &s): s_(s) {} - // utf8_string(const char* c): s_(string(c)) {} - // char operator[](size_t i); - // size_t length(); - // size_t byte_to_char(size_t i); - // }; + // naming conventions: + // offset: raw byte offset (0 based) + // position: code point offset (0 based) + // index: code point offset (1 based or negative) // function that will count the number of code points (utf-8 characters) from the given beginning to the given end size_t code_point_count(const string& str, size_t start, size_t end) { - size_t len = 0; - size_t i = start; - - while (i < end) { - unsigned char c = static_cast(str[i]); - if (c < 128) { - // it's a single-byte character - ++len; - ++i; - } - // it's a multi byte sequence and presumably it's a leading byte - else { - ++i; // go to the next byte - // see if it's still part of the sequence - while ((i < end) && ((static_cast(str[i]) & 0xC0) == 0x80)) { - ++i; - } - // when it's not [aka a new leading byte], increment and move on - ++len; - } - } - return len; + return utf8::distance(str.begin() + start, str.begin() + end); } size_t code_point_count(const string& str) { - return code_point_count(str, 0, str.length()); + return utf8::distance(str.begin(), str.end()); } - // function that will return the byte offset of a code point in a - size_t code_point_offset_to_byte_offset(const string& str, size_t offset) { - size_t i = 0; - size_t len = 0; - - while (len < offset) { - unsigned char c = static_cast(str[i]); - if (c < 128) { - // it's a single-byte character - ++len; - ++i; - } - // it's a multi byte sequence and presumably it's a leading byte - else { - ++i; // go to the next byte - // see if it's still part of the sequence - while ((i < str.length()) && ((static_cast(str[i]) & 0xC0) == 0x80)) { - ++i; - } - // when it's not [aka a new leading byte], increment and move on - ++len; - } - } - return i; + // function that will return the byte offset at a code point position + size_t offset_at_position(const string& str, size_t position) { + string::const_iterator it = str.begin(); + utf8::advance(it, position, str.end()); + return distance(str.begin(), it); } - // function that returns number of bytes in a character in a string - size_t length_of_code_point_at(const string& str, size_t pos) { - unsigned char c = static_cast(str[pos]); - size_t i = 0; - if(c < 128) { - return 1; - } else { - ++i; // go to the next byte - ++pos; - // see if it's still part of the sequence - while ((i < str.length()) && ((static_cast(str[pos]) & 0xC0) == 0x80)) { - ++i; - ++pos; - } - } - return i; + // function that returns number of bytes in a character at offset + size_t code_point_size_at_offset(const string& str, size_t offset) { + // get iterator from string and forward by offset + string::const_iterator stop = str.begin() + offset; + // check if beyond boundary + if (stop == str.end()) return 0; + // advance by one code point + utf8::advance(stop, 1, str.end()); + // calculate offset for code point + return stop - str.begin() - offset; } // function that will return a normalized index, given a crazy one @@ -101,7 +53,7 @@ namespace Sass { if (index > 0 && index <= signed_len) { // positive and within string length return index-1; - } + } else if (index > signed_len) { // positive and past string length return len; @@ -119,6 +71,31 @@ namespace Sass { } } + // utf16 functions + using std::wstring; + + // convert from utf16/wide string to utf8 string + string convert_from_utf16(const wstring& utf16) + { + string utf8; + // pre-allocate expected memory + utf8.reserve(sizeof(utf16)/2); + utf8::utf16to8(utf16.begin(), utf16.end(), + back_inserter(utf8)); + return utf8; + } + + // convert from utf8 string to utf16/wide string + wstring convert_to_utf16(const string& utf8) + { + wstring utf16; + // pre-allocate expected memory + utf16.reserve(code_point_count(utf8)*2); + utf8::utf8to16(utf8.begin(), utf8.end(), + back_inserter(utf16)); + return utf16; + } + } } diff --git a/utf8_string.hpp b/utf8_string.hpp index 189f61345f..89493927cc 100644 --- a/utf8_string.hpp +++ b/utf8_string.hpp @@ -5,30 +5,31 @@ namespace Sass { namespace UTF_8 { - // class utf8_string { - // string s_; - // public: - // utf8_string(const string &s): s_(s) {} - // utf8_string(const char* c): s_(string(c)) {} - // char operator[](size_t i); - // size_t length(); - // size_t byte_to_char(size_t i); - // }; + // naming conventions: + // offset: raw byte offset (0 based) + // position: code point offset (0 based) + // index: code point offset (1 based or negative) // function that will count the number of code points (utf-8 characters) from the beginning to the given end size_t code_point_count(const string& str, size_t start, size_t end); size_t code_point_count(const string& str); // function that will return the byte offset of a code point in a - size_t code_point_offset_to_byte_offset(const string& str, size_t offset); + size_t offset_at_position(const string& str, size_t position); // function that returns number of bytes in a character in a string - size_t length_of_code_point_at(const string& str, size_t pos); + size_t code_point_size_at_offset(const string& str, size_t offset); // function that will return a normalized index, given a crazy one size_t normalize_index(int index, size_t len); + #ifdef _WIN32 + // functions to handle unicode paths on windows + string convert_from_utf16(const wstring& wstr); + wstring convert_to_utf16(const string& str); + #endif + } }