json/include/nlohmann/detail/input/input_adapters.hpp
2018-05-18 06:19:31 +00:00

579 lines
18 KiB
C++

#pragma once
#include <algorithm> // min
#include <array> // array
#include <cassert> // assert
#include <cstddef> // size_t
#include <cstring> // strlen
#include <ios> // streamsize, streamoff, streampos
#include <istream> // istream
#include <iterator> // begin, end, iterator_traits, random_access_iterator_tag, distance, next
#include <memory> // shared_ptr, make_shared, addressof
#include <numeric> // accumulate
#include <string> // string, char_traits
#include <type_traits> // enable_if, is_base_of, is_pointer, is_integral, remove_pointer
#include <utility> // pair, declval
#include <nlohmann/detail/macro_scope.hpp>
namespace nlohmann
{
namespace detail
{
////////////////////
// input adapters //
////////////////////
/*!
@brief abstract input adapter interface
Produces a stream of std::char_traits<char>::int_type characters from a
std::istream, a buffer, or some other input type. Accepts the return of exactly
one non-EOF character for future input. The int_type characters returned
consist of all valid char values as positive values (typically unsigned char),
plus an EOF value outside that range, specified by the value of the function
std::char_traits<char>::eof(). This value is typically -1, but could be any
arbitrary value which is not a valid char value.
*/
struct input_adapter_protocol
{
    /// virtual, so that an adapter can be destroyed through this interface
    virtual ~input_adapter_protocol() = default;

    /// @return the next input character as a value in [0,255], or
    ///         std::char_traits<char>::eof() if no character is available
    virtual std::char_traits<char>::int_type get_character() = 0;

    /// make the last non-eof() character that was returned by
    /// get_character() available for reading again
    virtual void unget_character() = 0;
};

/// shared handle to an input adapter; used to keep interfaces simple
using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
/*!
A helper function to skip the UTF-8 byte order mark.
If no BOM is present, if a complete BOM has been skipped, or if an incomplete
BOM has been detected and the stream has been successfully rewound to the
start of the BOM, returns goodbit.
If an internal operation fails, returns badbit, and the streambuf should no
longer be used.
Note: Doesn't handle the eofbit. Before doing anything else, is.unget() clears
the eofbit. However, some implementations keep the eofbit if is.unget() fails,
while others do not.
Note: The streambuf must be non-null.
*/
inline std::ios_base::iostate skip_byte_order_mark(std::streambuf* sb)
{
    using traits_type = std::char_traits<char>;
    assert(sb != nullptr);

    // the bytes of the UTF-8 byte order mark, in the order they are read
    const traits_type::int_type bom[] = {0xEF, 0xBB, 0xBF};
    const int bom_length = 3;

    // consume the longest BOM prefix found at the current read position;
    // a byte is only extracted after it has been peeked and matched
    int matched = 0;
    while (matched < bom_length and sb->sgetc() == bom[matched])
    {
        sb->sbumpc();
        ++matched;
    }

    // complete BOM: leave the read position right behind it
    if (matched == bom_length)
    {
        return std::ios_base::goodbit;
    }

    // no or incomplete BOM: put back every byte that was extracted, so the
    // read position ends up where it started
    for (; matched > 0; --matched)
    {
        if (sb->sungetc() == traits_type::eof())
        {
            // the streambuf refused the putback
            return std::ios_base::badbit;
        }
    }

    return std::ios_base::goodbit;
}
/*!
Input adapter for a (caching) istream.
Ignores a UTF-8 byte order mark (EF BB BF) at the beginning of the input.
Does not support changing the underlying std::streambuf in mid-input.
*/
#if 0
// Variant of the adapter that works through the std::istream interface
// (peek/ignore/unget). The `#if 0` on the line above this class excludes it
// from compilation; the variant in the `#else` branch is the one being built.
class input_stream_adapter : public input_adapter_protocol
{
public:
using traits_type = std::char_traits<char>;
/// Binds to the stream `i` (stored by reference) and consumes a leading
/// EF BB BF byte sequence (UTF-8 byte order mark) if one is present.
explicit input_stream_adapter(std::istream& i)
: is(i)
{
// Skip byte order mark: each byte is inspected with peek() and only
// consumed with ignore() once it matched the expected BOM byte.
if (is.peek() == 0xEF)
{
is.ignore();
if (is.peek() == 0xBB)
{
is.ignore();
if (is.peek() == 0xBF)
{
is.ignore();
return; // Found a complete BOM.
}
// Third byte did not match: step back over the second BOM byte.
is.unget();
}
// Incomplete BOM: step back over the first BOM byte as well.
is.unget();
}
}
// Not copyable.
input_stream_adapter(const input_stream_adapter&) = delete;
input_stream_adapter& operator=(const input_stream_adapter&) = delete;
/// Returns the next character of the stream, or traits_type::eof() if the
/// stream is not good() or peek() reports that no character is available.
traits_type::int_type get_character() override
{
// Only try to get a character if the stream is good!
if (is.good())
{
const auto ch = is.peek();
// If peek() returns EOF, the following call to ignore() will set
// the failbit, but we do not want to set the failbit here.
if (ch != traits_type::eof())
{
is.ignore();
return ch;
}
}
return traits_type::eof();
}
/// Steps the stream back by one character via std::istream::unget().
void unget_character() override
{
is.unget();
}
private:
/// the stream the characters are read from (held by reference)
std::istream& is;
};
#else
class input_stream_adapter : public input_adapter_protocol
{
    //
    // NOTE on exceptions:
    //
    // Characters are read directly from the stream's streambuf, so this
    // adapter behaves slightly differently from the reference implementation
    // that goes through the std::istream interface.
    //
    // N4659, 30.7.4.3 (Unformatted input functions) requires of the istream
    // functions:
    //
    //   If an exception is thrown during input then `ios::badbit` is turned
    //   on[310] in `*this`'s error state. (Exceptions thrown from
    //   `basic_ios<>::clear()` are not caught or rethrown.)
    //   If `(exceptions() & badbit) != 0` then the exception is rethrown.
    //
    //   [310] This is done without causing an `ios::failure` to be thrown.
    //
    // There is no (portable) way to turn on the `badbit` in `is` without
    // throwing an exception. Exceptions from streambuf operations are
    // therefore neither caught nor (possibly) rethrown here. Consequently, if
    // an exception is thrown during input and
    //
    //   - badbit is turned ON in `is.exceptions()`:
    //       the badbit will **not** be set in `is`'s error state;
    //
    //   - badbit is turned OFF in `is.exceptions()`:
    //       the badbit will **not** be set in `is`'s error state and the
    //       exception is **not** swallowed.
    //
  public:
    using traits_type = std::char_traits<char>;

    /// Binds to the stream `i`, constructs a sentry for it (without skipping
    /// whitespace) and skips a leading byte order mark.
    explicit input_stream_adapter(std::istream& i)
        : is(i)
        , ok(i, /* noskipws */ true)
    {
        // If the sentry reports a usable stream, skip the BOM directly on the
        // streambuf; the helper yields badbit if it failed without throwing.
        // Otherwise the stream is "not ok" and the failbit is requested.
        const std::ios_base::iostate state =
            ok ? nlohmann::detail::skip_byte_order_mark(is.rdbuf())
            : std::ios_base::failbit;

        // Publish the result in the stream state. With badbit or failbit set,
        // this call might throw an ios::failure.
        is.setstate(state);
    }

    // not copyable
    input_stream_adapter(const input_stream_adapter&) = delete;
    input_stream_adapter& operator=(const input_stream_adapter&) = delete;

    /// @return the next character from the streambuf, or traits_type::eof()
    ///         if the stream is not good() or no more characters are
    ///         available (in which case the eofbit is set)
    traits_type::int_type get_character() override
    {
        // a stream that is not good is never read from
        if (not is.good())
        {
            return traits_type::eof();
        }

        const auto ch = is.rdbuf()->sbumpc();
        if (ch == traits_type::eof())
        {
            // sbumpc failed: no more characters are available
            is.setstate(std::ios_base::eofbit);
        }
        return ch;
    }

    /// Puts the last extracted character back into the streambuf.
    void unget_character() override
    {
        // Callers only invoke this after a successful (non-EOF) call to
        // get_character(). The stream is then good, and stepping back by one
        // character is guaranteed to succeed, so the result is not checked.
        is.rdbuf()->sungetc();
    }

  private:
    /// the stream the characters are read from (held by reference)
    std::istream& is;
    /// sentry constructed for `is`; tells whether the stream was usable
    std::istream::sentry const ok;
};
#endif
/// input adapter for buffer input
class input_buffer_adapter : public input_adapter_protocol
{
    // Reads characters from a caller-provided byte buffer. Only pointers into
    // the buffer are stored; the bytes themselves are not copied.
  public:
    /*!
    @param[in] b  pointer to the first byte of the buffer
    @param[in] l  number of bytes in the buffer

    A UTF-8 byte order mark (EF BB BF) at the beginning of the buffer is
    skipped and is not part of the readable input.
    */
    input_buffer_adapter(const char* b, const std::size_t l)
        : cursor(b), limit(b + l), start(b)
    {
        // skip byte order mark
        if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
        {
            cursor += 3;
            // The BOM is not part of the input. Move the rewind limit along
            // with the cursor, so that unget_character() can never step back
            // into the skipped BOM bytes.
            start = cursor;
        }
    }

    // delete because of pointer members
    input_buffer_adapter(const input_buffer_adapter&) = delete;
    input_buffer_adapter& operator=(const input_buffer_adapter&) = delete;

    /// @return the next byte of the buffer converted with to_int_type(), or
    ///         std::char_traits<char>::eof() once the end has been reached
    std::char_traits<char>::int_type get_character() noexcept override
    {
        if (JSON_LIKELY(cursor < limit))
        {
            return std::char_traits<char>::to_int_type(*(cursor++));
        }

        return std::char_traits<char>::eof();
    }

    /// Steps back by one character; does nothing if the cursor already is at
    /// the first readable character.
    void unget_character() noexcept override
    {
        if (JSON_LIKELY(cursor > start))
        {
            --cursor;
        }
    }

  private:
    /// pointer to the current character
    const char* cursor;
    /// pointer past the last character
    const char* limit;
    /// pointer to the first readable character (behind a skipped BOM)
    const char* start;
};
/*!
@brief input adapter for wide strings (UTF-16 or UTF-32 code units)

Converts the code units of @a WideStringType to UTF-8 on the fly and returns
the resulting bytes one at a time. Strings whose value_type is 2 bytes wide
are treated as UTF-16, all others as UTF-32.

@note The adapter only stores a reference to the string passed to the
      constructor; the string must stay alive while the adapter is used.
*/
template<typename WideStringType>
class wide_string_input_adapter : public input_adapter_protocol
{
  public:
    wide_string_input_adapter(const WideStringType& w) : str(w) {}

    /// @return the next UTF-8 byte of the converted input, or
    ///         std::char_traits<char>::eof() at the end of the string
    std::char_traits<char>::int_type get_character() noexcept override
    {
        // unget_character() was called previously: return the last character
        if (next_unget)
        {
            next_unget = false;
            return last_char;
        }

        // check if buffer needs to be filled
        if (utf8_bytes_index == utf8_bytes_filled)
        {
            if (sizeof(typename WideStringType::value_type) == 2)
            {
                fill_buffer_utf16();
            }
            else
            {
                fill_buffer_utf32();
            }

            assert(utf8_bytes_filled > 0);
            assert(utf8_bytes_index == 0);
        }

        // use buffer
        assert(utf8_bytes_filled > 0);
        assert(utf8_bytes_index < utf8_bytes_filled);
        return (last_char = utf8_bytes[utf8_bytes_index++]);
    }

    /// Makes the next call to get_character() return the last character again.
    void unget_character() noexcept override
    {
        next_unget = true;
    }

  private:
    /// Refills utf8_bytes from the next UTF-16 code unit (or surrogate pair);
    /// at the end of the string the buffer holds a single eof() value.
    void fill_buffer_utf16()
    {
        utf8_bytes_index = 0;

        if (current_wchar == str.size())
        {
            utf8_bytes[0] = std::char_traits<char>::eof();
            utf8_bytes_filled = 1;
        }
        else
        {
            // get the current character
            const int wc = static_cast<int>(str[current_wchar++]);

            // UTF-16 to UTF-8 encoding
            if (wc < 0x80)
            {
                utf8_bytes[0] = wc;
                utf8_bytes_filled = 1;
            }
            else if (wc <= 0x7FF)
            {
                utf8_bytes[0] = 0xC0 | ((wc >> 6));
                utf8_bytes[1] = 0x80 | (wc & 0x3F);
                utf8_bytes_filled = 2;
            }
            else if (0xD800 > wc or wc >= 0xE000)
            {
                utf8_bytes[0] = 0xE0 | ((wc >> 12));
                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
                utf8_bytes[2] = 0x80 | (wc & 0x3F);
                utf8_bytes_filled = 3;
            }
            else
            {
                // wc is a surrogate: combine it with the following code unit
                if (current_wchar < str.size())
                {
                    const int wc2 = static_cast<int>(str[current_wchar++]);
                    const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
                    utf8_bytes[0] = 0xf0 | (charcode >> 18);
                    utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
                    utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
                    utf8_bytes[3] = 0x80 | (charcode & 0x3F);
                    utf8_bytes_filled = 4;
                }
                else
                {
                    // unknown character: a surrogate at the very end of the
                    // input is forwarded as a single raw value.
                    //
                    // current_wchar was already advanced when wc was read and
                    // now equals str.size(). It must not be incremented again:
                    // it would move past str.size(), the end-of-input check
                    // above would never succeed, and the next refill would
                    // read str out of bounds.
                    utf8_bytes[0] = wc;
                    utf8_bytes_filled = 1;
                }
            }
        }
    }

    /// Refills utf8_bytes from the next UTF-32 code unit; at the end of the
    /// string the buffer holds a single eof() value.
    void fill_buffer_utf32()
    {
        utf8_bytes_index = 0;

        if (current_wchar == str.size())
        {
            utf8_bytes[0] = std::char_traits<char>::eof();
            utf8_bytes_filled = 1;
        }
        else
        {
            // get the current character
            const int wc = static_cast<int>(str[current_wchar++]);

            // UTF-32 to UTF-8 encoding
            if (wc < 0x80)
            {
                utf8_bytes[0] = wc;
                utf8_bytes_filled = 1;
            }
            else if (wc <= 0x7FF)
            {
                utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
                utf8_bytes[1] = 0x80 | (wc & 0x3F);
                utf8_bytes_filled = 2;
            }
            else if (wc <= 0xFFFF)
            {
                utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
                utf8_bytes[2] = 0x80 | (wc & 0x3F);
                utf8_bytes_filled = 3;
            }
            else if (wc <= 0x10FFFF)
            {
                utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
                utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
                utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
                utf8_bytes[3] = 0x80 | (wc & 0x3F);
                utf8_bytes_filled = 4;
            }
            else
            {
                // unknown character: forwarded as a single raw value
                utf8_bytes[0] = wc;
                utf8_bytes_filled = 1;
            }
        }
    }

  private:
    /// the wstring to process
    const WideStringType& str;

    /// index of the current wchar in str
    std::size_t current_wchar = 0;

    /// a buffer for UTF-8 bytes
    std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};

    /// index to the utf8_bytes array for the next valid byte
    std::size_t utf8_bytes_index = 0;
    /// number of valid bytes in the utf8_bytes array
    std::size_t utf8_bytes_filled = 0;

    /// the last character (returned after unget_character() is called)
    std::char_traits<char>::int_type last_char = 0;
    /// whether get_character() should return last_char
    bool next_unget = false;
};
/*!
@brief front end that creates the matching input adapter for a given source

Every constructor accepts one kind of input and creates the corresponding
adapter. The result is obtained through the conversion to input_adapter_t.
*/
class input_adapter
{
  public:
    // sources with a dedicated adapter

    /// read from an input stream
    input_adapter(std::istream& i)
        : ia(std::make_shared<input_stream_adapter>(i)) {}

    /// read from an input stream passed as an rvalue
    input_adapter(std::istream&& i)
        : ia(std::make_shared<input_stream_adapter>(i)) {}

    /// read from a std::wstring
    input_adapter(const std::wstring& ws)
        : ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}

    /// read from a std::u16string
    input_adapter(const std::u16string& ws)
        : ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}

    /// read from a std::u32string
    input_adapter(const std::u32string& ws)
        : ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}

    /// read @a l bytes starting at @a b, where @a b is a pointer to a
    /// 1-byte integral type
    template<typename CharT,
             typename std::enable_if<
                 std::is_pointer<CharT>::value and
                 std::is_integral<typename std::remove_pointer<CharT>::type>::value and
                 sizeof(typename std::remove_pointer<CharT>::type) == 1,
                 int>::type = 0>
    input_adapter(CharT b, std::size_t l)
        : ia(std::make_shared<input_buffer_adapter>(reinterpret_cast<const char*>(b), l)) {}

    // sources that are mapped onto the buffer adapter

    /// read from a null-terminated string; the length is taken with strlen
    template<typename CharT,
             typename std::enable_if<
                 std::is_pointer<CharT>::value and
                 std::is_integral<typename std::remove_pointer<CharT>::type>::value and
                 sizeof(typename std::remove_pointer<CharT>::type) == 1,
                 int>::type = 0>
    input_adapter(CharT b)
        : input_adapter(reinterpret_cast<const char*>(b),
                        std::strlen(reinterpret_cast<const char*>(b))) {}

    /// read from a random-access iterator range over contiguous storage
    template<class IteratorType,
             typename std::enable_if<
                 std::is_same<typename std::iterator_traits<IteratorType>::iterator_category, std::random_access_iterator_tag>::value,
                 int>::type = 0>
    input_adapter(IteratorType first, IteratorType last)
    {
        // the elements are read as chars, so each one must be 1 byte long
        static_assert(
            sizeof(typename std::iterator_traits<IteratorType>::value_type) == 1,
            "each element in the iterator range must have the size of 1 byte");

        // Check (only when assertions are enabled) that the range is indeed
        // contiguous: the i-th element seen by the iterator must equal the
        // element i positions behind the address of the first one; see
        // http://stackoverflow.com/a/35008842/266378 for more discussion.
        assert(std::accumulate(
                   first, last, std::pair<bool, int>(true, 0),
                   [&first](std::pair<bool, int> res, decltype(*first) val)
        {
            res.first &= (val == *(std::next(std::addressof(*first), res.second++)));
            return res;
        }).first);

        const auto len = static_cast<std::size_t>(std::distance(first, last));

        // Only a non-empty range has an element whose address can be taken;
        // for an empty range nullptr is passed instead.
        const char* const data = JSON_LIKELY(len > 0)
                                 ? reinterpret_cast<const char*>(&(*first))
                                 : nullptr;
        ia = std::make_shared<input_buffer_adapter>(data, len);
    }

    /// read all N elements of a built-in array
    template<class T, std::size_t N>
    input_adapter(T (&array)[N])
        : input_adapter(std::begin(array), std::end(array)) {}

    /// read from a contiguous container (any non-pointer type whose
    /// std::begin() yields a random-access iterator)
    template<class ContiguousContainer, typename
             std::enable_if<not std::is_pointer<ContiguousContainer>::value and
                            std::is_base_of<std::random_access_iterator_tag, typename std::iterator_traits<decltype(std::begin(std::declval<ContiguousContainer const>()))>::iterator_category>::value,
                            int>::type = 0>
    input_adapter(const ContiguousContainer& c)
        : input_adapter(std::begin(c), std::end(c)) {}

    /// hand out the adapter that was created by the constructor
    operator input_adapter_t()
    {
        return ia;
    }

  private:
    /// the actual adapter
    input_adapter_t ia = nullptr;
};
}
}