encodedstream.h
1 #ifndef RAPIDJSON_ENCODEDSTREAM_H_
2 #define RAPIDJSON_ENCODEDSTREAM_H_
3 
4 #include "rapidjson.h"
5 
6 #ifdef __GNUC__
7 RAPIDJSON_DIAG_PUSH
8 RAPIDJSON_DIAG_OFF(effc++)
9 #endif
10 
11 namespace rapidjson {
12 
13 //! Input byte stream wrapper with a statically bound encoding.
14 /*!
15  \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
16  \tparam InputByteStream Type of input byte stream. For example, FileReadStream.
17 */
18 template <typename Encoding, typename InputByteStream>
20  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
21 public:
22  typedef typename Encoding::Ch Ch;
23 
24  EncodedInputStream(InputByteStream& is) : is_(is) {
25  current_ = Encoding::TakeBOM(is_);
26  }
27 
28  Ch Peek() const { return current_; }
29  Ch Take() { Ch c = current_; current_ = Encoding::Take(is_); return c; }
30  size_t Tell() const { return is_.Tell(); }
31 
32  // Not implemented
33  void Put(Ch) { RAPIDJSON_ASSERT(false); }
34  void Flush() { RAPIDJSON_ASSERT(false); }
35  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
36  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
37 
38 private:
40  EncodedInputStream& operator=(const EncodedInputStream&);
41 
42  InputByteStream& is_;
43  Ch current_;
44 };
45 
46 //! Output byte stream wrapper with statically bound encoding.
47 /*!
48  \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
49  \tparam InputByteStream Type of input byte stream. For example, FileWriteStream.
50 */
51 template <typename Encoding, typename OutputByteStream>
53  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
54 public:
55  typedef typename Encoding::Ch Ch;
56 
57  EncodedOutputStream(OutputByteStream& os, bool putBOM = true) : os_(os) {
58  if (putBOM)
59  Encoding::PutBOM(os_);
60  }
61 
62  void Put(Ch c) { Encoding::Put(os_, c); }
63  void Flush() { os_.Flush(); }
64 
65  // Not implemented
66  Ch Peek() const { RAPIDJSON_ASSERT(false); }
67  Ch Take() { RAPIDJSON_ASSERT(false); }
68  size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
69  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
70  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
71 
72 private:
74  EncodedOutputStream& operator=(const EncodedOutputStream&);
75 
76  OutputByteStream& os_;
77 };
78 
// Expands to the static member `x` of each supported UTF encoding, as a
// comma-separated list suitable for initialising a function-pointer table.
// The tables built from it are indexed with a UTFType value, so the order here
// must stay UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE.
#define RAPIDJSON_ENCODINGS_FUNC(x) \
    UTF8<Ch>::x, \
    UTF16LE<Ch>::x, \
    UTF16BE<Ch>::x, \
    UTF32LE<Ch>::x, \
    UTF32BE<Ch>::x
80 
81 //! Input stream wrapper with dynamically bound encoding and automatic encoding detection.
82 /*!
83  \tparam CharType Type of character for reading.
84  \tparam InputByteStream type of input byte stream to be wrapped.
85 */
86 template <typename CharType, typename InputByteStream>
88  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
89 public:
90  typedef CharType Ch;
91 
92  //! Constructor.
93  /*!
94  \param is input stream to be wrapped.
95  \param type UTF encoding type if it is not detected from the stream.
96  */
97  AutoUTFInputStream(InputByteStream& is, UTFType type = kUTF8) : is_(&is), type_(type), hasBOM_(false) {
98  DetectType();
99  static const TakeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Take) };
100  takeFunc_ = f[type_];
101  current_ = takeFunc_(*is_);
102  }
103 
104  UTFType GetType() const { return type_; }
105  bool HasBOM() const { return hasBOM_; }
106 
107  Ch Peek() const { return current_; }
108  Ch Take() { Ch c = current_; current_ = takeFunc_(*is_); return c; }
109  size_t Tell() const { return is_->Tell(); }
110 
111  // Not implemented
112  void Put(Ch) { RAPIDJSON_ASSERT(false); }
113  void Flush() { RAPIDJSON_ASSERT(false); }
114  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
115  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
116 
117 private:
119  AutoUTFInputStream& operator=(const AutoUTFInputStream&);
120 
121  // Detect encoding type with BOM or RFC 4627
122  void DetectType() {
123  // BOM (Byte Order Mark):
124  // 00 00 FE FF UTF-32BE
125  // FF FE 00 00 UTF-32LE
126  // FE FF UTF-16BE
127  // FF FE UTF-16LE
128  // EF BB BF UTF-8
129 
130  const unsigned char* c = (const unsigned char *)is_->Peek4();
131  if (!c)
132  return;
133 
134  unsigned bom = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24);
135  hasBOM_ = false;
136  if (bom == 0xFFFE0000) { type_ = kUTF32BE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
137  else if (bom == 0x0000FEFF) { type_ = kUTF32LE; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); is_->Take(); }
138  else if ((bom & 0xFFFF) == 0xFFFE) { type_ = kUTF16BE; hasBOM_ = true; is_->Take(); is_->Take(); }
139  else if ((bom & 0xFFFF) == 0xFEFF) { type_ = kUTF16LE; hasBOM_ = true; is_->Take(); is_->Take(); }
140  else if ((bom & 0xFFFFFF) == 0xBFBBEF) { type_ = kUTF8; hasBOM_ = true; is_->Take(); is_->Take(); is_->Take(); }
141 
142  // RFC 4627: Section 3
143  // "Since the first two characters of a JSON text will always be ASCII
144  // characters [RFC0020], it is possible to determine whether an octet
145  // stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
146  // at the pattern of nulls in the first four octets."
147  // 00 00 00 xx UTF-32BE
148  // 00 xx 00 xx UTF-16BE
149  // xx 00 00 00 UTF-32LE
150  // xx 00 xx 00 UTF-16LE
151  // xx xx xx xx UTF-8
152 
153  if (!hasBOM_) {
154  unsigned pattern = (c[0] ? 1 : 0) | (c[1] ? 2 : 0) | (c[2] ? 4 : 0) | (c[3] ? 8 : 0);
155  switch (pattern) {
156  case 0x08: type_ = kUTF32BE; break;
157  case 0x0A: type_ = kUTF16BE; break;
158  case 0x01: type_ = kUTF32LE; break;
159  case 0x05: type_ = kUTF16LE; break;
160  case 0x0F: type_ = kUTF8; break;
161  default: break; // Use type defined by user.
162  }
163  }
164 
165  // Runtime check whether the size of character type is sufficient. It only perform checks with assertion.
166  switch (type_) {
167  case kUTF8:
168  // Do nothing
169  break;
170  case kUTF16LE:
171  case kUTF16BE:
172  RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
173  break;
174  case kUTF32LE:
175  case kUTF32BE:
176  RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
177  break;
178  default:
179  RAPIDJSON_ASSERT(false); // Invalid type
180  }
181  }
182 
183  typedef Ch (*TakeFunc)(InputByteStream& is);
184  InputByteStream* is_;
185  UTFType type_;
186  Ch current_;
187  TakeFunc takeFunc_;
188  bool hasBOM_;
189 };
190 
191 //! Output stream wrapper with dynamically bound encoding and automatic encoding detection.
192 /*!
193  \tparam CharType Type of character for writing.
194  \tparam InputByteStream type of output byte stream to be wrapped.
195 */
196 template <typename CharType, typename OutputByteStream>
198  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
199 public:
200  typedef CharType Ch;
201 
202  //! Constructor.
203  /*!
204  \param os output stream to be wrapped.
205  \param type UTF encoding type.
206  \param putBOM Whether to write BOM at the beginning of the stream.
207  */
208  AutoUTFOutputStream(OutputByteStream& os, UTFType type, bool putBOM) : os_(&os), type_(type) {
209  // RUntime check whether the size of character type is sufficient. It only perform checks with assertion.
210  switch (type_) {
211  case kUTF16LE:
212  case kUTF16BE:
213  RAPIDJSON_ASSERT(sizeof(Ch) >= 2);
214  break;
215  case kUTF32LE:
216  case kUTF32BE:
217  RAPIDJSON_ASSERT(sizeof(Ch) >= 4);
218  break;
219  case kUTF8:
220  // Do nothing
221  break;
222  default:
223  RAPIDJSON_ASSERT(false); // Invalid UTFType
224  }
225 
226  static const PutFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Put) };
227  putFunc_ = f[type_];
228 
229  if (putBOM)
230  PutBOM();
231  }
232 
233  UTFType GetType() const { return type_; }
234 
235  void Put(Ch c) { putFunc_(*os_, c); }
236  void Flush() { os_->Flush(); }
237 
238  // Not implemented
239  Ch Peek() const { RAPIDJSON_ASSERT(false); }
240  Ch Take() { RAPIDJSON_ASSERT(false); }
241  size_t Tell() const { RAPIDJSON_ASSERT(false); return 0; }
242  Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; }
243  size_t PutEnd(Ch*) { RAPIDJSON_ASSERT(false); return 0; }
244 
245 private:
247  AutoUTFOutputStream& operator=(const AutoUTFOutputStream&);
248 
249  void PutBOM() {
250  typedef void (*PutBOMFunc)(OutputByteStream&);
251  static const PutBOMFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(PutBOM) };
252  f[type_](*os_);
253  }
254 
255  typedef void (*PutFunc)(OutputByteStream&, Ch);
256 
257  OutputByteStream* os_;
258  UTFType type_;
259  PutFunc putFunc_;
260 };
261 
262 #undef RAPIDJSON_ENCODINGS_FUNC
263 
264 } // namespace rapidjson
265 
266 #ifdef __GNUC__
267 RAPIDJSON_DIAG_POP
268 #endif
269 
270 #endif // RAPIDJSON_ENCODEDSTREAM_H_
UTF-16 little endian.
Definition: encodings.h:525
AutoUTFOutputStream(OutputByteStream &os, UTFType type, bool putBOM)
Constructor.
Definition: encodedstream.h:208
UTF-32 little endian.
Definition: encodings.h:527
Output byte stream wrapper with statically bound encoding.
Definition: encodedstream.h:52
Input stream wrapper with dynamically bound encoding and automatic encoding detection.
Definition: encodedstream.h:87
#define RAPIDJSON_ASSERT(x)
Assertion.
Definition: rapidjson.h:146
UTF-8.
Definition: encodings.h:524
main RapidJSON namespace
Definition: allocators.h:6
AutoUTFInputStream(InputByteStream &is, UTFType type=kUTF8)
Constructor.
Definition: encodedstream.h:97
Input byte stream wrapper with a statically bound encoding.
Definition: encodedstream.h:19
UTF-16 big endian.
Definition: encodings.h:526
common definitions and configuration
Output stream wrapper with dynamically bound encoding and automatic encoding detection.
Definition: encodedstream.h:197
UTF-32 big endian.
Definition: encodings.h:528
UTFType
Runtime-specified UTF encoding type of a stream.
Definition: encodings.h:523