root/trunk/Utf8_16.cpp

Revision 5, 7.2 kB (checked in by qbert, 6 years ago)

Initial ( and last :( ) commit

Line 
1 // Utf8_16.cxx
2 // Copyright (C) 2002 Scott Kirkwood
3 //
4 // Permission to use, copy, modify, distribute and sell this code
5 // and its documentation for any purpose is hereby granted without fee,
6 // provided that the above copyright notice appear in all copies or
7 // any derived copies.  Scott Kirkwood makes no representations
8 // about the suitability of this software for any purpose.
9 // It is provided "as is" without express or implied warranty.
10 ////////////////////////////////////////////////////////////////////////////////
11 #include "stdafx.h"
12
13 #include "Utf8_16.h"
14
15 #include <stdio.h>
16
17 const Utf8_16::utf8 Utf8_16::k_Boms[][3] = {
18     {0x00, 0x00, 0x00},  // Unknown
19     {0xFE, 0xFF, 0x00},  // Big endian
20     {0xFF, 0xFE, 0x00},  // Little endian
21     {0xEF, 0xBB, 0xBF}, // UTF8
22 };
23
24 // ==================================================================
25
26 Utf8_16_Read::Utf8_16_Read() {
27     m_eEncoding = eUnknown;
28     m_nBufSize = 0;
29     m_pNewBuf = NULL;
30     m_bFirstRead = true;
31 }
32
33 Utf8_16_Read::~Utf8_16_Read() {
34     if ((m_eEncoding != eUnknown) && (m_eEncoding != eUtf8)) {
35         delete [] m_pNewBuf;
36         m_pNewBuf = NULL;
37     }
38 }
39
40 size_t Utf8_16_Read::convert(char* buf, size_t len) {
41     m_pBuf = reinterpret_cast<ubyte*>(buf);
42     m_nLen = len;
43
44     int nSkip = 0;
45     if (m_bFirstRead) {
46         nSkip = determineEncoding();
47         m_bFirstRead = false;
48     }
49
50     if (m_eEncoding == eUnknown) {
51         // Do nothing, pass through
52         m_nBufSize = 0;
53         m_pNewBuf = m_pBuf;
54         return len;
55     }
56
57     if (m_eEncoding == eUtf8) {
58         // Pass through after BOM
59         m_nBufSize = 0;
60         m_pNewBuf = m_pBuf + nSkip;
61         return len - nSkip;
62     }
63
64     // Else...
65     size_t newSize = len + len / 2 + 1;
66     if (m_nBufSize != newSize) {
67         delete [] m_pNewBuf;
68         m_pNewBuf = NULL;
69         m_pNewBuf = new ubyte[newSize];
70         m_nBufSize = newSize;
71     }
72
73     ubyte* pCur = m_pNewBuf;
74
75     m_Iter16.set(m_pBuf + nSkip, len - nSkip, m_eEncoding);
76
77     for (; m_Iter16; ++m_Iter16) {
78         *pCur++ = m_Iter16.get();
79     }
80
81     // Return number of bytes writen out
82     return pCur - m_pNewBuf;
83 }
84
85 int Utf8_16_Read::determineEncoding() {
86     m_eEncoding = eUnknown;
87
88     int nRet = 0;
89
90     if (m_nLen > 1) {
91         if (m_pBuf[0] == k_Boms[eUtf16BigEndian][0] && m_pBuf[1] == k_Boms[eUtf16BigEndian][1]) {
92             m_eEncoding = eUtf16BigEndian;
93             nRet = 2;
94         } else if (m_pBuf[0] == k_Boms[eUtf16LittleEndian][0] && m_pBuf[1] == k_Boms[eUtf16LittleEndian][1]) {
95             m_eEncoding = eUtf16LittleEndian;
96             nRet = 2;
97         } else if (m_nLen > 2 && m_pBuf[0] == k_Boms[eUtf8][0] && m_pBuf[1] == k_Boms[eUtf8][1] && m_pBuf[2] == k_Boms[eUtf8][2]) {
98             m_eEncoding = eUtf8;
99             nRet = 3;
100         }
101     }
102
103     return nRet;
104 }
105
106 // ==================================================================
107
108 Utf8_16_Write::Utf8_16_Write() {
109     m_eEncoding = eUnknown;
110     m_pFile = NULL;
111     m_pBuf = NULL;
112     m_bFirstWrite = true;
113     m_nBufSize = 0;
114 }
115
116 Utf8_16_Write::~Utf8_16_Write() {
117     if (m_pFile) {
118         fclose();
119     }
120 }
121
122 FILE * Utf8_16_Write::fopen(const char *_name, const char *_type) {
123     m_pFile = ::fopen(_name, _type);
124
125     m_bFirstWrite = true;
126
127     return m_pFile;
128 }
129
130 size_t Utf8_16_Write::fwrite(const void* p, size_t _size) {
131     if (!m_pFile) {
132         return 0; // fail
133     }
134
135     if (m_eEncoding == eUnknown) {
136         // Normal write
137         return ::fwrite(p, _size, 1, m_pFile);
138     }
139
140     if (m_eEncoding == eUtf8) {
141         if (m_bFirstWrite)
142             ::fwrite(k_Boms[m_eEncoding], 3, 1, m_pFile);
143         return ::fwrite(p, _size, 1, m_pFile);
144     }
145
146     if (_size > m_nBufSize) {
147         m_nBufSize = _size;
148         delete [] m_pBuf;
149         m_pBuf = NULL;
150         m_pBuf = new utf16[_size + 1];
151     }
152
153     if (m_bFirstWrite) {
154         if (m_eEncoding == eUtf16BigEndian || m_eEncoding == eUtf16LittleEndian) {
155             // Write the BOM
156             ::fwrite(k_Boms[m_eEncoding], 2, 1, m_pFile);
157         }
158
159         m_bFirstWrite = false;
160     }
161
162     Utf8_Iter iter8;
163     iter8.set(static_cast<const ubyte*>(p), _size, m_eEncoding);
164
165     utf16* pCur = m_pBuf;
166
167     for (; iter8; ++iter8) {
168         if (iter8.canGet()) {
169             *pCur++ = iter8.get();
170         }
171     }
172
173     size_t ret = ::fwrite(m_pBuf, (const char*)pCur - (const char*)m_pBuf, 1, m_pFile);
174
175     return ret;
176 }
177
178 void Utf8_16_Write::fclose() {
179     delete [] m_pBuf;
180     m_pBuf = NULL;
181
182     ::fclose(m_pFile);
183     m_pFile = NULL;
184 }
185
186 void Utf8_16_Write::setEncoding(Utf8_16::encodingType eType) {
187     m_eEncoding = eType;
188 }
189
190 //=================================================================
191 Utf8_Iter::Utf8_Iter() {
192     reset();
193 }
194
195 void Utf8_Iter::reset() {
196     m_pBuf = NULL;
197     m_pRead = NULL;
198     m_pEnd = NULL;
199     m_eState = eStart;
200     m_nCur = 0;
201     m_eEncoding = eUnknown;
202 }
203
204 void Utf8_Iter::set
205     (const ubyte* pBuf, size_t nLen, encodingType eEncoding) {
206     m_pBuf = pBuf;
207     m_pRead = pBuf;
208     m_pEnd = pBuf + nLen;
209     m_eEncoding = eEncoding;
210     operator++();
211     // Note: m_eState, m_nCur not reset
212 }
213 // Go to the next byte.
214 void Utf8_Iter::operator++() {
215     switch (m_eState) {
216     case eStart:
217         if ((0xE0 & *m_pRead) == 0xE0) {
218             m_nCur = static_cast<utf16>((~0xE0 & *m_pRead) << 12);
219             m_eState = e3Bytes_Byte2;
220         } else if ((0xC0 & *m_pRead) == 0xC0) {
221             m_nCur = static_cast<utf16>((~0xC0 & *m_pRead) << 6);
222             m_eState = e2Bytes_Byte2;
223         } else {
224             m_nCur = *m_pRead;
225             toStart();
226         }
227         break;
228     case e2Bytes_Byte2:
229         m_nCur |= static_cast<utf8>(0x3F & *m_pRead);
230         toStart();
231         break;
232     case e3Bytes_Byte2:
233         m_nCur |= static_cast<utf8>((0x3F & *m_pRead) << 6);
234         m_eState = e3Bytes_Byte3;
235         break;
236     case e3Bytes_Byte3:
237         m_nCur |= static_cast<utf8>(0x3F & *m_pRead);
238         toStart();
239         break;
240     }
241     ++m_pRead;
242 }
243
244 void Utf8_Iter::toStart() {
245     m_eState = eStart;
246     if (m_eEncoding == eUtf16BigEndian) {
247         swap();
248     }
249 }
250
251 void Utf8_Iter::swap() {
252     utf8* p = reinterpret_cast<utf8*>(&m_nCur);
253     utf8 swapbyte = *p;
254     *p = *(p + 1);
255     *(p + 1) = swapbyte;
256 }
257
258 //==================================================
259 Utf16_Iter::Utf16_Iter() {
260     reset();
261 }
262
263 void Utf16_Iter::reset() {
264     m_pBuf = NULL;
265     m_pRead = NULL;
266     m_pEnd = NULL;
267     m_eState = eStart;
268     m_nCur = 0;
269     m_nCur16 = 0;
270     m_eEncoding = eUnknown;
271 }
272
273 void Utf16_Iter::set
274     (const ubyte* pBuf, size_t nLen, encodingType eEncoding) {
275     m_pBuf = pBuf;
276     m_pRead = pBuf;
277     m_pEnd = pBuf + nLen;
278     m_eEncoding = eEncoding;
279     operator++();
280     // Note: m_eState, m_nCur, m_nCur16 not reinitalized.
281 }
282
283 // Goes to the next byte.
284 // Not the next symbol which you might expect.
285 // This way we can continue from a partial buffer that doesn't align
286 void Utf16_Iter::operator++() {
287     switch (m_eState) {
288     case eStart:
289         if (m_eEncoding == eUtf16LittleEndian) {
290             m_nCur16 = *m_pRead++;
291             m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
292         } else {
293             m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
294             m_nCur16 |= *m_pRead;
295         }
296         ++m_pRead;
297
298         if (m_nCur16 < 0x80) {
299             m_nCur = static_cast<ubyte>(m_nCur16 & 0xFF);
300             m_eState = eStart;
301         } else if (m_nCur16 < 0x800) {
302             m_nCur = static_cast<ubyte>(0xC0 | m_nCur16 >> 6);
303             m_eState = e2Bytes2;
304         } else {
305             m_nCur = static_cast<ubyte>(0xE0 | m_nCur16 >> 12);
306             m_eState = e3Bytes2;
307         }
308         break;
309     case e2Bytes2:
310         m_nCur = static_cast<ubyte>(0x80 | m_nCur16 & 0x3F);
311         m_eState = eStart;
312         break;
313     case e3Bytes2:
314         m_nCur = static_cast<ubyte>(0x80 | ((m_nCur16 >> 6) & 0x3F));
315         m_eState = e3Bytes3;
316         break;
317     case e3Bytes3:
318         m_nCur = static_cast<ubyte>(0x80 | m_nCur16 & 0x3F);
319         m_eState = eStart;
320         break;
321     }
322 }
Note: See TracBrowser for help on using the browser.