| 1 |
// Utf8_16.h |
|---|
| 2 |
// Copyright (C) 2002 Scott Kirkwood |
|---|
| 3 |
// |
|---|
| 4 |
// Permission to use, copy, modify, distribute and sell this code |
|---|
| 5 |
// and its documentation for any purpose is hereby granted without fee, |
|---|
| 6 |
// provided that the above copyright notice appear in all copies or |
|---|
| 7 |
// any derived copies. Scott Kirkwood makes no representations |
|---|
| 8 |
// about the suitability of this software for any purpose. |
|---|
| 9 |
// It is provided "as is" without express or implied warranty. |
|---|
| 10 |
// |
|---|
| 11 |
// Notes: Used the UTF information I found at: |
|---|
| 12 |
// http://www.cl.cam.ac.uk/~mgk25/unicode.html |
|---|
| 13 |
//////////////////////////////////////////////////////////////////////////////// |
|---|
| 14 |
|
|---|
| 15 |
#ifndef UTF8_16 |
|---|
| 16 |
#define UTF8_16 |
|---|
| 17 |
|
|---|
| 18 |
#include <stdio.h> |
|---|
| 19 |
#include <assert.h> |
|---|
| 20 |
|
|---|
| 21 |
// MSVC only: silence warning C4514 for the inline members declared in this header.
#ifdef _MSC_VER |
|---|
| 22 |
#pragma warning(disable: 4514) // unreferenced inline function has been removed |
|---|
| 23 |
#endif |
|---|
| 24 |
|
|---|
| 25 |
// Common base for the UTF-8/UTF-16 helper classes in this header. It supplies
// the code-unit typedefs, the encodingType enumeration and the declaration of
// the k_Boms table. It declares no per-instance data of its own.
class Utf8_16 { |
|---|
| 26 |
public: |
|---|
| 27 |
typedef unsigned short utf16; // 16 bits |
|---|
| 28 |
typedef unsigned char utf8; // 8 bits |
|---|
| 29 |
// Raw byte type used for the input buffers handed to the iterators below.
typedef unsigned char ubyte; |
|---|
| 30 |
// Encodings named by this header. eLast is not an encoding: it equals the
// number of enumerators before it and is used as the first dimension of k_Boms.
enum encodingType { |
|---|
| 31 |
eUnknown, |
|---|
| 32 |
eUtf16BigEndian, |
|---|
| 33 |
eUtf16LittleEndian, // Default on Windows |
|---|
| 34 |
eUtf8, |
|---|
| 35 |
eLast |
|---|
| 36 |
}; |
|---|
| 37 |
// One 3-byte row per encodingType value below eLast. Only declared here; the
// definition and the row contents live outside this header.
// NOTE(review): the name suggests byte-order marks - confirm against the definition.
static const utf8 k_Boms[eLast][3]; |
|---|
| 38 |
}; |
|---|
| 39 |
|
|---|
| 40 |
// Reads UTF-16 and outputs UTF-8 |
|---|
| 41 |
// Iterator-style converter that is given a byte buffer via set() and exposes
// its output one utf8 byte at a time through get(), advancing with operator++.
// Only get() and operator bool are defined inline; every other member is
// implemented outside this header.
class Utf16_Iter : public Utf8_16 { |
|---|
| 42 |
public: |
|---|
| 43 |
Utf16_Iter(); |
|---|
| 44 |
void reset(); |
|---|
| 45 |
// pBuf/nLen describe the input bytes and eEncoding names their encoding.
// NOTE(review): whether the buffer is copied or merely referenced is decided
// in the implementation - confirm the required lifetime of pBuf there.
void set(const ubyte* pBuf, size_t nLen, encodingType eEncoding); |
|---|
| 46 |
// Returns the current output byte (m_nCur) without advancing.
utf8 get() const { |
|---|
| 47 |
return m_nCur; |
|---|
| 48 |
} |
|---|
| 49 |
void operator++(); |
|---|
| 50 |
// True while the read pointer has not moved past m_pEnd.
// NOTE(review): the comparison is inclusive (<=), so this still yields true
// when m_pRead == m_pEnd; whether that is intended depends on how set() and
// operator++ position these pointers, which is not visible in this header.
operator bool() { return m_pRead <= m_pEnd; } |
|---|
| 51 |
|
|---|
| 52 |
protected: |
|---|
| 53 |
void toStart(); // Put to start state, swap bytes if necessary |
|---|
| 54 |
// Conversion state stored in m_eState.
// NOTE(review): the names suggest "which byte of a 2- or 3-byte UTF-8 sequence
// comes next" - confirm in the implementation.
enum eState { |
|---|
| 55 |
eStart, |
|---|
| 56 |
e2Bytes2, |
|---|
| 57 |
e3Bytes2, |
|---|
| 58 |
e3Bytes3 |
|---|
| 59 |
}; |
|---|
| 60 |
protected: |
|---|
| 61 |
encodingType m_eEncoding; |
|---|
| 62 |
eState m_eState; |
|---|
| 63 |
// Value handed out by get().
utf8 m_nCur; |
|---|
| 64 |
utf16 m_nCur16; |
|---|
| 65 |
// NOTE(review): presumably the start, read cursor and end of the buffer
// passed to set() - confirm in the implementation.
const ubyte* m_pBuf; |
|---|
| 66 |
const ubyte* m_pRead; |
|---|
| 67 |
const ubyte* m_pEnd; |
|---|
| 68 |
}; |
|---|
| 69 |
|
|---|
| 70 |
// Reads UTF-8 and outputs UTF-16 |
|---|
| 71 |
// Iterator-style converter that is given a byte buffer via set() and exposes
// its output as utf16 values through get(), advancing with operator++.
// get(), canGet() and operator bool are defined inline; every other member is
// implemented outside this header.
class Utf8_Iter : public Utf8_16 { |
|---|
| 72 |
public: |
|---|
| 73 |
Utf8_Iter(); |
|---|
| 74 |
void reset(); |
|---|
| 75 |
// pBuf/nLen describe the input bytes and eEncoding is stored as the encoding.
// NOTE(review): whether the buffer is copied or merely referenced is decided
// in the implementation - confirm the required lifetime of pBuf there.
void set(const ubyte* pBuf, size_t nLen, encodingType eEncoding); |
|---|
| 76 |
// Returns the current utf16 value (m_nCur) without advancing. When _DEBUG is
// defined it first asserts m_eState == eStart, the same condition canGet()
// reports, so callers should check canGet() before calling get().
utf16 get() const { |
|---|
| 77 |
#ifdef _DEBUG |
|---|
| 78 |
assert(m_eState == eStart); |
|---|
| 79 |
#endif |
|---|
| 80 |
return m_nCur; |
|---|
| 81 |
} |
|---|
| 82 |
// True when the state machine is in eStart, i.e. the condition get() asserts.
bool canGet() const { return m_eState == eStart; } |
|---|
| 83 |
void operator++(); |
|---|
| 84 |
// True while the read pointer has not moved past m_pEnd.
// NOTE(review): the comparison is inclusive (<=), so this still yields true
// when m_pRead == m_pEnd; whether that is intended depends on how set() and
// operator++ position these pointers, which is not visible in this header.
operator bool() { return m_pRead <= m_pEnd; } |
|---|
| 85 |
|
|---|
| 86 |
protected: |
|---|
| 87 |
void swap(); |
|---|
| 88 |
void toStart(); // Put to start state, swap bytes if necessary |
|---|
| 89 |
// Decoding state stored in m_eState; eStart is the only state in which
// canGet() is true.
// NOTE(review): the other names suggest "waiting for byte 2/3 of a 2- or
// 3-byte UTF-8 sequence" - confirm in the implementation.
enum eState { |
|---|
| 90 |
eStart, |
|---|
| 91 |
e2Bytes_Byte2, |
|---|
| 92 |
e3Bytes_Byte2, |
|---|
| 93 |
e3Bytes_Byte3 |
|---|
| 94 |
}; |
|---|
| 95 |
protected: |
|---|
| 96 |
encodingType m_eEncoding; |
|---|
| 97 |
eState m_eState; |
|---|
| 98 |
// Value handed out by get().
utf16 m_nCur; |
|---|
| 99 |
// NOTE(review): presumably the start, read cursor and end of the buffer
// passed to set() - confirm in the implementation.
const ubyte* m_pBuf; |
|---|
| 100 |
const ubyte* m_pRead; |
|---|
| 101 |
const ubyte* m_pEnd; |
|---|
| 102 |
}; |
|---|
| 103 |
|
|---|
| 104 |
// Reads UTF16 and outputs UTF8 |
|---|
| 105 |
// Buffer-level reader: convert() is handed a chunk of bytes, and the result is
// then available through getNewBuf() and getEncoding(). It holds a Utf16_Iter
// member (m_Iter16). convert(), determineEncoding(), the constructor and the
// destructor are implemented outside this header.
// NOTE(review): the class declares a destructor and holds raw pointers but no
// copy constructor or assignment operator; if the destructor frees m_pNewBuf,
// copying an instance would be unsafe - confirm in the implementation.
class Utf8_16_Read : public Utf8_16 { |
|---|
| 106 |
public: |
|---|
| 107 |
Utf8_16_Read(); |
|---|
| 108 |
~Utf8_16_Read(); |
|---|
| 109 |
|
|---|
| 110 |
// Processes len bytes starting at buf and returns a size_t.
// NOTE(review): presumably the number of bytes now readable via getNewBuf() -
// confirm in the implementation.
size_t convert(char* buf, size_t len); |
|---|
| 111 |
// Returns m_pNewBuf viewed as char*.
// NOTE(review): may alias the caller's buffer or an internal one depending on
// what convert() assigned - confirm ownership before freeing or retaining it.
char* getNewBuf() { return reinterpret_cast<char*>(m_pNewBuf); } |
|---|
| 112 |
|
|---|
| 113 |
// Returns the stored encoding (m_eEncoding).
encodingType getEncoding() const { return m_eEncoding; } |
|---|
| 114 |
protected: |
|---|
| 115 |
// NOTE(review): the meaning of the int result is not visible here -
// confirm in the implementation.
int determineEncoding(); |
|---|
| 116 |
private: |
|---|
| 117 |
encodingType m_eEncoding; |
|---|
| 118 |
ubyte* m_pBuf; |
|---|
| 119 |
// Pointer returned by getNewBuf().
ubyte* m_pNewBuf; |
|---|
| 120 |
size_t m_nBufSize; |
|---|
| 121 |
bool m_bFirstRead; |
|---|
| 122 |
size_t m_nLen; |
|---|
| 123 |
Utf16_Iter m_Iter16; |
|---|
| 124 |
}; |
|---|
| 125 |
|
|---|
| 126 |
// Read in a UTF-8 buffer and write out to UTF-16 or UTF-8 |
|---|
| 127 |
// File writer with a selectable output encoding (setEncoding) and a small
// stdio-like interface. All member functions are implemented outside this
// header.
// The members fopen/fwrite/fclose deliberately share names with the <stdio.h>
// functions; inside this class an unqualified call resolves to the member, so
// the C library versions must be written as ::fopen, ::fwrite and ::fclose.
// NOTE(review): the class declares a destructor and holds a FILE* and a raw
// buffer pointer but no copy constructor or assignment operator; if the
// destructor releases them, copying an instance would be unsafe - confirm in
// the implementation.
class Utf8_16_Write : public Utf8_16 { |
|---|
| 128 |
public: |
|---|
| 129 |
Utf8_16_Write(); |
|---|
| 130 |
~Utf8_16_Write(); |
|---|
| 131 |
|
|---|
| 132 |
void setEncoding(encodingType eType); |
|---|
| 133 |
|
|---|
| 134 |
// Same parameter shape as ::fopen (file name, mode string); returns a FILE*.
// NOTE(review): presumably also stored in m_pFile - confirm in the implementation.
FILE * fopen(const char *_name, const char *_type); |
|---|
| 135 |
// Takes a pointer and a byte count and returns a size_t. Unlike ::fwrite there
// is no element-count or FILE* argument.
// NOTE(review): presumably writes to m_pFile in the selected encoding and
// returns the amount written - confirm units in the implementation.
size_t fwrite(const void* p, size_t _size); |
|---|
| 136 |
void fclose(); |
|---|
| 137 |
protected: |
|---|
| 138 |
encodingType m_eEncoding; |
|---|
| 139 |
FILE* m_pFile; |
|---|
| 140 |
utf16* m_pBuf; |
|---|
| 141 |
size_t m_nBufSize; |
|---|
| 142 |
bool m_bFirstWrite; |
|---|
| 143 |
}; |
|---|
| 144 |
|
|---|
| 145 |
|
|---|
| 146 |
#endif |
|---|