Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UNICHAR Class Reference

#include <unichar.h>

Public Member Functions

 UNICHAR ()
 
 UNICHAR (const char *utf8_str, int len)
 
 UNICHAR (int unicode)
 
int first_uni () const
 
int utf8_len () const
 
const char * utf8 () const
 
char * utf8_str () const
 

Static Public Member Functions

static int utf8_step (const char *utf8_str)
 

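Detailed Description

UNICHAR holds the UTF-8 encoding of a short character sequence in a fixed buffer of UNICHAR_LEN bytes. When the encoding is shorter than UNICHAR_LEN, the final byte of the buffer stores its length in bytes.

Definition at line 50 of file unichar.h.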

Constructor & Destructor Documentation

UNICHAR::UNICHAR ( )
inline

Definition at line 52 of file unichar.h.

52  {
53  memset(chars, 0, UNICHAR_LEN);
54  }
#define UNICHAR_LEN
Definition: unichar.h:28
UNICHAR::UNICHAR ( const char *  utf8_str, int  len )

Definition at line 28 of file unichar.cpp.

28  {
29  int total_len = 0;
30  int step = 0;
31  if (len < 0) {
32  for (len = 0; utf8_str[len] != 0 && len < UNICHAR_LEN; ++len);
33  }
34  for (total_len = 0; total_len < len; total_len += step) {
35  step = utf8_step(utf8_str + total_len);
36  if (total_len + step > UNICHAR_LEN)
37  break; // Too long.
38  if (step == 0)
39  break; // Illegal first byte.
40  int i;
41  for (i = 1; i < step; ++i)
42  if ((utf8_str[total_len + i] & 0xc0) != 0x80)
43  break;
44  if (i < step)
45  break; // Illegal continuation byte.
46  }
47  memcpy(chars, utf8_str, total_len);
48  if (total_len < UNICHAR_LEN) {
49  chars[UNICHAR_LEN - 1] = total_len;
50  while (total_len < UNICHAR_LEN - 1)
51  chars[total_len++] = 0;
52  }
53 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:131
char * utf8_str() const
Definition: unichar.cpp:122
#define UNICHAR_LEN
Definition: unichar.h:28
UNICHAR::UNICHAR ( int  unicode)
explicit

Definition at line 57 of file unichar.cpp.

57  {
58  const int bytemask = 0xBF;
59  const int bytemark = 0x80;
60 
61  if (unicode < 0x80) {
62  chars[UNICHAR_LEN - 1] = 1;
63  chars[2] = 0;
64  chars[1] = 0;
65  chars[0] = static_cast<char>(unicode);
66  } else if (unicode < 0x800) {
67  chars[UNICHAR_LEN - 1] = 2;
68  chars[2] = 0;
69  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
70  unicode >>= 6;
71  chars[0] = static_cast<char>(unicode | 0xc0);
72  } else if (unicode < 0x10000) {
73  chars[UNICHAR_LEN - 1] = 3;
74  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
75  unicode >>= 6;
76  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
77  unicode >>= 6;
78  chars[0] = static_cast<char>(unicode | 0xe0);
79  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
80  chars[UNICHAR_LEN - 1] = 4;
81  chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
82  unicode >>= 6;
83  chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
84  unicode >>= 6;
85  chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
86  unicode >>= 6;
87  chars[0] = static_cast<char>(unicode | 0xf0);
88  } else {
89  memset(chars, 0, UNICHAR_LEN);
90  }
91 }
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:22
#define UNICHAR_LEN
Definition: unichar.h:28

Member Function Documentation

int UNICHAR::first_uni ( ) const

Definition at line 94 of file unichar.cpp.

94  {
95  static const int utf8_offsets[5] = {
96  0, 0, 0x3080, 0xE2080, 0x3C82080
97  };
98  int uni = 0;
99  int len = utf8_step(chars);
100  const char* src = chars;
101 
102  switch (len) {
103  default:
104  break;
105  case 4:
106  uni += static_cast<unsigned char>(*src++);
107  uni <<= 6;
108  case 3:
109  uni += static_cast<unsigned char>(*src++);
110  uni <<= 6;
111  case 2:
112  uni += static_cast<unsigned char>(*src++);
113  uni <<= 6;
114  case 1:
115  uni += static_cast<unsigned char>(*src++);
116  }
117  uni -= utf8_offsets[len];
118  return uni;
119 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:131
const char* UNICHAR::utf8 ( ) const
inline

Definition at line 76 of file unichar.h.

76  {
77  return chars;
78  }
int UNICHAR::utf8_len ( ) const
inline

Definition at line 70 of file unichar.h.

70  {
71  int len = chars[UNICHAR_LEN - 1];
72  return len >=0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
73  }
#define UNICHAR_LEN
Definition: unichar.h:28
int UNICHAR::utf8_step ( const char *  utf8_str)
static

Definition at line 131 of file unichar.cpp.

131  {
132  static const char utf8_bytes[256] = {
133  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
134  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
135  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
136  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
137  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
138  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
139  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
140  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
141  };
142 
143  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
144 }
char * UNICHAR::utf8_str ( ) const

Definition at line 122 of file unichar.cpp.

122  {
123  int len = utf8_len();
124  char* str = new char[len + 1];
125  memcpy(str, chars, len);
126  str[len] = 0;
127  return str;
128 }
int utf8_len() const
Definition: unichar.h:70

The documentation for this class was generated from the following files: