Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UNICHARSET Class Reference

#include <unicharset.h>

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT
}
 

Public Member Functions

 UNICHARSET ()
 
 ~UNICHARSET ()
 
const UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
 
const UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
 
int step (const char *str) const
 
bool encodable_string (const char *str, int *first_bad_position) const
 
const char *const id_to_unichar (UNICHAR_ID id) const
 
const char *const id_to_unichar_ext (UNICHAR_ID id) const
 
STRING debug_str (UNICHAR_ID id) const
 
STRING debug_str (const char *unichar_repr) const
 
void unichar_insert (const char *const unichar_repr)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
bool contains_unichar (const char *const unichar_repr) const
 
bool contains_unichar (const char *const unichar_repr, int length) const
 
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
 
void delete_pointers_in_unichars ()
 
void clear ()
 
int size () const
 
void reserve (int unichars_number)
 
bool save_to_file (const char *const filename) const
 
bool save_to_file (FILE *file) const
 
bool load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments)
 
bool load_from_inmemory_file (const char *const memory, int mem_size)
 
bool load_from_file (const char *const filename, bool skip_fragments)
 
bool load_from_file (const char *const filename)
 
bool load_from_file (FILE *file, bool skip_fragments)
 
bool load_from_file (FILE *file)
 
void post_load_setup ()
 
bool major_right_to_left () const
 
void set_black_and_whitelist (const char *blacklist, const char *whitelist)
 
void set_isalpha (UNICHAR_ID unichar_id, bool value)
 
void set_islower (UNICHAR_ID unichar_id, bool value)
 
void set_isupper (UNICHAR_ID unichar_id, bool value)
 
void set_isdigit (UNICHAR_ID unichar_id, bool value)
 
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
 
void set_isngram (UNICHAR_ID unichar_id, bool value)
 
void set_script (UNICHAR_ID unichar_id, const char *value)
 
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
 
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
 
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
 
void set_normed (UNICHAR_ID unichar_id, const char *normed)
 
bool get_isalpha (UNICHAR_ID unichar_id) const
 
bool get_islower (UNICHAR_ID unichar_id) const
 
bool get_isupper (UNICHAR_ID unichar_id) const
 
bool get_isdigit (UNICHAR_ID unichar_id) const
 
bool get_ispunctuation (UNICHAR_ID unichar_id) const
 
bool get_isngram (UNICHAR_ID unichar_id) const
 
bool get_isprivate (UNICHAR_ID unichar_id) const
 
bool top_bottom_useful () const
 
void set_ranges_empty ()
 
void SetPropertiesFromOther (const UNICHARSET &src)
 
void ExpandRangesFromOther (const UNICHARSET &src)
 
void AppendOtherUnicharset (const UNICHARSET &src)
 
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
 
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
 
void get_width_range (UNICHAR_ID unichar_id, int *min_width, int *max_width) const
 
void set_width_range (UNICHAR_ID unichar_id, int min_width, int max_width)
 
void get_bearing_range (UNICHAR_ID unichar_id, int *min_bearing, int *max_bearing) const
 
void set_bearing_range (UNICHAR_ID unichar_id, int min_bearing, int max_bearing)
 
void get_advance_range (UNICHAR_ID unichar_id, int *min_advance, int *max_advance) const
 
void set_advance_range (UNICHAR_ID unichar_id, int min_advance, int max_advance)
 
int get_script (UNICHAR_ID unichar_id) const
 
unsigned int get_properties (UNICHAR_ID unichar_id) const
 
char get_chartype (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
 
Direction get_direction (UNICHAR_ID unichar_id) const
 
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
 
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
 
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
 
bool get_isalpha (const char *const unichar_repr) const
 
bool get_islower (const char *const unichar_repr) const
 
bool get_isupper (const char *const unichar_repr) const
 
bool get_isdigit (const char *const unichar_repr) const
 
bool get_ispunctuation (const char *const unichar_repr) const
 
unsigned int get_properties (const char *const unichar_repr) const
 
char get_chartype (const char *const unichar_repr) const
 
int get_script (const char *const unichar_repr) const
 
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
 
bool get_isalpha (const char *const unichar_repr, int length) const
 
bool get_islower (const char *const unichar_repr, int length) const
 
bool get_isupper (const char *const unichar_repr, int length) const
 
bool get_isdigit (const char *const unichar_repr, int length) const
 
bool get_ispunctuation (const char *const unichar_repr, int length) const
 
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
 
int get_script (const char *const unichar_repr, int length) const
 
int get_script_table_size () const
 
const char * get_script_from_script_id (int id) const
 
int get_script_id_from_name (const char *script_name) const
 
bool is_null_script (const char *script) const
 
int add_script (const char *script)
 
bool get_enabled (UNICHAR_ID unichar_id) const
 
int null_sid () const
 
int common_sid () const
 
int latin_sid () const
 
int cyrillic_sid () const
 
int greek_sid () const
 
int han_sid () const
 
int hiragana_sid () const
 
int katakana_sid () const
 
int default_sid () const
 
bool script_has_upper_lower () const
 
bool script_has_xheight () const
 

Static Public Member Functions

static STRING debug_utf8_str (const char *str)
 

Static Public Attributes

static const char * kCustomLigatures [][2]
 

Detailed Description

Definition at line 127 of file unicharset.h.

Member Enumeration Documentation

Enumerator
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_CHAR_DIRECTION_COUNT 

Definition at line 135 of file unicharset.h.

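135  {
136  U_LEFT_TO_RIGHT = 0,
137  U_RIGHT_TO_LEFT = 1,
138  U_EUROPEAN_NUMBER = 2,
139  U_EUROPEAN_NUMBER_SEPARATOR = 3,
140  U_EUROPEAN_NUMBER_TERMINATOR = 4,
141  U_ARABIC_NUMBER = 5,
142  U_COMMON_NUMBER_SEPARATOR = 6,
143  U_BLOCK_SEPARATOR = 7,
144  U_SEGMENT_SEPARATOR = 8,
145  U_WHITE_SPACE_NEUTRAL = 9,
146  U_OTHER_NEUTRAL = 10,
147  U_LEFT_TO_RIGHT_EMBEDDING = 11,
148  U_LEFT_TO_RIGHT_OVERRIDE = 12,
149  U_RIGHT_TO_LEFT_ARABIC = 13,
150  U_RIGHT_TO_LEFT_EMBEDDING = 14,
151  U_RIGHT_TO_LEFT_OVERRIDE = 15,
152  U_POP_DIRECTIONAL_FORMAT = 16,
153  U_DIR_NON_SPACING_MARK = 17,
154  U_BOUNDARY_NEUTRAL = 18,
155  U_CHAR_DIRECTION_COUNT
156  };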

Constructor & Destructor Documentation

UNICHARSET::UNICHARSET ( )

Definition at line 146 of file unicharset.cpp.

146  :
147  unichars(NULL),
148  ids(),
149  size_used(0),
150  size_reserved(0),
151  script_table(NULL),
152  script_table_size_used(0),
153  null_script("NULL") {
154  clear();
155 }
void clear()
Definition: unicharset.h:233
#define NULL
Definition: host.h:144
UNICHARSET::~UNICHARSET ( )

Definition at line 157 of file unicharset.cpp.

157  {
158  clear();
159 }
void clear()
Definition: unicharset.h:233

Member Function Documentation

int UNICHARSET::add_script ( const char *  script)

Definition at line 866 of file unicharset.cpp.

866  {
867  for (int i = 0; i < script_table_size_used; ++i) {
868  if (strcmp(script, script_table[i]) == 0)
869  return i;
870  }
871  if (script_table_size_reserved == 0) {
872  script_table_size_reserved = 8;
873  script_table = new char*[script_table_size_reserved];
874  }
875  if (script_table_size_used + 1 >= script_table_size_reserved) {
876  char** new_script_table = new char*[script_table_size_reserved * 2];
877  memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
878  delete[] script_table;
879  script_table = new_script_table;
880  script_table_size_reserved = 2 * script_table_size_reserved;
881  }
882  script_table[script_table_size_used] = new char[strlen(script) + 1];
883  strcpy(script_table[script_table_size_used], script);
884  return script_table_size_used++;
885 }
void UNICHARSET::AppendOtherUnicharset ( const UNICHARSET & src)

Definition at line 375 of file unicharset.cpp.

375  {
376  for (int ch = 0; ch < src.size_used; ++ch) {
377  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
378  const char* utf8 = src.id_to_unichar(ch);
379  if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
380  // Only use fully valid entries.
381  tprintf("Bad properties for char %s: %d,%d %d,%d %d,%d %d,%d %d,%d\n",
382  utf8, src_props.min_bottom, src_props.max_bottom,
383  src_props.min_top, src_props.max_top,
384  src_props.min_width, src_props.max_width,
385  src_props.min_bearing, src_props.max_bearing,
386  src_props.min_advance, src_props.max_advance);
387  continue;
388  }
389  int id = size_used;
390  if (contains_unichar(utf8)) {
391  id = unichar_to_id(utf8);
392  } else {
393  unichar_insert(utf8);
394  unichars[id].properties.SetRangesEmpty();
395  }
396  if (!unichars[id].properties.AnyRangeEmpty()) {
397  // Just expand current ranges.
398  unichars[id].properties.ExpandRangesFrom(src_props);
399  } else {
400  // Copy properties from src_props.
401  unichars[id].properties.CopyFrom(src_props);
402  // Setup the script_id, other_case and mirror properly.
403  const char* script = src.get_script_from_script_id(src_props.script_id);
404  unichars[id].properties.script_id = add_script(script);
405  const char* other_case = src.id_to_unichar(src_props.other_case);
406  if (!contains_unichar(other_case)) {
407  unichar_insert(other_case);
408  unichars[size_used - 1].properties.SetRangesEmpty();
409  // Other_case will have its ranges set later as it is contained in src.
410  }
411  unichars[id].properties.other_case = unichar_to_id(other_case);
412  const char* mirror_str = src.id_to_unichar(src_props.mirror);
413  if (!contains_unichar(mirror_str)) {
414  unichar_insert(mirror_str);
415  unichars[size_used - 1].properties.SetRangesEmpty();
416  // Mirror will have its ranges set later as it is contained in src.
417  }
418  unichars[id].properties.mirror = unichar_to_id(mirror_str);
419  }
420  }
421 }
int add_script(const char *script)
Definition: unicharset.cpp:866
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:723
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:511
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:543
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void UNICHARSET::clear ( )
inline

Definition at line 233 of file unicharset.h.

233  {
234  if (script_table != NULL) {
235  for (int i = 0; i < script_table_size_used; ++i)
236  delete[] script_table[i];
237  delete[] script_table;
238  script_table = NULL;
239  script_table_size_used = 0;
240  }
241  if (unichars != NULL) {
242  delete_pointers_in_unichars();
243  delete[] unichars;
244  unichars = NULL;
245  }
246  script_table_size_reserved = 0;
247  size_reserved = 0;
248  size_used = 0;
249  ids.clear();
250  top_bottom_set_ = false;
251  script_has_upper_lower_ = false;
252  script_has_xheight_ = false;
253  null_sid_ = 0;
254  common_sid_ = 0;
255  latin_sid_ = 0;
256  cyrillic_sid_ = 0;
257  greek_sid_ = 0;
258  han_sid_ = 0;
259  hiragana_sid_ = 0;
260  katakana_sid_ = 0;
261  }
#define NULL
Definition: host.h:144
void clear()
Definition: unicharmap.cpp:154
void delete_pointers_in_unichars()
Definition: unicharset.h:223
int UNICHARSET::common_sid ( ) const
inline

Definition at line 753 of file unicharset.h.

753 { return common_sid_; }
bool UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 543 of file unicharset.cpp.

543  {
544  return ids.contains(unichar_repr);
545 }
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
bool UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 547 of file unicharset.cpp.

548  {
549  if (length == 0) {
550  return false;
551  }
552  return ids.contains(unichar_repr, length);
553 }
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
bool UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 209 of file unicharset.h.

209  {
210  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
211  unichar_id >= 0;
212  }
int UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 755 of file unicharset.h.

755 { return cyrillic_sid_; }
STRING UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 285 of file unicharset.cpp.

285  {
286  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
287  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
288  if (fragment) {
289  return fragment->to_string();
290  }
291  const char* str = id_to_unichar(id);
292  STRING result = debug_utf8_str(str);
293  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
294  if (get_isalpha(id)) {
295  if (get_islower(id))
296  result += "a";
297  else if (get_isupper(id))
298  result += "A";
299  else
300  result += "x";
301  }
302  // Append 0 if a digit.
303  if (get_isdigit(id)) {
304  result += "0";
305  }
306  // Append p is a punctuation symbol.
307  if (get_ispunctuation(id)) {
308  result += "p";
309  }
310  return result;
311 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
Definition: strngs.h:40
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:261
static STRING to_string(const char *unichar, int pos, int total, bool natural)
Definition: unicharset.cpp:889
STRING UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 200 of file unicharset.h.

200  {
201  return debug_str(unichar_to_id(unichar_repr));
202  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:285
STRING UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 261 of file unicharset.cpp.

261  {
262  STRING result = str;
263  result += " [";
264  int step = 1;
265  // Chop into unicodes and code each as hex.
266  for (int i = 0; str[i] != '\0'; i += step) {
267  char hex[sizeof(int) * 2 + 1];
268  step = UNICHAR::utf8_step(str + i);
269  if (step == 0) {
270  step = 1;
271  sprintf(hex, "%x", str[i]);
272  } else {
273  UNICHAR ch(str + i, step);
274  sprintf(hex, "%x", ch.first_uni());
275  }
276  result += hex;
277  result += " ";
278  }
279  result += "]";
280  return result;
281 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:131
Definition: strngs.h:40
int step(const char *str) const
Definition: unicharset.cpp:192
int UNICHARSET::default_sid ( ) const
inline

Definition at line 760 of file unicharset.h.

760 { return default_sid_; }
void UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 223 of file unicharset.h.

223  {
224  for (int i = 0; i < size_used; ++i) {
225  if (unichars[i].properties.fragment != NULL) {
226  delete unichars[i].properties.fragment;
227  unichars[i].properties.fragment = NULL;
228  }
229  }
230  }
#define NULL
Definition: host.h:144
bool UNICHARSET::encodable_string ( const char *  str,
int *  first_bad_position 
) const

Definition at line 220 of file unicharset.cpp.

221  {
222  for (int i = 0, len = strlen(str); i < len; ) {
223  int increment = step(str + i);
224  if (increment == 0) {
225  if (first_bad_position) *first_bad_position = i;
226  return false;
227  }
228  i += increment;
229  }
230  return true;
231 }
int step(const char *str) const
Definition: unicharset.cpp:192
bool UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 555 of file unicharset.cpp.

556  {
557  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
558 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
void UNICHARSET::ExpandRangesFromOther ( const UNICHARSET & src)

Definition at line 361 of file unicharset.cpp.

361  {
362  for (int ch = 0; ch < size_used; ++ch) {
363  const char* utf8 = id_to_unichar(ch);
364  UNICHAR_PROPERTIES properties;
365  if (src.GetStrProperties(utf8, &properties)) {
366  // Expand just the ranges from properties.
367  unichars[ch].properties.ExpandRangesFrom(properties);
368  }
369  }
370 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
void UNICHARSET::get_advance_range ( UNICHAR_ID  unichar_id,
int *  min_advance,
int *  max_advance 
) const
inline

Definition at line 531 of file unicharset.h.

532  {
533  if (INVALID_UNICHAR_ID == unichar_id) {
534  *min_advance = *max_advance = 0;
535  return;
536  }
537  ASSERT_HOST(contains_unichar_id(unichar_id));
538  *min_advance = unichars[unichar_id].properties.min_advance;
539  *max_advance = unichars[unichar_id].properties.max_advance;
540  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
void UNICHARSET::get_bearing_range ( UNICHAR_ID  unichar_id,
int *  min_bearing,
int *  max_bearing 
) const
inline

Definition at line 510 of file unicharset.h.

511  {
512  if (INVALID_UNICHAR_ID == unichar_id) {
513  *min_bearing = *max_bearing = 0;
514  return;
515  }
516  ASSERT_HOST(contains_unichar_id(unichar_id));
517  *min_bearing = unichars[unichar_id].properties.min_bearing;
518  *max_bearing = unichars[unichar_id].properties.max_bearing;
519  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
char UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 502 of file unicharset.cpp.

502  {
503  if (this->get_isupper(id)) return 'A';
504  if (this->get_islower(id)) return 'a';
505  if (this->get_isalpha(id)) return 'x';
506  if (this->get_isdigit(id)) return '0';
507  if (this->get_ispunctuation(id)) return 'p';
508  return 0;
509 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
char UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 647 of file unicharset.h.

647  {
648  return get_chartype(unichar_to_id(unichar_repr));
649  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:502
Direction UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 579 of file unicharset.h.

579  {
580  if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
581  ASSERT_HOST(contains_unichar_id(unichar_id));
582  return unichars[unichar_id].properties.direction;
583  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 747 of file unicharset.h.

747  {
748  return unichars[unichar_id].properties.enabled;
749  }
const CHAR_FRAGMENT* UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 610 of file unicharset.h.

610  {
611  if (INVALID_UNICHAR_ID == unichar_id) return NULL;
612  ASSERT_HOST(contains_unichar_id(unichar_id));
613  return unichars[unichar_id].properties.fragment;
614  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define NULL
Definition: host.h:144
#define ASSERT_HOST(x)
Definition: errcode.h:84
const CHAR_FRAGMENT* UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 660 of file unicharset.h.

660  {
661  if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
662  !ids.contains(unichar_repr)) {
663  return NULL;
664  }
665  return get_fragment(unichar_to_id(unichar_repr));
666  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
#define NULL
Definition: host.h:144
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:610
bool UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 392 of file unicharset.h.

392  {
393  if (INVALID_UNICHAR_ID == unichar_id) return false;
394  ASSERT_HOST(contains_unichar_id(unichar_id));
395  return unichars[unichar_id].properties.isalpha;
396  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 617 of file unicharset.h.

617  {
618  return get_isalpha(unichar_to_id(unichar_repr));
619  }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 670 of file unicharset.h.

671  {
672  return get_isalpha(unichar_to_id(unichar_repr, length));
673  }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 413 of file unicharset.h.

413  {
414  if (INVALID_UNICHAR_ID == unichar_id) return false;
415  ASSERT_HOST(contains_unichar_id(unichar_id));
416  return unichars[unichar_id].properties.isdigit;
417  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 632 of file unicharset.h.

632  {
633  return get_isdigit(unichar_to_id(unichar_repr));
634  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
bool UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 691 of file unicharset.h.

692  {
693  return get_isdigit(unichar_to_id(unichar_repr, length));
694  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
bool UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 399 of file unicharset.h.

399  {
400  if (INVALID_UNICHAR_ID == unichar_id) return false;
401  ASSERT_HOST(contains_unichar_id(unichar_id));
402  return unichars[unichar_id].properties.islower;
403  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 622 of file unicharset.h.

622  {
623  return get_islower(unichar_to_id(unichar_repr));
624  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
bool UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 677 of file unicharset.h.

678  {
679  return get_islower(unichar_to_id(unichar_repr, length));
680  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
bool UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 427 of file unicharset.h.

427  {
428  if (INVALID_UNICHAR_ID == unichar_id) return false;
429  ASSERT_HOST(contains_unichar_id(unichar_id));
430  return unichars[unichar_id].properties.isngram;
431  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 316 of file unicharset.cpp.

316  {
317  UNICHAR uc(id_to_unichar(unichar_id), -1);
318  int uni = uc.first_uni();
319  return (uni >= 0xE000 && uni <= 0xF8FF);
320 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
bool UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 420 of file unicharset.h.

420  {
421  if (INVALID_UNICHAR_ID == unichar_id) return false;
422  ASSERT_HOST(contains_unichar_id(unichar_id));
423  return unichars[unichar_id].properties.ispunctuation;
424  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 637 of file unicharset.h.

637  {
638  return get_ispunctuation(unichar_to_id(unichar_repr));
639  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 698 of file unicharset.h.

699  {
700  return get_ispunctuation(unichar_to_id(unichar_repr, length));
701  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
bool UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 406 of file unicharset.h.

406  {
407  if (INVALID_UNICHAR_ID == unichar_id) return false;
408  ASSERT_HOST(contains_unichar_id(unichar_id));
409  return unichars[unichar_id].properties.isupper;
410  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 627 of file unicharset.h.

627  {
628  return get_isupper(unichar_to_id(unichar_repr));
629  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
bool UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 684 of file unicharset.h.

685  {
686  return get_isupper(unichar_to_id(unichar_repr, length));
687  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
UNICHAR_ID UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 586 of file unicharset.h.

586  {
587  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
588  ASSERT_HOST(contains_unichar_id(unichar_id));
589  return unichars[unichar_id].properties.mirror;
590  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
const char* UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 704 of file unicharset.h.

704  {
705  return unichars[unichar_id].properties.normed.string();
706  }
UNICHAR_ID UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 572 of file unicharset.h.

572  {
573  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
574  ASSERT_HOST(contains_unichar_id(unichar_id));
575  return unichars[unichar_id].properties.other_case;
576  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
unsigned int UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 487 of file unicharset.cpp.

487  {
488  unsigned int properties = 0;
489  if (this->get_isalpha(id))
490  properties |= ISALPHA_MASK;
491  if (this->get_islower(id))
492  properties |= ISLOWER_MASK;
493  if (this->get_isupper(id))
494  properties |= ISUPPER_MASK;
495  if (this->get_isdigit(id))
496  properties |= ISDIGIT_MASK;
497  if (this->get_ispunctuation(id))
498  properties |= ISPUNCTUATION_MASK;
499  return properties;
500 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:420
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:413
unsigned int UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 643 of file unicharset.h.

643  {
644  return get_properties(unichar_to_id(unichar_repr));
645  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:487
int UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 552 of file unicharset.h.

552  {
553  if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
554  ASSERT_HOST(contains_unichar_id(unichar_id));
555  return unichars[unichar_id].properties.script_id;
556  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
int UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 654 of file unicharset.h.

654  {
655  return get_script(unichar_to_id(unichar_repr));
656  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:552
int UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 712 of file unicharset.h.

713  {
714  return get_script(unichar_to_id(unichar_repr, length));
715  }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:552
const char* UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 723 of file unicharset.h.

723  {
724  if (id >= script_table_size_used || id < 0)
725  return null_script;
726  return script_table[id];
727  }
int UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 944 of file unicharset.cpp.

944  {
945  for (int i = 0; i < script_table_size_used; ++i) {
946  if (strcmp(script_name, script_table[i]) == 0)
947  return i;
948  }
949  return 0; // 0 is always the null_script
950 }
int UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 718 of file unicharset.h.

718  {
719  return script_table_size_used;
720  }
void UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 459 of file unicharset.h.

461  {
462  if (INVALID_UNICHAR_ID == unichar_id) {
463  *min_bottom = *min_top = 0;
464  *max_bottom = *max_top = 256; // kBlnCellHeight
465  return;
466  }
467  ASSERT_HOST(contains_unichar_id(unichar_id));
468  *min_bottom = unichars[unichar_id].properties.min_bottom;
469  *max_bottom = unichars[unichar_id].properties.max_bottom;
470  *min_top = unichars[unichar_id].properties.min_top;
471  *max_top = unichars[unichar_id].properties.max_top;
472  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
void UNICHARSET::get_width_range ( UNICHAR_ID  unichar_id,
int *  min_width,
int *  max_width 
) const
inline

Definition at line 489 of file unicharset.h.

490  {
491  if (INVALID_UNICHAR_ID == unichar_id) {
492  *min_width = 0;
493  *max_width = 256; // kBlnCellHeight;
494  return;
495  }
496  ASSERT_HOST(contains_unichar_id(unichar_id));
497  *min_width = unichars[unichar_id].properties.min_width;
498  *max_width = unichars[unichar_id].properties.max_width;
499  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
int UNICHARSET::greek_sid ( ) const
inline

Definition at line 756 of file unicharset.h.

756 { return greek_sid_; }
int UNICHARSET::han_sid ( ) const
inline

Definition at line 757 of file unicharset.h.

757 { return han_sid_; }
int UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 758 of file unicharset.h.

758 { return hiragana_sid_; }
const char *const UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 233 of file unicharset.cpp.

233  {
234  if (id == INVALID_UNICHAR_ID) {
235  return INVALID_UNICHAR;
236  }
237  ASSERT_HOST(id < this->size());
238  return unichars[id].representation;
239 }
int size() const
Definition: unicharset.h:264
#define ASSERT_HOST(x)
Definition: errcode.h:84
const char *const UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 241 of file unicharset.cpp.

241  {
242  if (id == INVALID_UNICHAR_ID) {
243  return INVALID_UNICHAR;
244  }
245  ASSERT_HOST(id < this->size());
246  // Resolve from the kCustomLigatures table if this is a private encoding.
247  if (get_isprivate(id)) {
248  const char* ch = id_to_unichar(id);
249  for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
250  if (!strcmp(ch, kCustomLigatures[i][1])) {
251  return kCustomLigatures[i][0];
252  }
253  }
254  }
255  // Otherwise return the stored representation.
256  return unichars[id].representation;
257 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:316
int size() const
Definition: unicharset.h:264
#define NULL
Definition: host.h:144
static const char * kCustomLigatures[][2]
Definition: unicharset.h:132
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 737 of file unicharset.h.

737  {
738  return script == null_script;
739  }
int UNICHARSET::katakana_sid ( ) const
inline

Definition at line 759 of file unicharset.h.

759 { return katakana_sid_; }
int UNICHARSET::latin_sid ( ) const
inline

Definition at line 754 of file unicharset.h.

754 { return latin_sid_; }
bool UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 298 of file unicharset.h.

298  {
299  FILE* file = fopen(filename, "rb");
300  if (file == NULL) return false;
301  bool result = load_from_file(file, skip_fragments);
302  fclose(file);
303  return result;
304  }
#define NULL
Definition: host.h:144
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:298
bool UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 306 of file unicharset.h.

306  {
307  return load_from_file(filename, false);
308  }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:298
bool UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 638 of file unicharset.cpp.

638  {
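639  LocalFilePointer lfp(file);
640  TessResultCallback2<char *, char *, int> *fgets_cb =
641  NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
642  bool success = load_via_fgets(fgets_cb, skip_fragments);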
643  delete fgets_cb;
644  return success;
645 }
char * fgets(char *dst, int size)
Definition: unicharset.cpp:631
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 313 of file unicharset.h.

313 { return load_from_file(file, false); }
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:298
bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size,
bool  skip_fragments 
)

Definition at line 618 of file unicharset.cpp.

619  {
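620  InMemoryFilePointer mem_fp(memory, mem_size);
621  TessResultCallback2<char *, char *, int> *fgets_cb =
622  NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
623  bool success = load_via_fgets(fgets_cb, skip_fragments);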
624  delete fgets_cb;
625  return success;
626 }
char * fgets(char *orig_dst, int size)
Definition: unicharset.cpp:596
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size 
)
inline

Definition at line 291 of file unicharset.h.

291  {
292  return load_from_inmemory_file(memory, mem_size, false);
293  }
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:618
bool UNICHARSET::major_right_to_left ( ) const

Definition at line 813 of file unicharset.cpp.

813  {
814  int ltr_count = 0;
815  int rtl_count = 0;
816  for (int id = 0; id < size_used; ++id) {
817  int dir = get_direction(id);
818  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
819  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
820  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
821  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
822  }
823  return rtl_count > ltr_count;
824 }
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:579
int UNICHARSET::null_sid ( ) const
inline

Definition at line 752 of file unicharset.h.

752 { return null_sid_; }
void UNICHARSET::post_load_setup ( )

Definition at line 750 of file unicharset.cpp.

750  {
751  // Number of alpha chars with the case property minus those without,
752  // in order to determine that half the alpha chars have case.
753  int net_case_alphas = 0;
754  int x_height_alphas = 0;
755  int cap_height_alphas = 0;
756  top_bottom_set_ = false;
757  for (UNICHAR_ID id = 0; id < size_used; ++id) {
758  int min_bottom = 0;
759  int max_bottom = MAX_UINT8;
760  int min_top = 0;
761  int max_top = MAX_UINT8;
762  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
763  if (min_top > 0)
764  top_bottom_set_ = true;
765  if (get_isalpha(id)) {
766  if (get_islower(id) || get_isupper(id))
767  ++net_case_alphas;
768  else
769  --net_case_alphas;
770  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
771  ++x_height_alphas;
772  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
773  ++cap_height_alphas;
774  }
775  }
776 
777  script_has_upper_lower_ = net_case_alphas > 0;
778  script_has_xheight_ = script_has_upper_lower_ ||
779  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
780  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
781 
782  null_sid_ = get_script_id_from_name(null_script);
783  ASSERT_HOST(null_sid_ == 0);
784  common_sid_ = get_script_id_from_name("Common");
785  latin_sid_ = get_script_id_from_name("Latin");
786  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
787  greek_sid_ = get_script_id_from_name("Greek");
788  han_sid_ = get_script_id_from_name("Han");
789  hiragana_sid_ = get_script_id_from_name("Hiragana");
790  katakana_sid_ = get_script_id_from_name("Katakana");
791 
792  // Compute default script. Use the highest-counting alpha script, that is
793  // not the common script, as that still contains some "alphas".
794  int* script_counts = new int[script_table_size_used];
795  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
796  for (int id = 0; id < size_used; ++id) {
797  if (get_isalpha(id)) {
798  ++script_counts[get_script(id)];
799  }
800  }
801  default_sid_ = 0;
802  for (int s = 1; s < script_table_size_used; ++s) {
803  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
804  default_sid_ = s;
805  }
806  delete [] script_counts;
807 }
int get_script_id_from_name(const char *script_name) const
Definition: unicharset.cpp:944
#define MAX_UINT8
Definition: host.h:121
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:392
int UNICHAR_ID
Definition: unichar.h:31
const double kMinCapHeightFraction
Definition: unicharset.cpp:52
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:399
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:406
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:459
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:552
const double kMinXHeightFraction
Definition: unicharset.cpp:51
#define ASSERT_HOST(x)
Definition: errcode.h:84
void UNICHARSET::reserve ( int  unichars_number)

Definition at line 161 of file unicharset.cpp.

161  {
162  if (unichars_number > size_reserved) {
163  UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
164  for (int i = 0; i < size_used; ++i)
165  unichars_new[i] = unichars[i];
166  for (int j = size_used; j < unichars_number; ++j) {
167  unichars_new[j].properties.script_id = add_script(null_script);
168  }
169  delete[] unichars;
170  unichars = unichars_new;
171  size_reserved = unichars_number;
172  }
173 }
int add_script(const char *script)
Definition: unicharset.cpp:866
bool UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 273 of file unicharset.h.

273  {
274  FILE* file = fopen(filename, "w+b");
275  if (file == NULL) return false;
276  bool result = save_to_file(file);
277  fclose(file);
278  return result;
279  }
#define NULL
Definition: host.h:144
bool save_to_file(const char *const filename) const
Definition: unicharset.h:273
bool UNICHARSET::save_to_file ( FILE *  file) const

Definition at line 560 of file unicharset.cpp.

560  {
561  fprintf(file, "%d\n", this->size());
562  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
563  int min_bottom, max_bottom, min_top, max_top;
564  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
565  int min_width, max_width;
566  get_width_range(id, &min_width, &max_width);
567  int min_bearing, max_bearing;
568  get_bearing_range(id, &min_bearing, &max_bearing);
569  int min_advance, max_advance;
570  get_advance_range(id, &min_advance, &max_advance);
571  unsigned int properties = this->get_properties(id);
572  if (strcmp(this->id_to_unichar(id), " ") == 0) {
573  fprintf(file, "%s %x %s %d\n", "NULL", properties,
574  this->get_script_from_script_id(this->get_script(id)),
575  this->get_other_case(id));
576  } else {
577  fprintf(file,
578  "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
579  this->id_to_unichar(id), properties,
580  min_bottom, max_bottom, min_top, max_top, min_width, max_width,
581  min_bearing, max_bearing, min_advance, max_advance,
582  this->get_script_from_script_id(this->get_script(id)),
583  this->get_other_case(id), this->get_direction(id),
584  this->get_mirror(id), this->get_normed_unichar(id),
585  this->debug_str(id).string());
586  }
587  }
588  return true;
589 }
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
int UNICHAR_ID
Definition: unichar.h:31
int size() const
Definition: unicharset.h:264
void get_width_range(UNICHAR_ID unichar_id, int *min_width, int *max_width) const
Definition: unicharset.h:489
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:723
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
void get_bearing_range(UNICHAR_ID unichar_id, int *min_bearing, int *max_bearing) const
Definition: unicharset.h:510
void get_advance_range(UNICHAR_ID unichar_id, int *min_advance, int *max_advance) const
Definition: unicharset.h:531
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:285
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:459
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:572
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:552
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:586
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:579
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:487
bool UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 763 of file unicharset.h.

763  {
764  return script_has_upper_lower_;
765  }
bool UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 770 of file unicharset.h.

770  {
771  return script_has_xheight_;
772  }
void UNICHARSET::set_advance_range ( UNICHAR_ID  unichar_id,
int  min_advance,
int  max_advance 
)
inline

Definition at line 541 of file unicharset.h.

542  {
543  unichars[unichar_id].properties.min_advance =
544  static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16));
545  unichars[unichar_id].properties.max_advance =
546  static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
547  }
short inT16
Definition: host.h:100
#define MAX_INT16
Definition: host.h:119
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:64
void UNICHARSET::set_bearing_range ( UNICHAR_ID  unichar_id,
int  min_bearing,
int  max_bearing 
)
inline

Definition at line 520 of file unicharset.h.

521  {
522  unichars[unichar_id].properties.min_bearing =
523  static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16));
524  unichars[unichar_id].properties.max_bearing =
525  static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16));
526  }
short inT16
Definition: host.h:100
#define MAX_INT16
Definition: host.h:119
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:64
void UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist 
)

Definition at line 829 of file unicharset.cpp.

830  {
831  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
832  // Set everything to default
833  for (int ch = 0; ch < size_used; ++ch)
834  unichars[ch].properties.enabled = def_enabled;
835  int ch_step;
836  if (!def_enabled) {
837  // Enable the whitelist.
838  for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
839  ch_step = step(whitelist + w_ind);
840  if (ch_step > 0) {
841  UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
842  if (u_id != INVALID_UNICHAR_ID) {
843  unichars[u_id].properties.enabled = true;
844  }
845  } else {
846  ch_step = 1;
847  }
848  }
849  }
850  if (blacklist != NULL && blacklist[0] != '\0') {
851  // Disable the blacklist.
852  for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
853  ch_step = step(blacklist + b_ind);
854  if (ch_step > 0) {
855  UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
856  if (u_id != INVALID_UNICHAR_ID) {
857  unichars[u_id].properties.enabled = false;
858  }
859  } else {
860  ch_step = 1;
861  }
862  }
863  }
864 }
int UNICHAR_ID
Definition: unichar.h:31
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
#define NULL
Definition: host.h:144
int step(const char *str) const
Definition: unicharset.cpp:192
void UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 377 of file unicharset.h.

377  {
378  unichars[unichar_id].properties.direction = value;
379  }
void UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 336 of file unicharset.h.

336  {
337  unichars[unichar_id].properties.isalpha = value;
338  }
void UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 351 of file unicharset.h.

351  {
352  unichars[unichar_id].properties.isdigit = value;
353  }
void UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 341 of file unicharset.h.

341  {
342  unichars[unichar_id].properties.islower = value;
343  }
void UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 361 of file unicharset.h.

361  {
362  unichars[unichar_id].properties.isngram = value;
363  }
void UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 356 of file unicharset.h.

356  {
357  unichars[unichar_id].properties.ispunctuation = value;
358  }
void UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 346 of file unicharset.h.

346  {
347  unichars[unichar_id].properties.isupper = value;
348  }
void UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 382 of file unicharset.h.

382  {
383  unichars[unichar_id].properties.mirror = mirror;
384  }
void UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 387 of file unicharset.h.

387  {
388  unichars[unichar_id].properties.normed = normed;
389  }
void UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 372 of file unicharset.h.

372  {
373  unichars[unichar_id].properties.other_case = other_case;
374  }
void UNICHARSET::set_ranges_empty ( )

Definition at line 324 of file unicharset.cpp.

324  {
325  for (int id = 0; id < size_used; ++id) {
326  unichars[id].properties.SetRangesEmpty();
327  }
328 }
void UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 367 of file unicharset.h.

367  {
368  unichars[unichar_id].properties.script_id = add_script(value);
369  }
int add_script(const char *script)
Definition: unicharset.cpp:866
void UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 473 of file unicharset.h.

475  {
476  unichars[unichar_id].properties.min_bottom =
477  static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
478  unichars[unichar_id].properties.max_bottom =
479  static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
480  unichars[unichar_id].properties.min_top =
481  static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
482  unichars[unichar_id].properties.max_top =
483  static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
484  }
#define MAX_UINT8
Definition: host.h:121
unsigned char uinT8
Definition: host.h:99
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:64
void UNICHARSET::set_width_range ( UNICHAR_ID  unichar_id,
int  min_width,
int  max_width 
)
inline

Definition at line 500 of file unicharset.h.

500  {
501  unichars[unichar_id].properties.min_width =
502  static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16));
503  unichars[unichar_id].properties.max_width =
504  static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16));
505  }
short inT16
Definition: host.h:100
#define MAX_INT16
Definition: host.h:119
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:64
void UNICHARSET::SetPropertiesFromOther ( const UNICHARSET src)

Definition at line 333 of file unicharset.cpp.

333  {
334  for (int ch = 0; ch < size_used; ++ch) {
335  const char* utf8 = id_to_unichar(ch);
336  UNICHAR_PROPERTIES properties;
337  if (src.GetStrProperties(utf8, &properties)) {
338  // Setup the script_id, other_case, and mirror properly.
339  const char* script = src.get_script_from_script_id(properties.script_id);
340  properties.script_id = add_script(script);
341  const char* other_case = src.id_to_unichar(properties.other_case);
342  if (contains_unichar(other_case)) {
343  properties.other_case = unichar_to_id(other_case);
344  } else {
345  properties.other_case = ch;
346  }
347  const char* mirror_str = src.id_to_unichar(properties.mirror);
348  if (contains_unichar(mirror_str)) {
349  properties.mirror = unichar_to_id(mirror_str);
350  } else {
351  properties.mirror = ch;
352  }
353  unichars[ch].properties.CopyFrom(properties);
354  }
355  }
356 }
int add_script(const char *script)
Definition: unicharset.cpp:866
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:233
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:176
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:723
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:543
int UNICHARSET::size ( ) const
inline

Definition at line 264 of file unicharset.h.

264  {
265  return size_used;
266  }
int UNICHARSET::step ( const char *  str) const

Definition at line 192 of file unicharset.cpp.

192  {
193  // Find the length of the first matching unicharset member.
194  int minlength = ids.minmatch(str);
195  if (minlength == 0)
196  return 0; // Empty string or illegal char.
197 
198  int goodlength = minlength;
199  while (goodlength <= UNICHAR_LEN) {
200  if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
201  return goodlength; // This length works!
202 
203  // The next char is illegal so find the next usable length.
204  do {
205  ++goodlength;
206  } while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
207  !ids.contains(str, goodlength));
208  if (goodlength > UNICHAR_LEN || !ids.contains(str, goodlength)) {
209  // This does not constitute a good length!
210  return minlength;
211  }
212  }
213  // Search to find a subsequent legal char failed so return the minlength.
214  return minlength;
215 }
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:140
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
#define UNICHAR_LEN
Definition: unichar.h:28
UNICHAR_ID UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 593 of file unicharset.h.

593  {
594  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
595  ASSERT_HOST(contains_unichar_id(unichar_id));
596  if (unichars[unichar_id].properties.islower) return unichar_id;
597  return unichars[unichar_id].properties.other_case;
598  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
UNICHAR_ID UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 601 of file unicharset.h.

601  {
602  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
603  ASSERT_HOST(contains_unichar_id(unichar_id));
604  if (unichars[unichar_id].properties.isupper) return unichar_id;
605  return unichars[unichar_id].properties.other_case;
606  }
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:209
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 438 of file unicharset.h.

438  {
439  return top_bottom_set_;
440  }
void UNICHARSET::unichar_insert ( const char *const  unichar_repr)

Definition at line 511 of file unicharset.cpp.

511  {
512  if (!ids.contains(unichar_repr)) {
513  if (strlen(unichar_repr) > UNICHAR_LEN) {
514  fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
515  int(strlen(unichar_repr)), unichar_repr);
516  return;
517  }
518  if (size_used == size_reserved) {
519  if (size_used == 0)
520  reserve(8);
521  else
522  reserve(2 * size_used);
523  }
524 
525  strcpy(unichars[size_used].representation, unichar_repr);
526  this->set_script(size_used, null_script);
527  // If the given unichar_repr represents a fragmented character, set
528  // fragment property to a pointer to CHAR_FRAGMENT class instance with
529  // information parsed from the unichar representation. Use the script
530  // of the base unichar for the fragmented character if possible.
531  CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
532  this->unichars[size_used].properties.fragment = frag;
533  if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
534  this->unichars[size_used].properties.script_id =
535  this->get_script(frag->get_unichar());
536  }
537  this->unichars[size_used].properties.enabled = true;
538  ids.insert(unichar_repr, size_used);
539  ++size_used;
540  }
541 }
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:367
#define NULL
Definition: host.h:144
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:552
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:543
static CHAR_FRAGMENT * parse_from_string(const char *str)
Definition: unicharset.cpp:902
void reserve(int unichars_number)
Definition: unicharset.cpp:161
#define UNICHAR_LEN
Definition: unichar.h:28
const char * get_unichar() const
Definition: unicharset.h:52
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:76
const UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 176 of file unicharset.cpp.

176  {
177  return ids.contains(unichar_repr) ?
178  ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
179 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharmap.cpp:36
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
const UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 181 of file unicharset.cpp.

182  {
183  assert(length > 0 && length <= UNICHAR_LEN);
184  return ids.contains(unichar_repr, length) ?
185  ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
186 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharmap.cpp:36
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:101
#define UNICHAR_LEN
Definition: unichar.h:28

Member Data Documentation

const char * UNICHARSET::kCustomLigatures
static
Initial value:
= {
{"ct", "\uE003"},
{"ſh", "\uE006"},
{"ſi", "\uE007"},
{"ſl", "\uE008"},
{"ſſ", "\uE009"},
{NULL, NULL}
}

Definition at line 132 of file unicharset.h.


The documentation for this class was generated from the following files: