20 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
21 #define TESSERACT_CCUTIL_UNICHARSET_H__
40 inline void set_all(
const char *unichar,
int pos,
int total,
bool natural) {
50 inline void set_pos(
int p) { this->pos = p; }
52 inline const char*
get_unichar()
const {
return this->unichar; }
53 inline int get_pos()
const {
return this->pos; }
54 inline int get_total()
const {
return this->total; }
62 return to_string(unichar, pos, total, natural);
67 inline bool equals(
const char *other_unichar,
68 int other_pos,
int other_total)
const {
69 return (strcmp(this->unichar, other_unichar) == 0 &&
70 this->pos == other_pos && this->total == other_total);
81 return (strcmp(this->unichar, fragment->
get_unichar()) == 0 &&
83 this->pos == fragment->
get_pos() + 1);
90 inline bool is_ending()
const {
return this->pos == this->total-1; }
176 int step(
const char* str)
const;
210 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
220 bool eq(
UNICHAR_ID unichar_id,
const char*
const unichar_repr)
const;
224 for (
int i = 0; i < size_used; ++i) {
225 if (unichars[i].properties.fragment !=
NULL) {
226 delete unichars[i].properties.fragment;
227 unichars[i].properties.fragment =
NULL;
234 if (script_table !=
NULL) {
235 for (
int i = 0; i < script_table_size_used; ++i)
236 delete[] script_table[i];
237 delete[] script_table;
239 script_table_size_used = 0;
241 if (unichars !=
NULL) {
246 script_table_size_reserved = 0;
250 top_bottom_set_ =
false;
251 script_has_upper_lower_ =
false;
252 script_has_xheight_ =
false;
269 void reserve(
int unichars_number);
274 FILE* file = fopen(filename,
"w+b");
275 if (file ==
NULL)
return false;
289 bool skip_fragments);
299 FILE* file = fopen(filename,
"rb");
300 if (file ==
NULL)
return false;
337 unichars[unichar_id].properties.isalpha = value;
342 unichars[unichar_id].properties.islower = value;
347 unichars[unichar_id].properties.isupper = value;
352 unichars[unichar_id].properties.isdigit = value;
357 unichars[unichar_id].properties.ispunctuation = value;
362 unichars[unichar_id].properties.isngram = value;
368 unichars[unichar_id].properties.script_id =
add_script(value);
373 unichars[unichar_id].properties.other_case = other_case;
378 unichars[unichar_id].properties.direction = value;
383 unichars[unichar_id].properties.mirror = mirror;
388 unichars[unichar_id].properties.normed = normed;
393 if (INVALID_UNICHAR_ID == unichar_id)
return false;
395 return unichars[unichar_id].properties.isalpha;
400 if (INVALID_UNICHAR_ID == unichar_id)
return false;
402 return unichars[unichar_id].properties.islower;
407 if (INVALID_UNICHAR_ID == unichar_id)
return false;
409 return unichars[unichar_id].properties.isupper;
414 if (INVALID_UNICHAR_ID == unichar_id)
return false;
416 return unichars[unichar_id].properties.isdigit;
421 if (INVALID_UNICHAR_ID == unichar_id)
return false;
423 return unichars[unichar_id].properties.ispunctuation;
428 if (INVALID_UNICHAR_ID == unichar_id)
return false;
430 return unichars[unichar_id].properties.isngram;
439 return top_bottom_set_;
460 int* min_bottom,
int* max_bottom,
461 int* min_top,
int* max_top)
const {
462 if (INVALID_UNICHAR_ID == unichar_id) {
463 *min_bottom = *min_top = 0;
464 *max_bottom = *max_top = 256;
468 *min_bottom = unichars[unichar_id].properties.min_bottom;
469 *max_bottom = unichars[unichar_id].properties.max_bottom;
470 *min_top = unichars[unichar_id].properties.min_top;
471 *max_top = unichars[unichar_id].properties.max_top;
474 int min_bottom,
int max_bottom,
475 int min_top,
int max_top) {
476 unichars[unichar_id].properties.min_bottom =
478 unichars[unichar_id].properties.max_bottom =
480 unichars[unichar_id].properties.min_top =
482 unichars[unichar_id].properties.max_top =
490 int* min_width,
int* max_width)
const {
491 if (INVALID_UNICHAR_ID == unichar_id) {
497 *min_width = unichars[unichar_id].properties.min_width;
498 *max_width = unichars[unichar_id].properties.max_width;
501 unichars[unichar_id].properties.min_width =
503 unichars[unichar_id].properties.max_width =
511 int* min_bearing,
int* max_bearing)
const {
512 if (INVALID_UNICHAR_ID == unichar_id) {
513 *min_bearing = *max_bearing = 0;
517 *min_bearing = unichars[unichar_id].properties.min_bearing;
518 *max_bearing = unichars[unichar_id].properties.max_bearing;
521 int min_bearing,
int max_bearing) {
522 unichars[unichar_id].properties.min_bearing =
524 unichars[unichar_id].properties.max_bearing =
532 int* min_advance,
int* max_advance)
const {
533 if (INVALID_UNICHAR_ID == unichar_id) {
534 *min_advance = *max_advance = 0;
538 *min_advance = unichars[unichar_id].properties.min_advance;
539 *max_advance = unichars[unichar_id].properties.max_advance;
542 int min_advance,
int max_advance) {
543 unichars[unichar_id].properties.min_advance =
545 unichars[unichar_id].properties.max_advance =
553 if (INVALID_UNICHAR_ID == unichar_id)
return null_sid_;
555 return unichars[unichar_id].properties.script_id;
573 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
575 return unichars[unichar_id].properties.other_case;
582 return unichars[unichar_id].properties.direction;
587 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
589 return unichars[unichar_id].properties.mirror;
594 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
596 if (unichars[unichar_id].properties.islower)
return unichar_id;
597 return unichars[unichar_id].properties.other_case;
602 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
604 if (unichars[unichar_id].properties.isupper)
return unichar_id;
605 return unichars[unichar_id].properties.other_case;
611 if (INVALID_UNICHAR_ID == unichar_id)
return NULL;
613 return unichars[unichar_id].properties.fragment;
661 if (unichar_repr ==
NULL || unichar_repr[0] ==
'\0' ||
705 return unichars[unichar_id].properties.normed.string();
719 return script_table_size_used;
724 if (
id >= script_table_size_used ||
id < 0)
726 return script_table[id];
738 return script == null_script;
748 return unichars[unichar_id].properties.enabled;
764 return script_has_upper_lower_;
771 return script_has_xheight_;
776 struct UNICHAR_PROPERTIES {
777 UNICHAR_PROPERTIES();
782 void SetRangesOpen();
784 void SetRangesEmpty();
787 bool AnyRangeEmpty()
const;
789 void ExpandRangesFrom(
const UNICHAR_PROPERTIES& src);
791 void CopyFrom(
const UNICHAR_PROPERTIES& src);
834 struct UNICHAR_SLOT {
836 UNICHAR_PROPERTIES properties;
844 bool GetStrProperties(
const char* utf8_str,
845 UNICHAR_PROPERTIES* props)
const;
851 bool skip_fragments);
853 UNICHAR_SLOT* unichars;
858 int script_table_size_used;
859 int script_table_size_reserved;
860 const char* null_script;
862 bool top_bottom_set_;
864 bool script_has_upper_lower_;
867 bool script_has_xheight_;
884 #endif // TESSERACT_CCUTIL_UNICHARSET_H__
int get_script_id_from_name(const char *script_name) const
bool get_isalpha(UNICHAR_ID unichar_id) const
const char *const id_to_unichar_ext(UNICHAR_ID id) const
void set_natural(bool value)
bool get_ispunctuation(const char *const unichar_repr, int length) const
bool script_has_xheight() const
bool get_isupper(const char *const unichar_repr, int length) const
int add_script(const char *script)
bool top_bottom_useful() const
bool load_from_file(FILE *file)
const char *const id_to_unichar(UNICHAR_ID id) const
void set_isngram(UNICHAR_ID unichar_id, bool value)
bool contains_unichar_id(UNICHAR_ID unichar_id) const
void set_bearing_range(UNICHAR_ID unichar_id, int min_bearing, int max_bearing)
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
void set_isalpha(UNICHAR_ID unichar_id, bool value)
bool get_isprivate(UNICHAR_ID unichar_id) const
void set_islower(UNICHAR_ID unichar_id, bool value)
void get_width_range(UNICHAR_ID unichar_id, int *min_width, int *max_width) const
const char * get_script_from_script_id(int id) const
bool equals(const char *other_unichar, int other_pos, int other_total) const
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
bool equals(const CHAR_FRAGMENT *other) const
bool get_isalpha(const char *const unichar_repr) const
bool load_from_file(const char *const filename)
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
void set_width_range(UNICHAR_ID unichar_id, int min_width, int max_width)
bool get_isngram(UNICHAR_ID unichar_id) const
unsigned int get_properties(const char *const unichar_repr) const
void ExpandRangesFromOther(const UNICHARSET &src)
void set_script(UNICHAR_ID unichar_id, const char *value)
void set_normed(UNICHAR_ID unichar_id, const char *normed)
void set_isdigit(UNICHAR_ID unichar_id, bool value)
void set_all(const char *unichar, int pos, int total, bool natural)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool encodable_string(const char *str, int *first_bad_position) const
void SetPropertiesFromOther(const UNICHARSET &src)
bool get_islower(UNICHAR_ID unichar_id) const
void get_bearing_range(UNICHAR_ID unichar_id, int *min_bearing, int *max_bearing) const
int get_script_table_size() const
bool load_from_file(const char *const filename, bool skip_fragments)
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
bool load_from_inmemory_file(const char *const memory, int mem_size)
void get_advance_range(UNICHAR_ID unichar_id, int *min_advance, int *max_advance) const
STRING debug_str(UNICHAR_ID id) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
int get_script(const char *const unichar_repr) const
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
bool get_islower(const char *const unichar_repr) const
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
STRING debug_str(const char *unichar_repr) const
bool is_null_script(const char *script) const
bool get_isupper(const char *const unichar_repr) const
static const int kMaxChunks
void unichar_insert(const char *const unichar_repr)
int get_script(UNICHAR_ID unichar_id) const
int get_script(const char *const unichar_repr, int length) const
bool get_isalpha(const char *const unichar_repr, int length) const
bool contains(const char *const unichar_repr) const
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
char get_chartype(const char *const unichar_repr) const
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
bool get_islower(const char *const unichar_repr, int length) const
void set_black_and_whitelist(const char *blacklist, const char *whitelist)
bool contains_unichar(const char *const unichar_repr) const
char get_chartype(UNICHAR_ID unichar_id) const
void set_unichar(const char *uch)
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
static CHAR_FRAGMENT * parse_from_string(const char *str)
bool major_right_to_left() const
void set_isupper(UNICHAR_ID unichar_id, bool value)
bool get_enabled(UNICHAR_ID unichar_id) const
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
void reserve(int unichars_number)
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
bool script_has_upper_lower() const
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
int step(const char *str) const
static const char * kCustomLigatures[][2]
Direction get_direction(UNICHAR_ID unichar_id) const
const char * get_unichar() const
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
bool is_beginning() const
bool save_to_file(const char *const filename) const
void AppendOtherUnicharset(const UNICHARSET &src)
static STRING debug_utf8_str(const char *str)
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
bool get_isdigit(const char *const unichar_repr) const
bool get_isdigit(const char *const unichar_repr, int length) const
bool get_ispunctuation(const char *const unichar_repr) const
unsigned int get_properties(UNICHAR_ID unichar_id) const
void set_advance_range(UNICHAR_ID unichar_id, int min_advance, int max_advance)
void delete_pointers_in_unichars()
int direction(EDGEPT *point)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)