44 #pragma warning(disable:4244) // Conversion warnings
45 #pragma warning(disable:4800) // int/bool warnings
// NOTE(review): named as an upper bound on word size; nothing in this excerpt
// uses it, so its exact unit (characters, unichars, bytes) should be confirmed.
50 #define MAX_WERD_SIZE 100
// Reads the Certainty field of the first node of Choices, viewing that node
// as a VIABLE_CHOICE. The macro performs no emptiness check of its own.
// NOTE(review): presumably callers must ensure Choices is non-empty — confirm.
63 #define BestCertainty(Choices) \
64 (((VIABLE_CHOICE) first_node (Choices))->Certainty)
// Reads the Rating field of the first node of Choices, viewing that node as a
// VIABLE_CHOICE. The macro performs no emptiness check of its own.
// NOTE(review): presumably callers must ensure Choices is non-empty — confirm.
66 #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating)
// Reads the AdjustFactor field of the first node of Choices, viewing that node
// as a VIABLE_CHOICE. The macro performs no emptiness check of its own.
// NOTE(review): presumably callers must ensure Choices is non-empty — confirm.
68 #define BestFactor(Choices) \
69 (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
76 static int CmpChoiceRatings(
void *arg1,
83 return (R1 < R2) ? -1 : 1;
93 ExpandedChoice->
Choice = Choice;
94 for (i = 0, Chunk = 0; i < Choice->
Length; i++)
123 const float certainties[],
131 for (
int i = 0, bw_idx = 0; i < word_choice.
length(); i++, bw_idx++) {
132 int blob_width = pieces_state[bw_idx];
138 blob_width = pieces_state[++bw_idx];
139 assert(blob_width > 0);
155 for (
int i = 0; i < src_choices.
size(); ++i) {
156 BLOB_CHOICE_LIST *cc_list =
new BLOB_CHOICE_LIST();
158 list_it.add_after_then_move(cc_list);
176 for (i = 0, Chunk = 0; i < Choice->
Length; i++) {
195 bool *modified_blobs) {
198 if (modified_blobs !=
NULL) *modified_blobs =
false;
203 if (BestChoice->
length() == 0)
207 cprintf(
"AcceptableChoice(): a choice with fragments beats BestChoice");
214 Choices, modified_blobs));
219 tprintf(
"\nStopper: %s (word=%c, case=%c)\n",
221 (is_valid_word ?
'y' :
'n'),
222 (is_case_ok ?
'y' :
'n'));
225 if (reject_offset_ <= 0.0
f && !is_valid_word)
return false;
226 if (is_valid_word && is_case_ok) {
235 tprintf(
"Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
236 BestChoice->
certainty(), CertaintyThreshold);
238 if (no_dang_ambigs &&
239 BestChoice->
certainty() > CertaintyThreshold &&
244 tprintf(
"AcceptableChoice() returned false"
245 " (no_dang_ambig:%d cert:%g thresh:%g uniform:%d)\n",
259 tprintf(
"\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
270 cprintf(
"AcceptableResult(): a choice with fragments beats BestChoice\n");
283 cprintf (
"Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
284 BestChoice.
certainty(), CertaintyThreshold);
286 if (BestChoice.
certainty() > CertaintyThreshold &&
302 Alternatives =
list_rest (best_choices_);
312 return (best_choices_ !=
NIL_LIST &&
333 char LabelString[80];
342 if (best_raw_choice_)
346 Choices = best_choices_;
348 cprintf(
"\nBest Cooked Choices:\n");
350 sprintf(LabelString,
"Cooked Choice #%d: ", i);
359 int label_num_unichars) {
363 (label_num_unichars > 1 || Choice->
Length > 1)) {
364 for (
int i = 0; i < Choice->
Length; i++) {
369 fprintf(file,
"\t%s\t%.4f\t%.4f\n", label,
384 ExpandChoice(best_choice, &BestChoice);
390 &BestChoice, is_bad));
405 assert (best_raw_choice_ !=
NULL);
407 ExpandChoice(best_raw_choice_, &BestRaw);
410 for (i = 0, Chunk = 0; i < Choice->
Length; i++, Thresholds++) {
421 if (NumErrorChunks > 0) {
422 AvgRating /= NumErrorChunks;
426 *Thresholds = MaxRating;
428 if (*Thresholds > MaxRating)
429 *Thresholds = MaxRating;
430 if (*Thresholds < MinRating)
431 *Thresholds = MinRating;
438 if (best_raw_choice_)
439 delete best_raw_choice_;
440 best_raw_choice_ =
NULL;
452 for (BlobWidth = current_segmentation_,
454 BlobWidth <
End; *BlobWidth++ = 1);
465 for (Segmentation = current_segmentation_; *BlobWidth != 0;
466 BlobWidth++, Segmentation++)
467 *Segmentation = *BlobWidth;
473 if (best_raw_choice_)
AddNewChunk(best_raw_choice_, Blob);
474 Choices = best_choices_;
478 Choices = raw_choices_;
485 const float Certainties[],
493 if (!keep_word_choices_)
497 if (!best_raw_choice_) {
500 }
else if (WordChoice->
rating() < best_raw_choice_->
Rating) {
505 delete best_raw_choice_;
511 ChoicesList = raw_choices_;
513 ChoicesList = best_choices_;
527 tprintf(
"Discarding choice \"%s\" with an overly low certainty"
528 " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
540 Choices = ChoicesList;
562 ChoicesList =
s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
575 raw_choices_ = ChoicesList;
577 best_choices_ = ChoicesList;
583 bool fix_replaceable,
585 bool *modified_blobs) {
587 tprintf(
"\nRunning NoDangerousAmbig() for %s\n",
595 bool modified_best_choice =
false;
596 bool ambigs_found =
false;
612 for (
int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
613 bool replace = (fix_replaceable && pass == 0);
621 for (i = 0; i < best_choice->
length(); ++i) {
622 BLOB_CHOICE_LIST *lst =
new BLOB_CHOICE_LIST();
623 BLOB_CHOICE_IT lst_it(lst);
627 0.0, 0.0, -1, -1, -1, 0, 1,
false));
632 int wrong_ngram_index;
635 for (i = 0; i < best_choice->
length(); ++i) {
639 tprintf(
"Looking for %s ngrams starting with %s:\n",
640 replace ?
"replaceable" :
"ambiguous",
643 wrong_ngram_index = 0;
644 wrong_ngram[wrong_ngram_index] = curr_unichar_id;
645 if (curr_unichar_id == INVALID_UNICHAR_ID ||
646 curr_unichar_id >= table.
size() ||
647 table[curr_unichar_id] ==
NULL) {
650 AmbigSpec_IT spec_it(table[curr_unichar_id]);
651 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
652 const AmbigSpec *ambig_spec = spec_it.data();
653 wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
659 tprintf(
"current ngram from spec: ");
661 tprintf(
"comparison result: %d\n", compare);
667 blob_index, blob_index+wrong_ngram_index, replace,
670 tprintf(
"fixpt+=(%d %d %d %d)\n", blob_index,
671 blob_index+wrong_ngram_index,
false,
679 tprintf(
"replace ambiguity with: ");
685 best_choice, blob_choices, modified_blobs);
686 modified_best_choice =
true;
695 for (
int tmp_index = 0; tmp_index <= wrong_ngram_index;
704 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
707 -1, -1, -1, 0, 1,
false));
711 }
else if (compare == -1) {
713 ((next_index = wrong_ngram_index+1+i) < best_choice->
length())) {
716 wrong_ngram[++wrong_ngram_index] =
732 tprintf(
"\nResulting ambig_blob_choices:\n");
733 for (i = 0; i < ambig_blob_choices.
length(); ++i) {
739 ambigs_found = (alt_word->
rating() < 0.0);
742 tprintf (
"Stopper: Possible ambiguous word = %s\n",
751 for (i = 0; i < alt_word->
length(); ++i) {
752 bool replacement_is_ngram =
756 (orig_i == end_i && replacement_is_ngram)) {
758 replacement_is_ngram));
760 tprintf(
"fixpt->dangerous+=(%d %d %d %d)\n", orig_i, end_i,
761 true, replacement_is_ngram);
770 if (output_ambig_words_file_ !=
NULL) {
771 fprintf(output_ambig_words_file_,
"\n");
775 return !ambigs_found;
781 reject_offset_ = 0.0;
790 for (i = 0, LastChunk = 0; i < Choice->
Length; i++) {
792 if (Blob < LastChunk) {
797 cprintf (
"AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
798 Choice->
Length, LastChunk, Blob);
805 bool *modified_blobs) {
806 int num_blobs_to_replace = 0;
807 int begin_blob_index = 0;
809 for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
810 if (i >= wrong_ngram_begin_index) {
817 int temp_blob_index = begin_blob_index;
818 const char *temp_uch =
NULL;
819 const char *correct_ngram_str =
821 for (
int replaced_count = 0; replaced_count < wrong_ngram_size;
823 if (blob_choices !=
NULL) {
827 for (i = 0; i < fraglen; ++i) {
834 bit.set_to_list(blob_choices->
get(temp_blob_index));
837 temp_blob_index - begin_blob_index,
838 num_blobs_to_replace,
false);
845 for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
846 if (bit.data()->unichar_id() == correct_frag_uch_id) {
849 if (bit.data()->unichar_id() == uch_id) {
850 bit.add_before_then_move(
new BLOB_CHOICE(*(bit.data())));
851 bit.data()->set_unichar_id(correct_frag_uch_id);
852 if (modified_blobs !=
NULL) *modified_blobs =
true;
861 if (replaced_count + 1 == wrong_ngram_size) {
863 num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index);
869 *modified_blobs && blob_choices !=
NULL) {
870 werd_choice->
print(
"ReplaceAmbig() ");
871 tprintf(
"Modified blob_choices: ");
872 for (
int i = 0; i < blob_choices->
size(); ++i) {
886 for (
int w = 0; w < WordChoice.
length(); ++w) {
889 }
else if (curr_len > 0) {
890 if (curr_len < shortest) shortest = curr_len;
894 if (curr_len > 0 && curr_len < shortest) {
904 const float Certainties[]) {
905 int Length = WordChoice.
length();
906 assert (Length <= MAX_NUM_CHUNKS && Length > 0);
914 fprintf (File,
"%s", Label);
915 fprintf(File,
"(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ",
919 for (i = 0; i < Choice->
Length; i++)
923 for (i = 0; i < Choice->
Length; i++) {
930 for (i = 0; i < Choice->
Length; i++) {
932 fprintf(File,
"%3d ", (
int) (Choice->
Blob[i].
Certainty * -10.0));
936 for (i = 0; i < Choice->
Length; i++) {
944 FLOAT32 AdjustFactor,
const float Certainties[],
946 ViableChoice->
Init(WordChoice, current_segmentation_, Certainties,
958 for (i = 0, CharChoice = &(ViableChoice->
Blob[0]);
959 i < ViableChoice->
Length; CharChoice++, i++) {
968 const char *String_lengths,
972 int current_unichar_length;
974 for (Char = &(ViableChoice->
Blob[0]), i = 0;
975 i < ViableChoice->Length;
976 String += *(String_lengths++), Char++, i++) {
978 if (current_unichar_length != *String_lengths ||
980 current_unichar_length) != 0)
983 return (*String == 0) ?
true :
false;
990 float CertaintyThreshold;
997 WordLength = Choices.
length();
1001 TotalCertainty = TotalCertaintySquared = 0.0;
1002 BLOB_CHOICE_IT BlobChoiceIt;
1003 for (
int i = 0; i < Choices.
length(); ++i) {
1004 BlobChoiceIt.set_to_list(Choices.
get(i));
1005 Certainty = BlobChoiceIt.data()->certainty();
1006 TotalCertainty += Certainty;
1007 TotalCertaintySquared += Certainty * Certainty;
1008 if (Certainty < WorstCertainty)
1009 WorstCertainty = Certainty;
1014 TotalCertainty -= WorstCertainty;
1015 TotalCertaintySquared -= WorstCertainty * WorstCertainty;
1017 Mean = TotalCertainty / WordLength;
1018 Variance = ((WordLength * TotalCertaintySquared -
1019 TotalCertainty * TotalCertainty) /
1020 (WordLength * (WordLength - 1)));
1023 StdDev = sqrt (Variance);
1029 if (BestChoice.
certainty() < CertaintyThreshold) {
1031 cprintf(
"Stopper: Non-uniform certainty = %4.1f"
1032 " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
void delete_data_pointers()
bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, WERD_CHOICE *BestChoice, DANGERR *fixpt, ACCEPTABLE_CHOICE_CALLER caller, bool *modified_blobs)
Returns true if the given best_choice is good enough to stop.
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
#define BestFactor(Choices)
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
bool get_isalpha(UNICHAR_ID unichar_id) const
void EndDangerousAmbigs()
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
void set_rating(float new_val)
double StopperAmbigThreshold(double f1, double f2)
void FillViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], VIABLE_CHOICE ViableChoice)
const char *const id_to_unichar(UNICHAR_ID id) const
void DeleteViableChoiceStruct(void *vcs)
UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS]
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
const STRING debug_string() const
double stopper_phase2_certainty_rejection_offset
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
int is_same_node(void *item1, void *item2)
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
void SetBlobChoices(const BLOB_CHOICE_LIST_VECTOR &src_choices)
bool get_isngram(UNICHAR_ID unichar_id) const
const int GetMaxFixedLengthDawgIndex() const
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
void Init(const WERD_CHOICE &word_choice, const PIECES_STATE &pieces_state, const float certainties[], FLOAT32 adjust_factor)
#define BestCertainty(Choices)
LIST delete_d(LIST list, void *key, int_compare is_equal)
const UNICHARSET & getUnicharset() const
void AddNewChunk(VIABLE_CHOICE Choice, int Blob)
bool CurrentWordAmbig()
Returns true if there are multiple good choices for the current word.
const UnicharAmbigsVector & dang_ambigs() const
int tessedit_truncate_wordchoice_log
#define set_rest(l, cell)
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice)
Dumps a text representation of the specified Choice to File.
void ClearBestChoiceAccum()
Clears best_choices_ list accumulated by the stopper.
VIABLE_CHOICE_STRUCT * VIABLE_CHOICE
int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, const WERD_CHOICE &BestChoice)
UNICHAR_ID correct_ngram_id
void * nth_cell(LIST var_list, int item_num)
bool StringSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice)
void destroy_nodes(LIST list, void_dest destructor)
int FreeBadChoice(void *item1, void *item2)
VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[])
FLOAT32 CurrentBestChoiceAdjustFactor()
Returns the adjustment factor for the best choice for the current word.
void unichar_insert(const char *const unichar_repr)
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs)
bool AlternativeChoicesWorseThan(FLOAT32 Threshold)
double stopper_allowable_character_badness
double stopper_ambiguity_threshold_offset
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
static int compare(const UNICHAR_ID array1[], const UNICHAR_ID array2[])
BLOB_CHOICE_LIST_CLIST * blob_choices
bool ComposedFromCharFragments
char * word_to_debug_lengths
const char * string() const
#define BestRating(Choices)
bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice)
Returns true if WordChoice is the same as the current best choice.
int stopper_smallword_size
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
void FindClassifierErrors(FLOAT32 MinRating, FLOAT32 MaxRating, FLOAT32 RatingMargin, FLOAT32 Thresholds[])
double stopper_certainty_per_char
static const float kBadRating
bool AcceptableResult(const WERD_CHOICE &BestChoice)
double stopper_nondict_certainty_base
void remove_unichar_id(int index)
DLLSYM void tprintf(const char *format,...)
const char fragment_length(int index) const
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
bool fragment_mark() const
void LogNewSplit(int Blob)
LIST s_adjoin(LIST var_list, void *variable, int_compare compare)
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, BLOB_CHOICE_LIST_VECTOR *blob_choices, bool *modified_blobs)
void PrintAmbigAlternatives(FILE *file, const char *label, int label_num_unichars)
Print all the choices in raw_choices_ list for non 1-1 ambiguities.
void set_unichar_id(UNICHAR_ID unichar_id, int index)
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
bool stopper_no_acceptable_choices
float ChunkCertainty[MAX_NUM_CHUNKS]
static BLOB_CHOICE * deep_copy(const BLOB_CHOICE *src)
void LogNewSegmentation(PIECES_STATE BlobWidth)
void DebugWordChoices()
Prints the current choices for this word to stdout.
uinT8 PIECES_STATE[MAX_NUM_CHUNKS+2]
void cprintf(const char *format,...)
const UnicharAmbigs & getUnicharAmbigs()
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int end_char_choice_index)
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
const UNICHAR_ID unichar_id(int index) const
const UnicharAmbigsVector & replace_ambigs() const
void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice, const BLOB_CHOICE_LIST_VECTOR &blob_choices)