54 file_name = data_file_path +
lang;
55 file_name +=
".cube.bigrams";
64 if (char_bigrams_obj ==
NULL) {
65 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): could not create "
66 "character bigrams object.\n");
69 CharBigramTable *table = &char_bigrams_obj->bigram_table_;
73 table->char_bigram =
NULL;
76 vector<string> str_vec;
79 for (
int big = 0; big < str_vec.size(); big++) {
83 if (sscanf(str_vec[big].c_str(),
"%d %x %x", &cnt, &ch1, &ch2) != 3) {
84 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): invalid format "
85 "reading line: %s\n", str_vec[big].c_str());
90 if (ch1 > table->max_char) {
91 CharBigram *char_bigram =
new CharBigram[ch1 + 1];
92 if (char_bigram ==
NULL) {
93 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): error allocating "
94 "additional memory for character bigram table.\n");
98 if (table->char_bigram !=
NULL && table->max_char >= 0) {
99 memcpy(char_bigram, table->char_bigram,
100 (table->max_char + 1) *
sizeof(*char_bigram));
102 delete []table->char_bigram;
104 table->char_bigram = char_bigram;
107 for (
int new_big = table->max_char + 1; new_big <= ch1; new_big++) {
108 table->char_bigram[new_big].total_cnt = 0;
109 table->char_bigram[new_big].max_char = -1;
110 table->char_bigram[new_big].bigram =
NULL;
112 table->max_char = ch1;
115 if (ch2 > table->char_bigram[ch1].max_char) {
116 Bigram *bigram =
new Bigram[ch2 + 1];
117 if (bigram ==
NULL) {
118 fprintf(stderr,
"Cube ERROR (CharBigrams::Create): error allocating "
119 "memory for bigram.\n");
123 if (table->char_bigram[ch1].bigram !=
NULL &&
124 table->char_bigram[ch1].max_char >= 0) {
125 memcpy(bigram, table->char_bigram[ch1].bigram,
126 (table->char_bigram[ch1].max_char + 1) *
sizeof(*bigram));
127 delete []table->char_bigram[ch1].bigram;
129 table->char_bigram[ch1].bigram = bigram;
132 for (
int new_big = table->char_bigram[ch1].max_char + 1;
133 new_big <= ch2; new_big++) {
134 table->char_bigram[ch1].bigram[new_big].cnt = 0;
136 table->char_bigram[ch1].max_char = ch2;
139 table->char_bigram[ch1].bigram[ch2].cnt = cnt;
140 table->char_bigram[ch1].total_cnt += cnt;
141 table->total_cnt += cnt;
145 table->worst_cost =
static_cast<int>(
147 for (
char_32 ch1 = 0; ch1 <= table->max_char; ch1++) {
148 for (
char_32 ch2 = 0; ch2 <= table->char_bigram[ch1].max_char; ch2++) {
149 int cnt = table->char_bigram[ch1].bigram[ch2].cnt;
150 table->char_bigram[ch1].bigram[ch2].cost =
152 log(
MAX(0.5, static_cast<double>(cnt)) /
156 return char_bigrams_obj;
static bool ReadFileToString(const string &file_name, string *str)
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)