1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
| void applySkipping(int n){
//load the ngrams
String [] tab = load(ngramsfile);
List<int[]> ngramlist = new ArrayList<int[]>(); //list of ngrams
List<Double> ngramcounts = new ArrayList<Double>(); //list of corresponding values
for(int i = 0; i < tab.length; i++){
String [] tab2 = tab[i].split(SEP);
int [] ngram = new int[n];
for(int j = 0; j < n; j++) ngram[j] = Integer.parseInt(tab2[j]);
ngramlist.add(ngram);
ngramcounts.add(Double.parseDouble(tab2[n]));
}
for(int i = 0; i < corpus.length - n; i++){
for(int j = i + n; j < Math.min(i + n + 10, corpus.length); j++){
//search w_i w_i+1 w_j in the list of ngrams
int [] ngram1 = new int[n];
for(int k = 0; k < n - 1; k++) ngram1[k] = corpus[i + k];
ngram1[n - 1] = corpus[j];
int d = j - (i + n - 1);
//binary search
int lb = 0, ub = ngramlist.size() - 1, ind = -1;
boolean found = false;
while (ub - lb > 10 && !found){
int r = (int) (Math.random() * (ub - lb) + lb);
int [] ngram2 = ngramlist.get(r);
if (pos(ngram1, ngram2) == -1) ub = r;
else if (pos(ngram1, ngram2) == 1) lb = r;
else{
found = true;
ind = r;
}
}
if (!found){
for(int k = lb; k <= ub && ind == -1; k++){
int [] ngram2 = ngramlist.get(k);
if (pos(ngram1, ngram2) == -1) ind = k;
else if (pos(ngram1, ngram2) == 0){
found = true;
ind = k;
}
}
}
if (found){
//add the 2^-d value
double value = ngramcounts.get(ind);
value += Math.pow(2, -d);
ngramcounts.set(ind, value);
}
else if (ind == -1){
//add the new ngram at the end of the list
ngramlist.add(ngram1);
ngramcounts.add(Math.pow(2, -d));
}
else{
//insert the new ngram at position ind
ngramlist.add(ind, ngram1);
ngramcounts.add(ind, Math.pow(2, -d));
}
}
}
} |