Pseudocode for bootstrapping
Input: flat file of size MxN (M rows, N columns), containing “real” data. Assume that M is not too large.
Output: a user-specified number of files of size MxN which contain “fake” data based on the real input data.
High-level pseudocode:
If the input data is numeric, generate output data based on the mean and stdev of the input data; else generate output data based on the similarity measure.
Pseudocode draft 1:
For each column:
    If the input data is numeric,
        //generate output data based on the mean and stdev of the input data:
        calculate mean and stdev
        classify input data based on mean and stdev
        calculate probabilities of input data classification for each class
        generate fake data
    else
        //generate output data based on the similarity measure:
        find out which data values are present
        calculate probabilities of input data classification for each class
        generate fake data
Pseudocode draft 2:
Assume that the input data is called s[i][j], i = 1, …, M, j = 1, …, N.
Assume that the fake data are kept in file fakedata[i][j][filenumber], i = 1, …, M, j = 1, …, N,
filenumber = 1, …, Z, where Z is assigned by the user.
Assume numBounds = 4 (i.e. there are 4 bounds and thus 3 bins: min, mean-stdev, mean+stdev, max)
// Driver: generate Z fake data files of size MxN.
// The statistics of a column depend only on the real input s[][], not on the
// output file, so they are computed once per column and reused for all Z files.
for (j = 1; j <= N; j++) {                      //go through each column
    if (numeric(j)) {
        calculate_mean_and_stdev(j, mean, stdev)
        classify_numeric_data(j, bounds[])
        calculate_numeric_probabilities(j, mean, stdev, bounds[], prob[])
        for (file = 1; file <= Z; file++) {     //fill column j of every fake file
            generate_fake_numeric_data(j, bounds[], prob[], file, M, N, fakedata[][][file])
        }
    } else {
        find_values_and_calculate_probabilities(j, vals[], prob[], numPosVals)
        for (file = 1; file <= Z; file++) {     //fill column j of every fake file
            // pass the same vals[] array that was just filled in
            generate_fake_nonnumeric_data(j, vals[], prob[], numPosVals, file, M, N, fakedata[][][file])
        }
    }
}
//calculate mean and stdev for column j
// j: column number
// mean, stdev: outputs; the results for this column are stored in mean[j] and
//              stdev[j], which is how classify_numeric_data reads them
// Also records the smallest and largest value of the column in min[j] and max[j].
// Precondition: M >= 1 (there is at least one input row).
calculate_mean_and_stdev(j, mean, stdev) {
    sum = 0
    // Seed the running extremes with the first row instead of +/-2^30 sentinels,
    // so they are correct for any value range.
    min[j] = s[1][j]
    max[j] = s[1][j]
    for (i = 1; i <= M; i++) {                  //go through each row
        sum = sum + s[i][j]
        if (s[i][j] > max[j]) max[j] = s[i][j]
        if (s[i][j] < min[j]) min[j] = s[i][j]
    }
    mean[j] = sum / M                           // M = number of rows (samples), not N

    sumSq = 0
    for (i = 1; i <= M; i++) {                  //go through each row
        d = s[i][j] - mean[j]
        sumSq = sumSq + d * d
    }
    // Population standard deviation; divide by (M - 1) instead if the sample
    // standard deviation is wanted.
    stdev[j] = sqrt(sumSq / M)
}
// classify input data based on mean and standard deviation such that:
// the first class is from min[j] to mean[j] - stdev[j];
// the second class is from mean[j] - stdev[j] to mean[j] + stdev[j];
// the third class is from mean[j] + stdev[j] to max[j];
// j: column number
// bounds: output array of 4 bin boundaries in ascending order
classify_numeric_data(j, bounds) {
    lower = mean[j] - stdev[j]
    upper = mean[j] + stdev[j]
    // For skewed columns mean -/+ stdev can fall outside [min, max]; clamp so the
    // boundaries stay ascending and no bin has negative width.
    if (lower < min[j]) lower = min[j]
    if (upper > max[j]) upper = max[j]

    bounds[0] = min[j]
    bounds[1] = lower
    bounds[2] = upper
    bounds[3] = max[j]
}
//calculate probabilities of input data classification for each class
// j: column number
// mean, stdev: not used by this routine; kept so existing calls stay valid
// bounds[]: the numBounds bin boundaries, ascending (see classify_numeric_data)
// prob[]: output; prob[k] = fraction of the M input rows that fall into bin k
// Precondition: M >= 1.
calculate_numeric_probabilities (j, mean, stdev, bounds[], prob[]) {
    numBins = numBounds - 1                     // 4 bounds -> 3 bins
    int bin[numBins]
    for (k = 0; k < numBins; k++) {
        bin[k] = 0
    }

    for (i = 1; i <= M; i++) {                  //go through each row
        // k is a separate index so the row counter i is not overwritten
        for (k = 0; k < numBins; k++) {
            // Bins are half-open [bounds[k], bounds[k+1]) so a value sitting on a
            // boundary is counted once; the last bin is closed to include max.
            isLastBin = (k == numBins - 1)
            if (s[i][j] >= bounds[k] && (s[i][j] < bounds[k+1] || isLastBin)) {
                bin[k]++
                break                           // each row belongs to exactly one bin
            }
        }
    }

    for (k = 0; k < numBins; k++) {
        prob[k] = (float) bin[k] / M            // real division, not integer division
    }
}
//j: column number
// bounds[]: the array with bounds between the bins (see classify_numeric_data
//for definitions)
// prob[]: probability of each bin; prob[k] belongs to bin [bounds[k], bounds[k+1]]
//file: filenumber
//M: number of input samples
//N: number of columns
//fakedata: file with fake data
// NOTE(review): rand() is assumed to return a uniform real number in [0, 1] -- confirm.
generate_fake_numeric_data(j, bounds[], prob[], file, M, N, fakedata[][][file]) {
    for (i = 1; i <= M; i++) {
        r = rand()
        //take care of the borderline cases; they must not fall through to the bin search
        if (r == 0) {
            fakedata[i][j][file] = bounds[0]                //i.e. min[j]
        } else if (r == 1) {
            fakedata[i][j][file] = bounds[numBounds-1]      //i.e. max[j]
        } else {
            //find which bin r selects and draw a value inside that bin
            fakedata[i][j][file] = findValueSlot(r, 0, prob[0], prob[], bounds[])
        }
    }
}

// Finds the bin whose cumulative-probability interval (beg, end] contains r and
// returns a value drawn uniformly between that bin's two bounds.
// r: random number that selects the bin
// beg, end: cumulative-probability interval of bin 0, i.e. 0 and prob[0]
// prob[]: probability of each bin
// bound[]: the array with bounds between the bins
findValueSlot(r, beg, end, prob[], bound[]) {
    for (k = 0; k <= numBounds - 2; k++) {      // bins are numbered 0 .. numBounds-2
        if (beg < r && r <= end) {
            //generate a value in between bound[k] and bound[k+1];
            //a second random number is used so that r is not overwritten
            u = rand()
            return (bound[k+1] - bound[k]) * u + bound[k]
        }
        beg = end
        if (k + 1 <= numBounds - 2) {           // do not read past the last bin
            end = end + prob[k+1]
        }
    }
    // Reached only when rounding leaves the cumulative sum slightly below r.
    return bound[numBounds-1]
}
//Assume that the input file has a small number of possible values for this feature.
//j: column number
// vals[]: output; the array with possible values; vals[i] is the ith value.
//         Kept in ascending order so that the binary Search can be used on it.
// prob[]: output; probability of each possible value; prob[i] = probability of ith value
//numPosVals: output; size of vals[]
// Precondition: M >= 1.
find_values_and_calculate_probabilities(j, vals[], prob[], numPosVals) {
    vals[] = NIL
    k = 0                                       // number of distinct values found so far
    for (i = 1; i <= M; i++) {
        // only the first k entries of vals[] are filled in at this point
        z = Search(s[i][j], vals[], 0, k-1)
        if (z == NIL) {
            //value not in vals[]: insert it at its sorted position, moving the
            //larger values (and their counts) one slot to the right
            p = k
            while (p > 0 && vals[p-1] > s[i][j]) {
                vals[p] = vals[p-1]
                prob[p] = prob[p-1]
                p--
            }
            vals[p] = s[i][j]
            prob[p] = 1                         // first occurrence of this value
            k++
        } else {
            prob[z]++                           // one more occurrence of a known value
        }
    }
    //turn the occurrence counts into probabilities (real division)
    for (i = 0; i <= k-1; i++) {
        prob[i] = prob[i] / M
    }
    numPosVals = k
}
//j: column number
// values[]: the array with possible values; values[i] is the ith value
//prob[]: array with probability of each value; prob[i] = probability of ith value
//numPosVals: size of values
//file: filenumber
//M: number of input samples
//N: number of columns
//fakedata: file with fake data
// NOTE(review): rand() is assumed to return a uniform real number in [0, 1] -- confirm.
generate_fake_nonnumeric_data(j, values[], prob[], numPosVals, file, M, N, fakedata[][][file]) {
    for (i = 1; i <= M; i++) {
        r = rand()
        // Default to the last value: this covers r == 1 and the case where
        // rounding leaves the cumulative probability slightly below r.
        fakedata[i][j][file] = values[numPosVals-1]

        // Walk the cumulative intervals [beg, end), starting with the first value;
        // r == 0 is picked up by the first interval.
        beg = 0
        end = prob[0]
        for (k = 0; k <= numPosVals-1; k++) {
            if (beg <= r && r < end) {
                fakedata[i][j][file] = values[k]
                break
            }
            beg = end
            if (k + 1 <= numPosVals-1) {        // do not read past the last value
                end = end + prob[k+1]
            }
        }
    }
}
// search function that looks for value v in array A[] (binary search)
// A[]: must be sorted in ascending order between first and last
// first: the index of the element from which to begin searching
// last: index of the element from which to stop
// returns NIL if the value is not found, or the index if found
// An empty range (first > last) returns NIL without reading A[].
Search(v, A[], first, last) {
    while (first <= last) {
        mid = floor((first+last)/2)
        if (v == A[mid]) {
            return mid
        } else if (v > A[mid]) {
            first = mid+1                       // v can only be in the upper half
        } else {
            last = mid-1                        // v can only be in the lower half
        }
    }
    return NIL                                  // range exhausted: v is not present
}