#include #include #include // Structure with callback function and data for matches typedef struct strmatch_s { void (*proc)(int, char *, struct strmatch_s *); int num_matches; } strmatch_t; // This function gets called on every match void proc_match(int i, char * s, strmatch_t * match) { printf("%.*s\n", 4, s); match->num_matches++; } // Return the maximum number of matched characters in two strings inline int max_match(char * a, char * b, int n) { int i = 0; while (a[i] == b[i] && i < n) i++; return i; } // Basic naive match: O(nm) time, no additional space void strmatch_naive(char * p, char * t, strmatch_t * match) { int n, m; n = strlen(p); m = strlen(t); // For every character in the string where we can match a pattern int i; for (i=0; iproc(i, (char *)(t + i), match); } } // Basic naive way to calculate z-boxes void zbox_naive(char * p, int n, int * z) { // For every character in the pattern except the first int k; for (k=1; k < n; k++) z[k] = max_match(p, p+k, n); } typedef struct { int l; int r; } zbox_t; // Linear-time algorithm to calculate z-boxes inline int zbox_linear_k(char * p, int n, int * z, int k, zbox_t * zb) { int zk; // If we are calculating a z-value outside any known z-box if (k > zb->r) { // Do a naive match and adjust the r and l bounds zk = max_match(p, p+k, n); if (zk > 0) { zb->r = k + zk - 1; zb->l = k; } } else // Position k is contained within a z-box { // Calculate k's corresponding position in the prefix (kp) int kp = k - zb->l; // And the distance from k to the end of the z-box (b) int b = zb->r - k + 1; if (z[kp] < b) zk = z[kp]; else { int q = max_match(p+b, p+zb->r+1, n); zk = b+q; zb->r = k+zk - 1; zb->l = k; } } return zk; } // Iterate through all k to calculate all z[k] to n (for the linear version) void zbox_linear(char * p, int n, int * z) { zbox_t zb = {.l=0, .r=0}; int k; for (k=1; kproc(i - (n+1), (char *)(t + i - (n+1)), match); } } free(z); } // Calculates sp' for a pattern via the z-boxes void spp_zbox(int * z, int * spp, int n) { int i; for (i=0; i 1; j--) { i = j + z[j] - 1; spp[i] = z[j]; } } // Calculates sp' for a pattern via the classic method void spp_classic(int * spp, char * p, int n) { int * sp = calloc(n, sizeof(int)); sp[0] = 0; int k, v; char x; for (k=0; k < n-1; k++) { x = p[k+1]; v = sp[k]; while (p[v] != x && v != 0) v = sp[v]; if(p[v] == x) sp[k+1] = v + 1; else sp[k+1] = 0; } spp[0] = 0; int i; for (i=1; i= 0; k--) { if (k == 0) fp[k] = 1; else fp[k] = fp[k-1]; } // Run the algorithm int pi, ti; pi = ti = 0; while (ti + (n-pi) <= m) { while (p[pi] == t[ti] && pi < n) { pi++; ti++; } // We have a match! if (pi == n) { int midx = ti - n; match->proc(midx, (char *)(t + midx), match); } if (pi == 0) ti++; // Perform the correct skip pi = fp[pi]; } free(fp); } // Reverse a string in place void strrev(char * s) { int i, n; char tmp; n = strlen(s); for (i=0; i0; i--) { if (lpp[i+1] > z[i]) lpp[i] = lpp[i+1]; else lpp[i] = z[i]; } // Calculate the r values for the extended bad character rule int ridx = 0; for (i=0; i= 0 && p[i] == t[h]) { i--; h--; } // We have a match! if (i < 0) { int midx = h+1; match->proc(midx, (char *)(t + midx), match); k += n - lpp[1]; } else // Perform the maximal pattern shift { // First comparison is a mismatch if (h == k) maxs = 1; else { gss = bcs = 0; j = i + 1; // Obtain the strong good suffix shift (gss) if (lp[j] > 0) gss = n - lp[j]; else gss = n - lpp[j]; // Also find the bad character shift (bcs) ri = rs[(int)t[h]]; while (ri && ri->pos >= i) ri = ri->next; if (!ri) bcs = i + 1; else bcs = i - ri->pos; // Choose the maximal shift if (gss > bcs) maxs = gss; else maxs = bcs; } // Shift. k += maxs; } } free(pr); free(z); free(zr); free(lp); free(lpp); } int main(int argc, char ** argv) { // Initialize the strmatch structure strmatch_t match = {.num_matches = 0, .proc = proc_match}; // Two sample strings char * p = "abab"; //char * p = "abcc"; char * t = "abababcababcccdxabaxabab"; // Test naive match strmatch_naive(p, t, &match); printf("Testing the naive match: %u\n", match.num_matches); // Test zboxes for a variety of patterns (linear vs. naive as well) int n = strlen(p); int * z = calloc(n, sizeof(int)); zbox_naive(p, n, z); printz(z, n, p); bzero(z, n*sizeof(int)); zbox_linear(p, n, z); printz(z, n, p); char * p2 = "abcd"; memset(z, '\0', n*sizeof(int)); zbox_naive(p2, n, z); printz(z, n, p2); memset(z, '\0', n*sizeof(int)); zbox_linear(p2, n, z); printz(z, n, p2); char * p3 = "aaaa"; memset(z, '\0', n*sizeof(int)); zbox_naive(p3, n, z); printz(z, n, p3); memset(z, '\0', n*sizeof(int)); zbox_linear(p3, n, z); printz(z, n, p3); free(z); // Testing linear match match.num_matches = 0; strmatch_linear(p, t, &match); printf("Testing the linear match: %u\n", match.num_matches); // Testing Knuth-Morris-Pratt match match.num_matches = 0; strmatch_kmp(p, t, &match); printf("Testing the KMP match: %u\n", match.num_matches); // Testing Boyer-Moore match match.num_matches = 0; strmatch_bm(p, t, &match); printf("Testing the Boyer-Moore match: %u\n", match.num_matches); return 0; }