/* Paul Tero, 2004 These are my string matching functions. They basically convert a string into an integer which can then be XORed with another integer to see how closely they match. This conversion can be done whenever the string is updated and stored in a different database column, so that the string matching will be very quick (quicker than LEVENSHTEIN at least). The core function is XORC, which does the conversion. It divides the string into syllables and then finds the least common letter in each syllable and stores its value in an integer. The 32 bit integer contains the values of 6 such letters. */ /* To include this in MySQL do something like (where the include directory contains MySQL headers): gcc -I/home/local/mysql-3.23.44-pc-linux-gnu-i686/include/ -shared -o /usr/lib/xorc.so xorc.c mysql> DROP FUNCTION xorc; CREATE FUNCTION xorc RETURNS INT SONAME "xorc.so"; mysql> SELECT xorc('Air Jamaica'); gcc -I/home/local/mysql-3.23.44-pc-linux-gnu-i686/include/ -shared -o /usr/lib/xorm.so xorc.c mysql> DROP FUNCTION xorm; CREATE FUNCTION xorm RETURNS INT SONAME "xorm.so"; mysql> SELECT xorm('Air Jamaica', 'Air France'); mysql> SELECT xorm('Air Jamaica', 89); gcc -I/home/local/mysql-3.23.44-pc-linux-gnu-i686/include/ -shared -o /usr/lib/xorp.so xorc.c mysql> DROP FUNCTION xorp; CREATE FUNCTION xorp RETURNS INT SONAME "xorp.so"; mysql> SELECT xorp('Air Jamaica', 'Air France'); mysql> SELECT xorp('Air Jamaica', 89); For command line testing, uncomment the main function below and compile as an executable: gcc -I/home/local/mysql-3.23.44-pc-linux-gnu-i686/include/ xorc.c ./a.out */ /*****************************************************/ /*Function prototypes and libraries needed to compile*/ /*****************************************************/ #include #include #include #include #include #include #include #include //The following lines are for testing it from the command line //#include //int main(int argc, char **argv) {printf("Answer: %d\n", xorc_internal ("Air 
Jamaica", 12));} //Export the xorc, xorm and xorp functions long long xorm (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error); long long xorp (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error); long long xorc (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error); long long xorm (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) {return xorm_internal (args);} my_bool xorm_init (UDF_INIT *initid, UDF_ARGS *args, char *message) { if (args->arg_count != 2) {strcpy (message,"XORM() AND XORP() require two string or integer arguments"); return 1;} if (args->arg_type[0] != STRING_RESULT) args->arg_type[0] = INT_RESULT; //we accept strings or integers, but not null if (args->arg_type[1] != STRING_RESULT) args->arg_type[1] = INT_RESULT; return 0; } int xorm_internal (UDF_ARGS *args) { //if we are passed in valid arguements, check for a STRING or INT, or full null arguments return 0 int first = args->args[0] ? (args->arg_type[0] == STRING_RESULT ? xorc_internal (args->args[0], args->lengths[0]) : *((long long*) args->args[0])) : 0; int second = args->args[1] ? (args->arg_type[1] == STRING_RESULT ? 
xorc_internal (args->args[1], args->lengths[1]) : *((long long*) args->args[1])) : 0; return first ^ second; } long long xorp (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) { long long result = xorm_internal (args); if ((result >> 26) > 0) return 0; //0 of 31 bits match if ((result >> 21) > 0) return 16; //at least 5 of 31 bits match if ((result >> 16) > 0) return 32; //10 of 31 if ((result >> 11) > 0) return 48; //15 of 31 if ((result >> 6) > 0) return 65; //20 of 31 if ((result >> 1) > 0) return 81; //25 of 31 if (result > 0) return 97; //30 of 31 return 100; //31 of 31 } my_bool xorp_init (UDF_INIT *initid, UDF_ARGS *args, char *message) {xorm_init (initid, args, message);} long long xorc (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) { if (!args->args[0]) return 0; //we don't accept null arguments (even with the type coercion in xorc_init, we still get nulls here in joins) return (long long) xorc_internal (args->args[0], args->lengths[0]); } my_bool xorc_init (UDF_INIT *initid, UDF_ARGS *args, char *message) { if (args->arg_count != 1) {strcpy (message,"XORC() requires one string argument"); return 1;} args->arg_type[0] = STRING_RESULT; //we only accept strings (this forces the argument to be a string) return 0; } int xorc_internal (char *phrase, int phraselen) { int isvowel = 0; //whether the current character is a vowel int numvowels = 0; //number of vowels found in a row int maxvalue = 0; //the current maximum value consonant found int shift = 26; //the first one is shifted this far to the right int result = 0; //the result int i; //this will be used to loop though each character int c; //the numberical value of the current characater int frequencies[31] = {2, 19, 11, 10, 0, 14, 16, 8, 4, 23, 21, 9, 13, 5, 3, 15, 24, 7, 6, 1, 12, 20, 17, 22, 18, 25, 26, 27, 28, 29, 30}; //frequency of each letter (a is the 2nd most frequent, b the 19th, etc, the last 5 are for 0/1, 2/3, etc) for (i=0; i= 97 && c <= 122) c-=32; //make it into a 
lower case character if (c >= 48 && c <= 57) c = c/2 + 67; //numbers come after all the letters isvowel = (c==65 || c==69 || c==73 || c==79 || c==85); //is it a vowel //printf ("%dth character is %d, vowel: %d\n", i, c, isvowel); if (c < 33 || (numvowels && !isvowel)) { //end of a word //printf ("end of syllable, maxvalue is %d, numvowels is %d\n", maxvalue, numvowels); result |= maxvalue << shift; //add to the result shift -= 5; //shift a bit less the next time numvowels = maxvalue = 0; //reset variables if (shift < 0) break; //leave the loop when we've got enough data } if (c >= 65 && c <= 95) { //the letter is a valid one (just skip other ones) if (isvowel) numvowels++; //increment the number of vowels in a row maxvalue = max (maxvalue, frequencies[c-65]); //the max value of this syllable //printf ("max value is now %d\n", maxvalue); } } result |= (shift < 0) ? 1 : (maxvalue << shift); //set the last bit to 1 if we've run out of bits, or add the final character return result; //return the result }