*!* According to this algorithm, the position of each character of the input value is to be compared with the >>>>*!* position of the same characters found in the corresponding field from the file. It calculates differences >>>>*!* between positions of the matching characters. >>>> >>>>*!* I assign a matching rate based on the difference in the number of positions. If a certain character is >>>>*!* found in the same position of both the search string and the database field, I assign a matching rate of >>>>*!* 100. If the two are one position away from each other, the rate is 90. Here is the complete scale: >>>> >>>>*!* Absolute difference between character positions Matching rate >>>>*!* 0 100 >>>>*!* 1 90 >>>>*!* 2 80 >>>>*!* 3 70 >>>>*!* 4 60 >>>>*!* 5 50 >>>>*!* 6 40 >>>>*!* 7 30 >>>>*!* 8 20 >>>>*!* 9 10 >>>>*!* 10 or more 0 >>>>LPARAMETERS tcText1, tcText2 >>>> >>>>m.tcText1 = ALLTRIM(m.tcText1) >>>>m.tcText2 = ALLTRIM(m.tcText2) >>>> >>>>IF EMPTY(m.tcText1) OR EMPTY(m.tcText2) >>>> RETURN 0 >>>>ENDIF >>>> >>>>*!* force both terms to uppercase and put the shorter term in lcText1 >>>>m.lcText1 = UPPER(IIF(LEN(tcText1) < LEN(tcText2), m.tcText1, m.tcText2)) >>>>m.lcText2 = UPPER(IIF(LEN(tcText1) < LEN(tcText2), m.tcText2, m.tcText1)) >>>> >>>>LOCAL ARRAY laMatchRate[10] >>>> >>>>laMatchRate[1] = 100 >>>>laMatchRate[2] = 90 >>>>laMatchRate[3] = 80 >>>>laMatchRate[4] = 70 >>>>laMatchRate[5] = 60 >>>>laMatchRate[6] = 50 >>>>laMatchRate[7] = 40 >>>>laMatchRate[8] = 30 >>>>laMatchRate[9] = 20 >>>>laMatchRate[10] = 10 >>>> >>>>*!* rather than go thru character by character if the terms match exactly >>>>IF m.lcText1 == m.lcText2 >>>> RETURN 100 >>>>ENDIF >>>> >>>>m.lnrate = -1 >>>>FOR i = 1 TO LEN(m.lcText1) >>>> IF SUBSTR(m.lcText1, i, 1) = SUBSTR(m.lcText2, i, 1) >>>> m.lnRate = m.lnRate + laMatchRate[1] >>>> ELSE >>>> lnPositionDiff = ABS(AT(SUBSTR(m.lcText1, i, 1), m.lcText2) - i) + 1 >>>> IF BETWEEN(m.lnPositionDiff, 0, 9) >>>> m.lnRate = m.lnRate + laMatchrate[m.lnPositionDiff] >>>> ENDIF >>>> ENDIF >>>>ENDFOR >>>> >>>>*!* do checks from start of string and from end of string >>>>FOR i = LEN(m.lcText1) TO 1 STEP -1 >>>> IF SUBSTR(m.lcText1, i, 1) = SUBSTR(m.lcText2, i, 1) >>>> m.lnRate = m.lnRate + laMatchRate[1] >>>> ELSE >>>> lnPositionDiff = ABS(AT(SUBSTR(m.lcText1, i, 1), m.lcText2) - i) + 1 >>>> IF BETWEEN(m.lnPositionDiff, 0, 9) >>>> m.lnRate = m.lnRate + laMatchrate[m.lnPositionDiff] >>>> ENDIF >>>> ENDIF >>>>ENDFOR >>>> >>>>m.lnLengthDiff = LEN(m.lcText2) - LEN(m.lcText1) >>>> >>>>m.lnRank = ROUND((m.lnRate/(LEN(m.lcText1) * laMatchrate[1] * 2)) * 100, 2) - m.lnLengthDiff >>>>*!* STRTOFILE(CHR(13) + TRANSFORM(m.lnrank), "Duplicate.log",1) >>>>RETURN m.lnRank>>>>
*!* function CalcLevenDistance >>>>LPARAMETERS m.str1, m.str2 >>>>LOCAL m.i, m.j, m.diagonal, m.let, m.cost, lnLengthStr1, lnLengthStr2 >>>> >>>>m.lnLengthStr1 = LEN(m.str1) >>>>m.lnLengthStr2 = LEN(m.str2) >>>> >>>>DIMENSION m.arr(m.lnLengthStr1 + 1) >>>>DIMENSION m.letters(m.lnLengthStr1) >>>> >>>>IF m.lnLengthStr1 * m.lnLengthStr2 == 0 >>>> RETURN m.lnLengthStr1 + m.lnLengthStr2 >>>>ENDIF >>>> >>>>FOR m.i = 1 TO ALEN(m.arr) >>>> m.arr(m.i) = m.i >>>>ENDFOR >>>> >>>>FOR m.i = 1 TO m.lnLengthStr1 >>>> m.letters(m.i) = SUBSTR(m.str1, m.i, 1) >>>>ENDFOR >>>> >>>>FOR m.i = 1 TO m.lnLengthStr2 >>>> m.diagonal = m.arr(1) - 1 >>>> m.arr(1) = m.i + 1 >>>> m.let = SUBSTR(m.str2, m.i, 1) >>>> FOR m.j = 1 TO m.lnLengthStr1 >>>> m.cost = m.diagonal >>>> IF m.letters(m.j) != m.let >>>> m.cost = m.cost + 1 >>>> ENDIF >>>> IF m.cost > m.arr(m.j) >>>> m.cost = m.arr(m.j) >>>> ENDIF >>>> IF m.cost > m.arr(m.j + 1) >>>> m.cost = m.arr(m.j + 1) >>>> ENDIF >>>> m.diagonal = m.arr(m.j + 1) - 1 >>>> m.arr(m.j + 1) = m.cost + 1 >>>> ENDFOR >>>>ENDFOR >>>> >>>>RETURN 100 - (m.arr(ALEN(m.arr)) - 1) >>>>>>>>