*-- init to unequal to save cycles in loop >lcCompMe = iif(testdup.CompMe=="!", "?", "!") >>
Table TestDups has 10000 records Method 1: 0.313 sec. Method 2: 6.312 sec. Method 3: 6.265 sec. Method 4: 0.172 sec. __________________________________________________ Table TestDups has 24310 records Method 1: 0.916 sec. Method 2: 14.438 sec. Method 3: 14.344 sec. Method 4: 0.313 sec. __________________________________________________ Table TestDups has 34982 records Method 1: 1.078 sec. Method 2: 19.438 sec. Method 3: 19.031 sec. Method 4: 0.437 sec. __________________________________________________ Table TestDups has 45000 records Method 1: 1.172 sec. Method 2: 24.035 sec. Method 3: 24.200 sec. Method 4: 0.556 sec. __________________________________________________ Table TestDups has 55000 records Method 1: 0.680 sec. Method 2: 28.800 sec. Method 3: 28.242 sec. Method 4: 0.702 sec. __________________________________________________Let's see where do I have an error:
******************************************************************** * Description.......: TestDups - this program tests 3 different methods * to eliminate duplicates from a table * Calling Samples...: * Parameter List....: * Created by........: Nadya Nosonovsky 10/07/01 07:17:40 PM * Modified by.......: 08/03/07 ******************************************************************** CLEAR #DEFINE CRLF2 CHR(13) + CHR(10) + CHR(13) + CHR(10) CREATE TABLE TestDups (KeyFld I, NameFld C(60), DateFld D) INDEX ON KeyFld TAG KeyFld SET ORDER TO LOCAL I, KeyFld, NameFld, DateFld, lnLower, lnUpper, lnSeconds, lnKeyID, lnRecs lnLower = 1 lnUpper = 5000 FOR lnI = 1 TO 5 FOR I=1 TO 10000 * m.lnI DateFld = DATE()-MOD(I,7) NameFld="Test"+TRANSFORM(I) KeyFld = INT((m.lnUpper - m.lnLower + 1) * RAND( ) + m.lnLower) INSERT INTO TestDups FROM MEMVAR NEXT =STRTOFILE('Table TestDups has ' + TRANSFORM(RECCOUNT()) + ' records' + CRLF2, 'SpeedTest.txt',1) * Note that our tests operate with the exclusively used table (cursor) *!* - The table cannot have deleted records before this operation *!* - It requires to delete all records in the table that could be slow on the big table *!* - It requires creating a new index and on the big table it could take a long time *!* - It requires to recall almost all records in the table that could *!* be time consuming on the big table also. CLOSE ALL lnSeconds = SECONDS() SET DELETED OFF USE TestDups EXCLUSIVE DELETE ALL INDEX ON KeyFld TAG UniqueKey UNIQUE RECALL ALL SET ORDER TO =STRTOFILE('Method 1: ' + ; PADL(SECONDS()-m.lnSeconds, 20) + ' sec.' + CRLF2, 'SpeedTest.txt',1) * ?"Method 1:",SECONDS()-m.lnSeconds DELETE TAG UniqueKey && We no longer need this tag SET DELETED ON *!* COUNT TO lnRecs *!* =STRTOFILE('After de-duping table TestDups now has ' + TRANSFORM(m.lnRecs) + ' records' + CRLF2, 'SpeedTest.txt',1) SELECT TestDups RECALL ALL && We recalled all records, so returned to state 1 ******** Method two CLOSE ALL USE TestDups SHARED lnSeconds = SECONDS() SELECT KeyFld, COUNT(*) AS CntDups FROM TestDups GROUP BY 1 ; HAVING CntDups > 1 ; INTO CURSOR curDupes SELECT TestDups SET ORDER TO KeyFld SELECT curDupes SCAN && curDupes lnKeyID = KeyFld SELECT TestDups SEEK m.lnKeyID SKIP && leave the first record DELETE WHILE KeyFld = m.lnKeyID ENDSCAN =STRTOFILE('Method 2: ' + ; PADL(SECONDS()-m.lnSeconds, 20) + ' sec.' + CRLF2, 'SpeedTest.txt',1) USE IN curDupes *!* COUNT TO lnRecs *!* =STRTOFILE('After de-duping table TestDups now has ' + TRANSFORM(m.lnRecs) + ' records' + CRLF2, 'SpeedTest.txt',1) SELECT TestDups RECALL ALL && We recalled all records, so returned to state 1 *Method three CLOSE ALL USE TestDups SHARED lnSeconds = SECONDS() SET ORDER TO KeyFld lnKey = -1000 && non-existant value SCAN IF KeyFld = m.lnKey DELETE WHILE KeyFld = m.lnKey ENDIF lnKey = KeyFld ENDSCAN =STRTOFILE('Method 3: ' + ; PADL(SECONDS()-m.lnSeconds, 20) + ' sec.' + CRLF2, 'SpeedTest.txt',1) SELECT TestDups *!* COUNT TO lnRecs *!* =STRTOFILE('After de-duping table TestDups now has ' + TRANSFORM(m.lnRecs) + ' records' + CRLF2, 'SpeedTest.txt',1) RECALL ALL && We recalled all records, so returned to state 1 * ?"Method 3:",SECONDS()-m.lnSeconds *********Method four CLOSE ALL USE TestDups SHARED lnSeconds = SECONDS() SET ENGINEBEHAVIOR 70 SELECT * FROM TestDups GROUP BY KeyFld INTO CURSOR curNoDupes nofilter SET ENGINEBEHAVIOR 80 USE IN TestDups USE TestDups EXCLUSIVE IN 0 SELECT TestDups ZAP APPEND FROM (DBF('curNoDupes')) * COUNT TO lnRecs * At this point we should have only unique records - no duplicates =STRTOFILE('Method 4: ' + ; PADL(SECONDS()-m.lnSeconds, 20) + ' sec.' + CRLF2 + ; REPLICATE('_',50) + CRLF2 , 'SpeedTest.txt',1) * ?"Method 4:",SECONDS()-m.lnSeconds NEXTActually, after re-reading the code there is nothing wrong with the numbers, in fact they tell us how many unique records we had. But may be I should start with the clean state anyway, e.g. zap after the last method again.