*-----------------------------------------------------------------------
* Address import / de-duplication / export script (Visual FoxPro).
*
* Steps performed by this section:
*   1. APPEND the delimited text file Adressen.txt into the current table.
*   2. LTRIM the character fields and (when CL_USEHASH is .t.) store a
*      hash of the concatenated fields in ChkSum.
*   3. Build an index that places candidate duplicates next to each other.
*   4. SCAN Table1 and INSERT every distinct record into Table2, merging
*      fields (Vorname+Name, Strasse+HausNr, PLZ+Ort).
*   5. Write Table2 to Adressen2.txt, one comma separated line per record.
*
* Not defined in this section (must be provided by the surrounding code):
*   _FOLDER_  - compile-time constant used as path prefix
*   _TIMING_  - compile-time switch that enables the "? SECONDS()" output
*   m.lnSec0  - start time used by the timing output
*   Table1    - NOTE(review): presumably the table that is current when
*               APPEND FROM runs; confirm against the preceding code.
*-----------------------------------------------------------------------

#IF _TIMING_
  ACTIVATE SCREEN
#ENDIF
APPEND FROM _FOLDER_ + "Adressen.txt" DELIMITED
#IF _TIMING_
  ? SECONDS() - m.lnSec0, "Append from"
#ENDIF


*-- .t. = detect duplicates through a hash column (ChkSum),
*-- .f. = detect duplicates through an index over all fields.
#define CL_USEHASH .t.
#if CL_USEHASH
  * SET LIBRARY TO md5.fll
  SET LIBRARY TO vfpencryption71.fll

  *-- Vorname is deliberately not trimmed (original note: "!needed").
  *-- NOTE(review): the meaning of hash type 5 is defined by the FLL;
  *-- confirm against the vfpencryption documentation.
  REPLACE ALL ;
    Name WITH LTRIM( Name ), ;
    Strasse WITH LTRIM( Strasse ), ;
    HausNr WITH LTRIM( HausNr ), ;
    PLZ WITH LTRIM( PLZ ), ;
    Ort WITH LTRIM( Ort ), ;
    eMail WITH LTRIM( eMail ) ;
    , ChkSum with Hash(Vorname + name + Strasse + HausNr + PLZ + Ort + email, 5)

  * Ed Laefe Version
  *ChkSum with md5Hash(Vorname + name + Strasse + HausNr + PLZ + Ort + email)

  *-- Checksum too slow
  * ChkSum with INT(VAL(SYS(2017,"ChkSum",0,1))-m.lnDiff)
#else
  *-- Vorname is deliberately not trimmed (original note: "!needed").
  REPLACE ALL ;
    Name WITH LTRIM( Name ), ;
    Strasse WITH LTRIM( Strasse ), ;
    HausNr WITH LTRIM( HausNr ), ;
    PLZ WITH LTRIM( PLZ ), ;
    Ort WITH LTRIM( Ort ), ;
    eMail WITH LTRIM( eMail )
#endif

#IF _TIMING_
  ? SECONDS() - m.lnSec0, "Repl all"
#ENDIF

#if CL_USEHASH
  *-- Non-unique index on the hash: records with equal hashes become
  *-- neighbours, the SCAN below decides which of them are real duplicates.
  SET UNIQUE off
  INDEX on ChkSum TO _unique COMPACT
  local lcLastHash, laRecs[1], llWasDup
  lcLastHash = "1"   && since length is different, it is initialized ok
  *-- Timing output is guarded like every other timing line in this script.
  #IF _TIMING_
    ? SECONDS() - m.lnSec0, "Idx ChkSum"
  #ENDIF
#elif .f.
  *-- Disabled variant, kept for comparison.
  *-- not totally clear, seems to be faster than unique in index
  SET UNIQUE on
  INDEX ON HausNr + PLZ + Ort + Vorname + Name + Strasse + eMail TO _unique COMPACT
  #IF _TIMING_
    ? SECONDS() - m.lnSec0, "Rearranged Idx Built"
  #ENDIF
#else
  INDEX ON Vorname + Name + Strasse + HausNr + PLZ + Ort + eMail TAG _unique UNIQUE
  #IF _TIMING_
    ? SECONDS() - m.lnSec0, "CDX Built"
  #ENDIF
#endif

create table _FOLDER_ + "Table2" ;
  (name C(61), Strasse C(34), Ort C(36), eMail C(30) )
flock("Table2")
select Table1

scan
  #if CL_USEHASH
    *-- first try using a hash index to filter out duplicate records
    *-- assumption: only a minuscule part is really duplicate
    *-- hash function is nearly collision free (2**32 bins for 5*10**5
    *-- records should work even for mediocre hash key distribution)
    *-- therefore not always updating the "last store" and only checking
    *-- a boolean, but each first duplicate hash entry gets added with a
    *-- skip -1 / initialize / skip
    *--
    *-- State kept across iterations:
    *--   lcLastHash - hash of the current run of equal-hash records
    *--   llWasDup   - .t. once a second record with that hash was seen
    *--   laRecs     - full keys already written for the current hash
    if !m.lcLastHash == ChkSum
      *-- New hash value: the record cannot be a duplicate.
      insert into Table2 values ;
        ( rtrim( Table1.Vorname ) +" "+ Table1.name ;
        , rtrim( Table1.Strasse ) +" "+ Table1.HausNr ;
        , Table1.PLZ + Table1.Ort, Table1.eMail)
      lcLastHash = ChkSum
      IF m.llWasDup
        *-- Leaving a run of equal hashes: reset the bookkeeping.
        llWasDup = .f.
        dimension laRecs[1]
      endif
    ELSE
      IF not m.llWasDup
        *-- Second record with this hash: fetch the full key of the
        *-- previous record (in index order) and compare field by field.
        llWasDup = .t.
        SKIP -1
        laRecs[1] = Vorname + name + Strasse + HausNr + PLZ + Ort + eMail
        SKIP
        IF !laRecs[1] == Vorname + name + Strasse + HausNr + PLZ + Ort + eMail
          *-- Same hash, different content: keep the record.
          insert into Table2 values ;
            ( rtrim( Table1.Vorname ) +" "+ Table1.name ;
            , rtrim( Table1.Strasse ) +" "+ Table1.HausNr ;
            , Table1.PLZ + Table1.Ort, Table1.eMail)

          dimension laRecs[2]
          laRecs[2] = Vorname + name + Strasse + HausNr + PLZ + Ort + eMail
        * else
        *   Show first false Hash collision
        endif
      ELSE
        *-- Third or later record with this hash: keep it only if its
        *-- full key is not among the keys already written.
        if ascan(laRecs, Vorname + name + Strasse + HausNr + PLZ + Ort + eMail)=0
          insert into Table2 values ;
            ( rtrim( Table1.Vorname ) +" "+ Table1.name ;
            , rtrim( Table1.Strasse ) +" "+ Table1.HausNr ;
            , Table1.PLZ + Table1.Ort, Table1.eMail)

          dimension laRecs[ALEN(laRecs)+1]
          laRecs[ALEN(laRecs)] = Vorname + name + Strasse + HausNr + PLZ + Ort + eMail
        * else
        *   Show multiple false Hash collision
        endif
      endif
    endif
  #else
    *-- Index based variants: every record visited by the SCAN is copied.
    insert into Table2 values ;
      ( rtrim( Table1.Vorname ) +" "+ Table1.name ;
      , rtrim( Table1.Strasse ) +" "+ Table1.HausNr ;
      , Table1.PLZ + Table1.Ort, Table1.eMail)
  #endif
endscan
#if _TIMING_
  ? seconds() - m.lnSec0, "Scan/Insert", RECCOUNT("Table2")
#endif
* DELETE TAG _Unique
use in Table1
#if _TIMING_
  ? seconds() - m.lnSec0, "TableClose"
#endif

*-- Export: remove earlier output files, then write one line per record.
ERASE _FOLDER_ + "Adressen2.*"
lnFHandle = FCREATE( _FOLDER_ + "Adressen2.txt", 0 )
SELECT Table2
IF m.lnFHandle < 0
  *-- FCREATE() signals failure with a negative handle; writing to or
  *-- closing such a handle is pointless, so report the problem instead.
  ERROR "Cannot create " + _FOLDER_ + "Adressen2.txt"
ELSE
  SCAN
    *-- Each STUFF() replaces one blank by ", " to split a merged column
    *-- again: first blank in Name and Ort, last blank in Strasse.
    *-- Only the first preprocessor variant below is compiled; the other
    *-- two are kept as alternatives that were compared for speed.
    #if .t.
      *-- no variables are fastest here - CPU faster than RAM even in P-Code!
      FPUTS( m.lnFHandle, ;
        STUFF( RTRIM( Name ), AT( " ", Name ), 1, ", " ) +", "+ ;
        STUFF( RTRIM( Strasse ), RAT( " ", RTRIM( Strasse ) ), 1, ", " ) +", "+ ;
        STUFF( RTRIM( Ort ), AT( " ", Ort ), 1, ", " ) +", "+ ;
        RTRIM( eMail ) )
    #elif .t.
      lcStrasse = RTRIM( Strasse )
      FPUTS( m.lnFHandle, ;
        STUFF( RTRIM( Name ), AT( " ", Name ), 1, ", " ) +", "+ ;
        STUFF( m.lcStrasse, RAT( " ", m.lcStrasse ), 1, ", " ) +", "+ ;
        STUFF( RTRIM( Ort ), AT( " ", Ort ), 1, ", " ) +", "+ ;
        RTRIM( eMail ) )
    #else
      lcName = RTRIM( Name )
      lcStrasse = RTRIM( Strasse )

      FPUTS( m.lnFHandle, ;
        STUFF( m.lcName, RAT( " ", m.lcName ), 1, ", " ) +", "+ ;
        STUFF( m.lcStrasse, RAT( " ", m.lcStrasse ), 1, ", " ) +", "+ ;
        STUFF( RTRIM( Ort ), AT( " ", Ort ), 1, ", " ) +", "+ ;
        RTRIM( eMail ) )
    #endif
  ENDSCAN
  FCLOSE( m.lnFHandle )
ENDIF