Plateforme Level Extreme
Abonnement
Profil corporatif
Produits & Services
Support
Légal
English
Dotnetpro database performance contest
Message
De
13/05/2007 13:04:34
 
 
À
13/05/2007 04:55:18
Information générale
Forum:
Visual FoxPro
Catégorie:
VFP Compiler for .NET
Divers
Thread ID:
01224231
Message ID:
01225086
Vues:
16
Hi Markus & Samuel,

for your testing pleasure
#IF _TIMING_
  ACTIVATE SCREEN
#ENDIF
APPEND FROM _FOLDER_ + "Adressen.txt" DELIMITED
#IF _TIMING_
  ? SECONDS() - m.lnSec0, "Append from"
#ENDIF


#define CL_USEHASH	.t.
#if CL_USEHASH
	* SET LIBRARY TO md5.fll
	SET LIBRARY TO vfpencryption71.fll

	REPLACE ALL ;
	  ; && Vorname WITH LTRIM( Vorname ), ; && !needed
	  Name WITH LTRIM( Name ), ;
	  Strasse WITH LTRIM( Strasse ), ;
	  HausNr WITH LTRIM( HausNr ), ;
	  PLZ WITH LTRIM( PLZ ), ;
	  Ort WITH LTRIM( Ort ), ;
	  eMail WITH LTRIM( eMail ) ;
	  , ChkSum with Hash(Vorname + name + Strasse + HausNr + PLZ + Ort + email, 5)

	  * Ed Laefe Version
	  *ChkSum with md5Hash(Vorname + name + Strasse + HausNr ;
		+ PLZ + Ort + email)
		
		*-- Checksum too slow
		*  ChkSum with INT(VAL(SYS(2017,"ChkSum",0,1))-m.lnDiff)
#else
	REPLACE ALL ;
	  ; && Vorname WITH LTRIM( Vorname ), ; && !needed
	  Name WITH LTRIM( Name ), ;
	  Strasse WITH LTRIM( Strasse ), ;
	  HausNr WITH LTRIM( HausNr ), ;
	  PLZ WITH LTRIM( PLZ ), ;
	  Ort WITH LTRIM( Ort ), ;
	  eMail WITH LTRIM( eMail ) 
#endif

#IF _TIMING_
  ? SECONDS() - m.lnSec0, "Repl all"
#ENDIF

#if CL_USEHASH
	SET UNIQUE off
	INDEX on ChkSum TO _unique COMPACT
	local lcLastHash, laRecs[1], llWasDup
	lcLastHash = "1" && since length is different, it is initialized ok
	? SECONDS() - m.lnSec0, "Idx ChkSum"
#elif .f.
	*-- not totally clear, seems to be faster than unique in index
	SET UNIQUE on
	INDEX ON HausNr + PLZ + Ort + Vorname + Name + Strasse + eMail TO _unique COMPACT 
	#IF _TIMING_
	  ? SECONDS() - m.lnSec0, "Rearranged Idx Built"
	#ENDIF
#else
	INDEX ON Vorname + Name + Strasse + HausNr + PLZ + Ort + eMail TAG _unique UNIQUE
	#IF _TIMING_
	  ? SECONDS() - m.lnSec0, "CDX Built"
	#ENDIF
#endif

create table _FOLDER_ + "Table2" ;
	(name	C(61), Strasse C(34), Ort C(36), eMail C(30) )
flock("Table2")
select Table1

scan
	#if CL_USEHASH
		*-- first try using a hash index to filter out duplicate records
		*-- assumption: only a minuscle part is really duplicate
                *-- hash function is nearly collision free (2**32 Bins for 5*10**5 should work
                *-- even for mediocre hash function key distribution
		*-- therefore not always updating the "laststore" and only checking a boolean
		*-- but each first duplicate hashentry gets added with a skip-1/initialize/skip
		if !m.lcLastHash == ChkSum
			insert into Table2 values ;
				(	rtrim( Table1.Vorname ) +" "+ Table1.name;
				,	rtrim( Table1.Strasse ) +" "+ Table1.HausNr ;
				,	Table1.PLZ + Table1.Ort, Table1.eMail)
			lcLastHash = ChkSum
			IF m.llWasDup
				llWasDup = .f.
				dimension laRecs[1]
			endif
		ELSE
			IF not m.llWasDup
				llWasDup = .t.
				SKIP -1
				laRecs[1] = Vorname + name + Strasse + HausNr + PLZ + Ort + eMail
				SKIP
				IF !laRecs[1] == Vorname + name + Strasse + HausNr + PLZ + Ort + eMail
					insert into Table2 values ;
						(	rtrim( Table1.Vorname ) +" "+ Table1.name;
						,	rtrim( Table1.Strasse ) +" "+ Table1.HausNr ;
						,	Table1.PLZ + Table1.Ort, Table1.eMail)

					dimension laRecs[2]
					laRecs[2] = Vorname + name + Strasse + HausNr + PLZ + Ort + eMail
				* else
					* Show first false Hash collision
				endif
			ELSE
				if ascan(laRecs, Vorname + name + Strasse + HausNr + PLZ + Ort + eMail)=0
					insert into Table2 values ;
						(	rtrim( Table1.Vorname ) +" "+ Table1.name;
						,	rtrim( Table1.Strasse ) +" "+ Table1.HausNr ;
						,	Table1.PLZ + Table1.Ort, Table1.eMail)

					dimension laRecs[ALEN(laRecs)+1]
					laRecs[ALEN(laRecs)] = Vorname + name + Strasse + HausNr + PLZ + Ort + eMail
				* else
					* Show multiple false Hash collision
				endif
			endif
		endif
	#else
		insert into Table2 values ;
			(	rtrim( Table1.Vorname ) +" "+ Table1.name;
			,	rtrim( Table1.Strasse ) +" "+ Table1.HausNr ;
			,	Table1.PLZ + Table1.Ort, Table1.eMail)
	#endif
endscan
#if _TIMING_
	? seconds() - m.lnSec0, "Scan/Insert", RECCOUNT("Table2")
#endif
* DELETE TAG _Unique
use in Table1
#if _TIMING_
	? seconds() - m.lnSec0, "TableClose"
#endif

ERASE _FOLDER_ + "Adressen2.*"
lnFHandle = FCREATE( _FOLDER_ + "Adressen2.txt", 0 )
*ASSERT NOT m.lnFHandle == -1
SELECT Table2
SCAN
	#if .t.
                *-- no variables are fastest here - CPU faster than RAM even in P-Code!
		FPUTS( m.lnFHandle, ;
		    STUFF( RTRIM( Name ), AT( " ", Name ), 1, ", " ) +", "+ ;
		    STUFF( RTRIM( Strasse ), RAT( " ", RTRIM( Strasse ) ), 1, ", " ) +", "+ ;
		    STUFF( RTRIM( Ort ), AT( " ", Ort ), 1, ", " ) +", "+ ;
		    RTRIM( eMail ) )
 	#elif .t.
		lcStrasse = RTRIM( Strasse )
		FPUTS( m.lnFHandle, ;
		    STUFF( RTRIM( Name ), AT( " ", Name ), 1, ", " ) +", "+ ;
		    STUFF( m.lcStrasse, RAT( " ", m.lcStrasse ), 1, ", " ) +", "+ ;
		    STUFF( RTRIM( Ort ), AT( " ", Ort ), 1, ", " ) +", "+ ;
		    RTRIM( eMail ) )
	#else
		lcName = RTRIM( Name )
		lcStrasse = RTRIM( Strasse )
		
		FPUTS( m.lnFHandle, ;
		    STUFF( m.lcName, RAT( " ", m.lcName ), 1, ", " ) +", "+ ;
		    STUFF( m.lcStrasse, RAT( " ", m.lcStrasse ), 1, ", " ) +", "+ ;
		    STUFF( RTRIM( Ort ), AT( " ", Ort ), 1, ", " ) +", "+ ;
		    RTRIM( eMail ) )
	#endif
ENDSCAN
FCLOSE( m.lnFHandle )
and don't forget to add a
, ChkSum C(16)
to your definition of table1.

Now this gets a hefty speed up on my slower machine (can't work on the fast ones right now, they all must earn money<g>)
You need a hash fll from Ed Leafe
http://www.leafe.com/dls/vfp

or Craig Boyd:
http://www.sweetpotatosoftware.com/SPSBlog/PermaLink,guid,db662a8f-d47c-46c8-b0d2-a591c20d024b.aspx

but this was just as proof of concept, as MD5 should be considered way too heavy for this task. A few links to faster hash functions (which will also index faster as they return shorter hash values).
@Samuel: The first one is in german, but mostly code. If you have question on translation, feel free to ask.

http://www.fantasy-coders.de/projects/gh/html/x435.html
http://burtleburtle.net/bob/hash/evahash.html
http://www.azillionmonkeys.com/qed/hash.html

so fire up your testing harnass, I spent way too much time this weekend on this (but had FUN). I think either Sleepycat or Hsieh would make a nice first test, if the results on your machines are also undeniable.

regards

thomas
Précédent
Suivant
Répondre
Fil
Voir

Click here to load this message in the networking platform