[Tutor] finding duplicates within a tuple of tuples

Peter Otten __peter__ at web.de
Fri Jul 30 10:33:01 CEST 2010


Norman Khine wrote:

> Here is the latest version http://pastie.org/1066582 can this be
> further improved?

> # get all the duplicates and clean the products table
> main.execute("SELECT product_Id, url FROM %s.product WHERE url != ''" % 
db)
> results = main.fetchall()
> 
> d = defaultdict(set)
> for id, url in results:
> 	d[url].add(id)
> 
> for ids in d.itervalues():
> 	if len(ids) > 1:
> 		# we now hove the first product_id added
> 		canonical = min(ids)
> 		ids = list(ids)
> 		ids.pop(ids.index(canonical))
> 		for id in ids:
> 			update_product_id = 'UPDATE oneproduct.productList_product_assoc SET 
productList_id=%s WHERE productList_id=%s'
> 			main.execute(update_product_id, (id, canonical))
> 			org.commit()
> main.close()

Yes; do it in SQL. Here's my attempt:

# Disclaimer: I stopped at the first point where it seemed to work; don't 
# apply the following on valuable data without extensive prior tests.
 
import sqlite3 
WIDTH = 80
db = sqlite3.connect(":memory:")

url_table = [
(24715,"http://aqoon.local/muesli/2-muesli-tropical-500g.html"),
(24719,"http://aqoon.local/muesli/2-muesli-tropical-500g.html"),
(24720,"http://example.com/index.html")
]
cursor = db.cursor()
cursor.execute("create table alpha (id, url)")
cursor.executemany("insert into alpha values (?, ?)", url_table)

c2 = db.cursor()

id_table = [
(1, 24715),
(2, 24719),
(3, 24720)
]

cursor.execute("create table beta (id, alpha_id)")
cursor.executemany("insert into beta values (?, ?)", id_table)

def show(name):
    print name.center(WIDTH, "-")
    for row in cursor.execute("select * from %s" % name):
        print row
    print

print " BEFORE ".center(WIDTH, "=")
show("alpha")
show("beta")

cursor.execute("""
create view gamma as 
select min(a.id) new_id, b.id old_id from alpha a, alpha b 
where a.url = b.url group by a.url
""")
cursor.execute("""
update beta 
set alpha_id = (select new_id from gamma where alpha_id = old_id) 
where (select new_id from gamma where alpha_id = old_id) is not Null
""")

cursor.execute("""
delete from alpha 
where id not in (select min(b.id) 
from alpha b where alpha.url = b.url)
""")

print " AFTER ".center(WIDTH, "=")
show("alpha")
show("beta")

A database expert could probably simplify that a bit.

Again: duplicate records are best not created rather than removed. If you 
can create a unique index for the url column and alter your insertion code 
appropriately.

Peter



More information about the Tutor mailing list