[Tutor] finding duplicates within a tuple of tuples
Peter Otten
__peter__ at web.de
Fri Jul 30 10:33:01 CEST 2010
Norman Khine wrote:
> Here is the latest version http://pastie.org/1066582 can this be
> further improved?
> # get all the duplicates and clean the products table
> main.execute("SELECT product_Id, url FROM %s.product WHERE url != ''" %
db)
> results = main.fetchall()
>
> d = defaultdict(set)
> for id, url in results:
> d[url].add(id)
>
> for ids in d.itervalues():
> if len(ids) > 1:
> # we now hove the first product_id added
> canonical = min(ids)
> ids = list(ids)
> ids.pop(ids.index(canonical))
> for id in ids:
> update_product_id = 'UPDATE oneproduct.productList_product_assoc SET
productList_id=%s WHERE productList_id=%s'
> main.execute(update_product_id, (id, canonical))
> org.commit()
> main.close()
Yes; do it in SQL. Here's my attempt:
# Disclaimer: I stopped at the first point where it seemed to work; don't
# apply the following on valuable data without extensive prior tests.
import sqlite3
WIDTH = 80
db = sqlite3.connect(":memory:")
url_table = [
(24715,"http://aqoon.local/muesli/2-muesli-tropical-500g.html"),
(24719,"http://aqoon.local/muesli/2-muesli-tropical-500g.html"),
(24720,"http://example.com/index.html")
]
cursor = db.cursor()
cursor.execute("create table alpha (id, url)")
cursor.executemany("insert into alpha values (?, ?)", url_table)
c2 = db.cursor()
id_table = [
(1, 24715),
(2, 24719),
(3, 24720)
]
cursor.execute("create table beta (id, alpha_id)")
cursor.executemany("insert into beta values (?, ?)", id_table)
def show(name):
print name.center(WIDTH, "-")
for row in cursor.execute("select * from %s" % name):
print row
print
print " BEFORE ".center(WIDTH, "=")
show("alpha")
show("beta")
cursor.execute("""
create view gamma as
select min(a.id) new_id, b.id old_id from alpha a, alpha b
where a.url = b.url group by a.url
""")
cursor.execute("""
update beta
set alpha_id = (select new_id from gamma where alpha_id = old_id)
where (select new_id from gamma where alpha_id = old_id) is not Null
""")
cursor.execute("""
delete from alpha
where id not in (select min(b.id)
from alpha b where alpha.url = b.url)
""")
print " AFTER ".center(WIDTH, "=")
show("alpha")
show("beta")
A database expert could probably simplify that a bit.
Again: duplicate records are best not created rather than removed. If you
can create a unique index for the url column and alter your insertion code
appropriately.
Peter
More information about the Tutor
mailing list