[pypy-issue] [issue901] *** glibc detected *** pypy: corrupted double-linked list: 0x000000000bd76de0 ***

Alexander Milenko tracker at bugs.pypy.org
Mon Oct 10 20:47:16 CEST 2011


Alexander Milenko <alvein.flea at gmail.com> added the comment:

I do not know how to provide the data: there are more than 5 million records. Before
the error occurred, the script had been running for 8 hours on a multi-core machine with 8 GB of RAM.

________________________________________
PyPy bug tracker <tracker at bugs.pypy.org>
<https://bugs.pypy.org/issue901>
________________________________________
-------------- next part --------------
# -*- coding: utf8 -*-

import sys
import os
import time

path = os.path.normpath(os.path.join(os.getcwd(), '..'))
sys.path.append(path)

from django.db.models.query_utils import Q
from idea.vasya.models import Idei74Matrix, ProviderMatrix, ProviderMatrixAnalogs, Providers

# Maps a single character (digit, Cyrillic or Latin lower-case letter) to a
# one-bit integer mask. Several digits intentionally share a mask with another
# digit (e.g. u'4' and u'9'), so the masks are not unique per key.
# start_check() only iterates over the mask values; the keys are informational
# there.
BITS = {
    u'4': 1152921504606846976,
    u'5': 2305843009213693952,
    u'1': 144115188075855872,
    u'0': 72057594037927936,
    u'3': 576460752303423488,
    u'б': 268435456,
    u'а': 134217728,
    u'г': 1073741824,
    u'в': 536870912,
    u'е': 4294967296,
    u'д': 2147483648,
    u'з': 17179869184,
    u'ж': 8589934592,
    u'7': 288230376151711744,
    u'и': 34359738368,
    u'л': 137438953472,
    u'к': 68719476736,
    u'н': 549755813888,
    u'м': 274877906944,
    u'п': 2199023255552,
    u'о': 1099511627776,
    u'с': 8796093022208,
    u'р': 4398046511104,
    u'у': 35184372088832,
    u'т': 17592186044416,
    u'х': 140737488355328,
    u'ф': 70368744177664,
    u'ч': 562949953421312,
    u'ц': 281474976710656,
    u'щ': 2251799813685248,
    u'ш': 1125899906842624,
    u'6': 144115188075855872,
    u'э': 4503599627370496,
    u'я': 36028797018963968,
    u'ю': 9007199254740992,
    u'9': 1152921504606846976,
    u'8': 576460752303423488,
    u'2': 288230376151711744,
    u'a': 2,
    u'c': 8,
    u'b': 4,
    u'e': 32,
    u'd': 16,
    u'g': 128,
    u'f': 64,
    u'i': 512,
    u'h': 256,
    u'k': 2048,
    u'j': 1024,
    u'm': 8192,
    u'l': 4096,
    u'o': 32768,
    u'n': 16384,
    u'q': 131072,
    u'p': 65536,
    u's': 524288,
    u'r': 262144,
    u'u': 2097152,
    u't': 1048576,
    u'w': 8388608,
    u'v': 4194304,
    u'y': 33554432,
    u'x': 16777216,
    u'z': 67108864,
    }

from Queue import Queue
from threading import Thread

class Worker(Thread):
    """Daemon thread that executes tasks taken from a shared queue.

    Every queue entry is a ``(func, args, kwargs)`` tuple. The thread starts
    itself as soon as it is constructed and loops forever, so it only ends
    when the interpreter exits (it is a daemon thread).
    """

    def __init__(self, tasks):
        """Remember the task queue, mark the thread as daemon and start it."""
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def __del__(self):
        # Only the queue reference is dropped here. The daemon flag must not
        # be reassigned: the thread was started in __init__, and setting
        # Thread.daemon on a started thread raises RuntimeError.
        self.tasks = None

    def run(self):
        """Fetch and execute tasks forever, acknowledging each one."""
        while True:
            func, args, kwargs = self.tasks.get()
            try:
                func(*args, **kwargs)
            except Exception as e:
                # A failing task is reported and the worker keeps going.
                print("%s %s %s" % (e, args, kwargs))
            finally:
                # Always acknowledge the task, even if the task raised
                # something other than Exception or the report above failed;
                # otherwise Queue.join() would wait forever.
                self.tasks.task_done()


class ThreadPool:
    """Fixed number of Worker threads consuming tasks from one bounded queue."""

    def __init__(self, num_threads):
        """Create the task queue and spawn ``num_threads`` workers on it."""
        # The queue capacity equals the number of workers.
        queue = Queue(num_threads)
        self.tasks = queue
        for _worker_index in range(num_threads):
            Worker(queue)

    def __del__(self):
        # Release the reference to the queue when the pool is collected.
        self.tasks = None

    def add_task(self, func, *args, **kargs):
        """Enqueue ``func`` together with its positional and keyword arguments."""
        task = (func, args, kargs)
        self.tasks.put(task)

    def awaiting_completion(self):
        """Block until every enqueued task has been marked as done."""
        pending = self.tasks
        pending.join()


def start_check(matrix, provider_items):
    """Compare one matrix record with provider items and save close matches.

    Provider items already stored as a ``bad_variant`` analog of ``matrix``
    are excluded first. For every remaining item, each of its ``:::``-separated
    hashes is XOR-ed with each hash of ``matrix``; when fewer than three
    entries of ``BITS`` intersect the result, the Levenshtein distance of the
    corresponding phrases is computed, and a ``ProviderMatrixAnalogs`` row is
    saved to the ``master`` database if that distance is below 3. An exact
    match (distance 0) is flagged ``is_analog`` and ends the search for that
    item.

    A ``ValueError`` (e.g. a hash that is not an integer) is reported on
    stdout and aborts processing of the remaining items.

    NOTE(review): the code indexes ``matrix_frases[ii]`` and ``frases[i]``, so
    it assumes the ``hash`` and ``frase`` lists have matching lengths - confirm.

    :param matrix: record providing ``id``, ``hash`` and ``frase``.
    :param provider_items: non-empty queryset of provider records; its first
        element is used to determine the provider.
    """
    rejected = ProviderMatrixAnalogs.objects.filter(provider__in=provider_items, idei__id=matrix.id,
                                                    provider__provider=provider_items[0].provider.pk, bad_variant=True)
    rejected_ids = [pma.provider.id for pma in rejected]

    if rejected_ids:
        provider_items = provider_items.filter(~Q(id__in=rejected_ids))

    matrix_hashes = matrix.hash.split(":::") if matrix.hash else []
    matrix_frases = matrix.frase.split(":::") if matrix.frase else []

    # Bound up front so the error report below cannot fail with NameError when
    # ValueError is raised before the first item has been split.
    hashes = None
    frases = None

    try:
        for item in provider_items:
            hashes = item.hash.split(":::")
            frases = item.frase.split(":::")
            find_zero = False

            for i, item_hash in enumerate(hashes):
                if find_zero:
                    # An exact match was already stored for this item.
                    break
                if not item_hash:
                    continue

                for ii, mh in enumerate(matrix_hashes):
                    if not mh:
                        continue

                    diff = int(mh) ^ int(item_hash)

                    # Count BITS entries whose mask intersects the difference;
                    # counting stops as soon as the limit of two is exceeded.
                    hits = 0
                    for mask in BITS.values():
                        if diff & mask:
                            hits += 1
                            if hits > 2:
                                break
                    if hits > 2:
                        continue

                    d = distance(matrix_frases[ii], frases[i])
                    if d >= 3:
                        continue

                    pma = ProviderMatrixAnalogs()
                    pma.idei = matrix
                    pma.provider = item
                    pma.distance = d
                    pma.algorythm = 'alg%s' % i
                    if not d:
                        pma.is_analog = True
                    pma.save(using='master')

                    if not d:
                        find_zero = True
                        break

    except ValueError as e:
        print("error on matrix item id: %s" % matrix.id)
        print("e: %s" % e)
        print("matrix_hashes: %s" % matrix_hashes)
        print("matrix_frases: %s" % matrix_frases)
        print("hashes: %s" % hashes)
        print("frases: %s" % frases)
        print("-----")



def distance(a, b):
    """Calculate the Levenshtein distance between a and b.

    Only two rows of the dynamic-programming table are kept, each sized by
    the shorter input, so memory use is O(min(len(a), len(b))).

    :param a: first sequence (typically a string).
    :param b: second sequence.
    :return: minimum number of single-element insertions, deletions and
        substitutions that turn one sequence into the other.
    """
    if len(a) > len(b):
        # Keep the shorter sequence in ``a`` so rows are as short as possible.
        a, b = b, a
    n = len(a)

    # Row 0: distance from the empty prefix of ``b`` to every prefix of ``a``.
    previous_row = list(range(n + 1))
    for i, b_item in enumerate(b, 1):
        current_row = [i] + [0] * n
        for j, a_item in enumerate(a, 1):
            add = previous_row[j] + 1
            delete = current_row[j - 1] + 1
            change = previous_row[j - 1]
            if a_item != b_item:
                change += 1
            current_row[j] = min(add, delete, change)
        previous_row = current_row

    return previous_row[n]

# Driver: for every provider, queue one start_check task per candidate matrix
# record and wait for the batch to finish before moving to the next provider.
pool = ThreadPool(16)

# Shown matrix records that are not in group 221.
idea_items = Idei74Matrix.objects.filter(
    ~Q(group=221) & Q(show=True)
)
providers = Providers.objects.all()
for p in providers:
    # Confirmed analogs of this provider whose provider record is shown.
    analogs_items = ProviderMatrixAnalogs.objects.filter(
        Q(provider__provider__id=p.id) & Q(is_analog=True) & Q(provider__show=True)
    )
    bad_items = ProviderMatrixAnalogs.objects.filter(
        Q(provider__provider__id=p.id) & Q(bad_variant=True)
    )

    # Ids on both sides that already have a confirmed analog.
    idea_items_new = [obj.idei.id for obj in analogs_items]
    provider_items = [obj.provider.id for obj in analogs_items]

    # Replace the id lists with querysets that exclude those ids.
    idea_items_new = idea_items.filter(
        ~Q(id__in=idea_items_new) & Q(group__providers=p)
    )
    provider_items = ProviderMatrix.objects.filter(
        Q(provider__id=p.id) & ~Q(id__in=provider_items) & Q(show=True) & Q(hide=False)
    )

    if provider_items:
        for item in idea_items_new:
            pool.add_task(start_check, item, provider_items)

        pool.awaiting_completion()


More information about the pypy-issue mailing list