joining files

Tuomas Vesterinen tuomas.vesterinen at iki.fi
Sun May 16 14:01:17 EDT 2010


On 05/16/2010 05:04 PM, Dave Angel wrote:
> (You forgot to include the python-list in your response.  So it only
> went to me. Normally, you just do reply-all to the message)
>
> mannu jha wrote:
>> On Sun, 16 May 2010 13:52:31 +0530 wrote
>>> mannu jha wrote:
>>
>>> Hi,
>>
>>
>>> I have few files like this:
>>> file1:
>>> 22 110.1 33 331.5 22.7 5 271.9 17.2 33.4
>>> 4 55.1
>>
>>> file1 has a total of 4 columns, but some of them are missing in a few rows.
>>
>>> file2:
>>> 5 H
>>> 22 0
>>
>>> file3:
>>> 4 T
>>> 5 B
>>> 22 C
>>> 121 S
>>
>>
>>> In all these files the first column is the key used for matching their
>>> entries. So what I want in the output is only those entries whose key
>>> appears in all three files.
>>
>>> output required:
>>
>>> 5 271.9 17.2 33.4 5 H 5 T
>>> 22 110.1 22 0 22 C
>>
>> I am trying with this :
>>
>> from collections import defaultdict
>>
>> def merge(sources):
>> blanks = [blank for items, blank, keyfunc in sources]
>> d = defaultdict(lambda: blanks[:])
>> for index, (items, blank, keyfunc) in enumerate(sources):
>> for item in items:
>> d[keyfunc(item)][index] = item
>> for key in sorted(d):
>> yield d[key]
>>
>> if __name__ == "__main__":
>> a = open("input1.txt")
>> c = open("input2.txt")
>>
>> def key(line):
>> return line[:2]
>> def source(stream, blank="", key=key):
>> return (line.strip() for line in stream), blank, key
>> for m in merge([source(x) for x in [a,c]]):
>> print "|".join(c.ljust(10) for c in m)
>>
>> but with input1.txt:
>> 187 7.79 122.27 54.37 4.26 179.75
>> 194 8.00 121.23 54.79 4.12 180.06
>> 15 8.45 119.04 55.02 4.08 178.89
>> 176 7.78 118.68 54.57 4.20 181.06
>> 180 7.50 119.21 53.93 179.80
>> 190 7.58 120.44 54.62 4.25 180.02
>> 152 8.39 120.63 55.10 4.15 179.10
>> 154 7.79 119.62 54.47 4.22 180.46
>> 175 8.42 120.50 55.31 4.04 180.33
>> and input2.txt:
>> 15 H 37 H 95 T
>> 124 H 130 H 152 H 154 H 158 H 164 H
>> 175 H 176 H 180 H
>> 187 H 190 T
>> 194 C
>> 196 H 207 H 210 H 232 H
>> it is giving output as:
>> |
>> |124 H
>> |130 H
>> 154 7.79 119.62 54.47 4.22 180.46|158 H
>> |164 H
>> 175 8.42 120.50 55.31 4.04 180.33|176 H
>> 180 7.50 119.21 53.93 179.80|187 H
>> 190 7.58 120.44 54.62 4.25 180.02|196 H
>> |207 H
>> |210 H
>> |232 H
>> |37 H
>> |95 T
>> so it is not matching them properly. Can anyone please suggest where I am
>> making a mistake?
>>
>>
>>
> I'm about to travel all day, so my response will be quite brief.
>
> Not sure what you mean by the blank and key values that source() takes,
> since they're just passed on to its return value.
>
> I don't see any place where you compare the items from the various
> files, so you aren't checking if an item is in multiple files.
>
> DaveA
>
import os

def merge_sources(sources):
    """Join line-oriented records from several sources on their first field.

    Args:
        sources: iterable of (source_name, source_data) tuples, where
            source_data is a string holding one whitespace-separated
            record per line.

    Returns:
        A dict mapping every key (the first field of a line) that occurs
        in *all* sources to {source_name: fields}, where fields is the
        list of whitespace-separated fields of that source's line for the
        key.  An empty `sources` yields an empty dict.
    """
    data = []
    for name, text in sources:
        lines = {}
        # splitlines() accepts "\n", "\r\n" and a trailing newline alike;
        # splitting on os.linesep only recognises the host platform's
        # terminator and leaves an empty last element after a final newline.
        for line in text.splitlines():
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: there is no key to index.
                continue
            # A key repeated within one source keeps its last line.
            lines[fields[0]] = (name, fields)
        data.append(lines)

    if not data:
        return {}

    # Keys present in every per-source dict (iterating a dict yields keys).
    common_keys = set(data[0]).intersection(*data[1:])

    result = {}
    for key in common_keys:
        # Each d[key] is a (source_name, fields) pair, so dict() turns the
        # pairs into {source_name: fields}.
        result[key] = dict(d[key] for d in data)
    return result

if __name__ == "__main__":
    # The test files are replaced by local strings, built from the sample
    # data quoted earlier in this message (one record per line).
    file1 = os.linesep.join([
        "22 110.1 33 331.5 22.7 5 271.9 17.2 33.4",
        "4 55.1",
    ])
    file2 = os.linesep.join([
        "5 H",
        "22 0",
    ])
    file3 = os.linesep.join([
        "4 T",
        "5 B",
        "22 C",
        "121 S",
    ])
    input1 = os.linesep.join([
        "187 7.79 122.27 54.37 4.26 179.75",
        "194 8.00 121.23 54.79 4.12 180.06",
        "15 8.45 119.04 55.02 4.08 178.89",
        "176 7.78 118.68 54.57 4.20 181.06",
        "180 7.50 119.21 53.93 179.80",
        "190 7.58 120.44 54.62 4.25 180.02",
        "152 8.39 120.63 55.10 4.15 179.10",
        "154 7.79 119.62 54.47 4.22 180.46",
        "175 8.42 120.50 55.31 4.04 180.33",
    ])
    input2 = os.linesep.join([
        "15 H 37 H 95 T",
        "124 H 130 H 152 H 154 H 158 H 164 H",
        "175 H 176 H 180 H",
        "187 H 190 T",
        "194 C",
        "196 H 207 H 210 H 232 H",
    ])

    # print(x) with a single argument behaves the same on Python 2 and 3.
    print(merge_sources([("file1", file1), ("file2", file2),
                         ("file3", file3)]))
    print(merge_sources([("input1", input1), ("input2", input2)]))

# Reference copy of the results for the two merge_sources() calls in the
# __main__ block: first the file1/file2/file3 join, then the input1/input2
# join.  It appears to be hand-wrapped printed output kept for the reader;
# nothing in this snippet reads the string.
Test_results = '''
{'22': {'file3': ['22', 'C'],
         'file2': ['22', '0'],
         'file1': ['22', '110.1', '33', '331.5', '22.7', '5', '271.9',
                   '17.2', '33.4']}}

{'194': {'input2': ['194', 'C'],
          'input1': ['194', '8.00', '121.23', '54.79', '4.12',
                     '180.06']},
  '175': {'input2': ['175', 'H', '176', 'H', '180', 'H'],
          'input1': ['175', '8.42', '120.50', '55.31', '4.04',
                     '180.33']},
   '15': {'input2': ['15', 'H', '37', 'H', '95', 'T'],
          'input1': ['15', '8.45', '119.04', '55.02', '4.08',
                     '178.89']},
  '187': {'input2': ['187', 'H', '190', 'T'],
          'input1': ['187', '7.79', '122.27', '54.37', '4.26',
                     '179.75']}}
'''




More information about the Python-list mailing list