Regular Expression
patrick.waldo at gmail.com
patrick.waldo at gmail.com
Sat Oct 27 08:19:18 EDT 2007
Finally I solved the problem, with some really minor things to tweak.
I guess it's true that I had two problems working with regular
expressions.
Thank you all for your help. I really learned a lot on quite a
difficult problem.
Final Code:
#For text files in a directory...
#Analyzes a randomly organized UTF8 document with EINECS, CAS,
#Chemical, and Chemical Formula
#into a document structured as EINECS|CAS|Chemical|Chemical Formula.
import os
import codecs
import re
# Directory scanned for input text files, and the directory that receives the
# converted files.  Note that the output directory sits inside the input one.
path = "C:\\text_samples\\text\\"
path2 = "C:\\text_samples\\text\\output\\"

# A whole token shaped like an EINECS number: 3 digits, 3 digits, 1 digit.
EINECS = re.compile(r'^\d\d\d-\d\d\d-\d$')
# A whole token shaped like a CAS number: any run of digits, 2 digits, 1 digit.
CAS = re.compile(r'^\d*-\d\d-\d$')
# Heuristic for a chemical formula: an uppercase letter followed by three runs
# of alphanumerics, optionally separated by one '.' and then one '/'.
# The pattern must stay on a single line; a line break inside the raw string
# would leave the literal unterminated.
FORMULA = re.compile(r'([A-Z][A-Za-z0-9]+\.?[A-Za-z0-9]+/?[A-Za-z0-9]+)')
def _finish_record(product):
    """Collapse the chemical-name tokens of one collected record in place.

    Records with fewer than four tokens are returned untouched.  Otherwise
    the tokens after the first two are joined with single spaces; when the
    last token looks like a formula (``FORMULA`` matches at its start) it is
    kept as a separate trailing field, else it is folded into the name.
    """
    if len(product) < 4:
        return product
    if FORMULA.match(product[-1]):
        # [first, second, name words..., formula] -> [first, second, name, formula]
        product[2:-1] = [' '.join(product[2:-1])]
    else:
        # No formula at the end: every remaining token belongs to the name.
        product[2:] = [' '.join(product[2:])]
    return product


def iter_elements(tokens):
    """Group a flat token stream into one list of fields per record.

    A token matching ``EINECS`` starts a new record once the current one
    holds at least four tokens.  Each yielded record has its name tokens
    merged by ``_finish_record``; this includes the last record of the
    stream, which is flushed after the loop.  An empty token stream yields
    nothing.

    :param tokens: iterable of whitespace-separated string tokens.
    :returns: generator of lists of strings, one list per record.
    """
    product = []
    for tok in tokens:
        # Requiring four collected tokens keeps an EINECS-looking token that
        # appears early in a record from splitting it prematurely.
        if EINECS.match(tok) and len(product) >= 4:
            yield _finish_record(product)
            product = []
        product.append(tok)
    # Flush whatever is left so the final record gets the same treatment as
    # the others; skip the flush when there were no tokens at all.
    if product:
        yield _finish_record(product)
# Convert every regular file found directly in `path`, writing the result
# under the same file name in `path2`, one '|'-separated record per line.
for text in os.listdir(path):
    input_text = os.path.join(path, text)
    # `path2` lives inside `path`, so the listing also returns the output
    # directory itself; opening it as a file would fail, so skip non-files.
    if not os.path.isfile(input_text):
        continue
    output_text = os.path.join(path2, text)
    # The context managers close both files even if reading, decoding or
    # writing raises part-way through.
    with codecs.open(input_text, 'r', 'utf8') as source, \
            codecs.open(output_text, 'w', 'utf8') as target:
        tokens = source.read().split()
        for element in iter_elements(tokens):
            target.write('|'.join(element))
            target.write("\r\n")
More information about the Python-list
mailing list