[Tutor] Finding lines in .txt file that contain keywords from two different set()
Alan Gauld
alan.gauld at btinternet.com
Sun Sep 8 19:35:13 EDT 2019
On 08/09/2019 17:05, A S wrote:
> This is the code I have thus far...:
The formatting is all messed up; you need to post in plain text to
ensure the indentation is preserved.
I've tried to disentangle it but may have got it wrong...
> import os, sys
> from os.path import join
> import re
> import xlrd from xlrd
> import open_workbook
> import openpyxl from openpyxl.reader.excel
> import load_workbook
> import xlsxwriter
>
> #All the paths
> dict_folder = 'C:/Users/xxxx/Documents/xxxx/Test Excel'
> text_folder = 'C:/Users/xxxx/Documents/xxxx/Text'
>
> words = set()
> fieldset = set()
> for file in os.listdir(dict_folder):
> if file.endswith(".xlsx"):
> wb1 = load_workbook(join(dict_folder, file), data_only = True)
> ws = wb1.active
> #Here I am reading and printing all the data source names
> #set(words) in the excel dictionaries:
> cellvalues = ws["A1"].value
> wordsextract = re.findall(r"\((.+?)\)", str(cellvalues))
> results = wordsextract[0]
> words.add(results)
> print(results)
>
> for rowofcellobj in ws["C" : "D"]:
> for cellobj in rowofcellobj:
> #2. Here I am printing all the field names in col C & D in
> # the excel dictionaries:
> data = re.findall(r"\w+_.*?\w+", str(cellobj.value))
> if data != []:
> fields = data[0]
> fieldset.add(fields)
> print(fieldset)
> #listing = str.remove("")
> #print(listing)
>
> #Here I am reading the name of each .txt file to the separate .xlsx
> file:
> for r, name in enumerate(os.listdir(text_folder)):
> if name.endswith(".txt"):
> print(name)
> #Reading .txt file and trying to make the sentence
> # into words instead of lines so that I can compare
> # the individual .txt file words with the .xlsx file
> txtfilespath = os.chdir("C:/Users/xxxx/Documents/xxxx/Text")
>
> #Here I am reading and printing all the words in
> # the .txt files and compare with the excel Cell A1:
> for name in os.listdir(txtfilespath):
> if name.endswith(".txt"):
> with open (name, "r") as texts:
> # Read each line of the file:
> s = texts.read()
> print(s)
>
>
> #if .txt files contain.....() or select or
> # from or words from sets..search that sentence
> # and extract the common fields
>
> result1 = []
> parens = 0
> buff = ""
> for line in s:
> if line == "(":
> parens += 1
> if parens > 0:
> buff += line
> if line == ")":
> parens -= 1
> if not parens and buff:
> result1.append(buff)
> buff = ""
> set(result1)
That last line does nothing useful. It creates a set but does not assign
it to anything, so it is immediately discarded.
> #Here, I include other keywords other than those found in the Excel workbooks
> checkhere = set()
> checkhere.add("Select")
> checkhere.add("From")
> checkhere.add("select")
> checkhere.add("from")
> checkhere.add("SELECT")
> checkhere.add("FROM")
> # k = list(checkhere)
> # print(k)
I assume all of that should be at the outer level since you wouldn't
want to do it inside a loop?
But why use add()? You can just declare the set using the literal notation:
checkhere = {"Select", "From",...."SELECT","FROM"}
> #I only want to read/ extract the lines containing brackets () as
> well as the keywords in the checkhere set. So that I can check capture
> the source and field in each line:
> #I tried this but nothing was printed......
> for element in checkhere:
> if element in result1:
> print(result1)
>
> *My desired output for the code that could not be printed when I tried is:*
>
> (/* 1.select_no., biiiiiyyyy FROM apple_x_Ex_x */
> proc sql; "TRUuuuth")
> (/* 1.xxxxx FROM xxxxx*/
> proc sql; "TRUuuuth")
> (SELECT abc AS abc1, ab33_2_ AS mon, a_rr, iirir_vf, jk_ff, sfa_jfkj
> FROM &orange..xxx_xxx_xxE
> where (asre(kkk_ix as format 'xxxx-xx') gff &bcbcb_hhaha.) and
> (axx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.)
> )
>
> (/* 1.select_no. FROM apple_x_Ex_x */
> proc sql; "TRUuuuth")
>
> (SELECT abc AS kfcccc, mcfg_2_ AS dokn, b_rr, jjhj_vf, jjjk_hj, fjjh_jhjkj
> FROM &bfbd..pear_xxx_xxE
> where (afdfe(kkffk_ix as format 'xxxxd-xx') gdaff &bcdadabcb_hdahaha.) and
> (axx(xx_ix as format 'xxxx-xx') lec &jgjsdfdf_vnv.)
> )
--
Alan G
Author of the Learn to Program web site
http://www.alan-g.me.uk/
http://www.amazon.com/author/alan_gauld
Follow my photo-blog on Flickr at:
http://www.flickr.com/photos/alangauldphotos
More information about the Tutor
mailing list