[Tutor] Finding lines in .txt file that contain keywords from two different set()

Alan Gauld alan.gauld at btinternet.com
Sun Sep 8 19:35:13 EDT 2019


On 08/09/2019 17:05, A S wrote:
> This is the code I have thus far...:

The formatting is all messed up; you need to post in plain text to
ensure the indentation is preserved.


I've tried to disentangle it but may have got it wrong...

> import os, sys
> from os.path import join
> import re
> import xlrd from xlrd
> import open_workbook
> import openpyxl from openpyxl.reader.excel 
> import load_workbook
> import xlsxwriter
>
> #All the paths
> dict_folder = 'C:/Users/xxxx/Documents/xxxx/Test Excel'
> text_folder = 'C:/Users/xxxx/Documents/xxxx/Text'
>
> words = set()
> fieldset = set()
> for file in os.listdir(dict_folder):
>   if file.endswith(".xlsx"):
>     wb1 = load_workbook(join(dict_folder, file), data_only = True)
>     ws = wb1.active
>     #Here I am reading and printing all the data source names
>     #set(words) in the excel dictionaries:
>     cellvalues = ws["A1"].value
>     wordsextract = re.findall(r"\((.+?)\)", str(cellvalues))
>     results = wordsextract[0]
>     words.add(results)
>     print(results)
>
>     for rowofcellobj in ws["C" : "D"]:
>         for cellobj in rowofcellobj:
>            #2. Here I am printing all the field names in col C & D in
>            # the excel dictionaries:
>            data = re.findall(r"\w+_.*?\w+", str(cellobj.value))
>            if data != []:
>                 fields = data[0]
>                 fieldset.add(fields)
>                 print(fieldset)
>                 #listing = str.remove("")
>                 #print(listing)
>
> #Here I am reading the name of each .txt file to the separate .xlsx
> file:

> for r, name in enumerate(os.listdir(text_folder)):
>     if name.endswith(".txt"):
>         print(name)
>         #Reading .txt file and trying to make the sentence 
>         # into words instead of lines so that I can compare 
>         # the individual .txt file words with the .xlsx file
>         txtfilespath = os.chdir("C:/Users/xxxx/Documents/xxxx/Text")
>
> #Here I am reading and printing all the words in 
> # the .txt files and compare with the excel Cell A1:

> for name in os.listdir(txtfilespath):
>     if name.endswith(".txt"):
>         with open (name, "r") as texts:
>             # Read each line of the file:
>             s = texts.read()
>             print(s)
>
>
>             #if .txt files contain.....() or select or 
>             # from or words from sets..search that sentence 
>             # and extract the common fields
>
>             result1 = []
>             parens = 0
>             buff = ""
>             for line in s:
>                 if line == "(":
>                     parens += 1
>                 if parens > 0:
>                     buff += line
>                 if line == ")":
>                     parens -= 1
>                if not parens and buff:
>                     result1.append(buff)
>                     buff = ""
>                     set(result1)

That last line does nothing useful. It creates a set but does not assign
it to anything, so it is immediately discarded again.



> #Here, I include other keywords other than those found in the Excel workbooks
>    checkhere = set()
>    checkhere.add("Select")
>    checkhere.add("From")
>    checkhere.add("select")
>    checkhere.add("from")
>    checkhere.add("SELECT")
>    checkhere.add("FROM")
>    # k = list(checkhere)
>    # print(k)

I assume all of that should be at the outer level since you wouldn't
want to do it inside a loop?


But why use add()? You can just declare the set using the literal notation:


checkhere = {"Select", "From", "select", "from", "SELECT", "FROM"}


>    #I only want to read/ extract the lines containing brackets () as
> well as the keywords in the checkhere set. So that I can check capture
> the source and field in each line:
>    #I tried this but nothing was printed......
>    for element in checkhere:
>        if element in result1:
>         print(result1)
>
> *My desired output for the code that could not be printed when I tried is:*
>
> (/* 1.select_no., biiiiiyyyy FROM apple_x_Ex_x */
>  proc sql; "TRUuuuth")
> (/* 1.xxxxx FROM xxxxx*/
> proc sql; "TRUuuuth")
> (SELECT abc AS abc1, ab33_2_ AS mon, a_rr, iirir_vf, jk_ff, sfa_jfkj
>     FROM &orange..xxx_xxx_xxE
>  where (asre(kkk_ix as format 'xxxx-xx') gff &bcbcb_hhaha.) and
>   (axx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.)
>  )
>
>  (/* 1.select_no. FROM apple_x_Ex_x */
>  proc sql; "TRUuuuth")
>
>  (SELECT abc AS kfcccc, mcfg_2_ AS dokn, b_rr, jjhj_vf, jjjk_hj, fjjh_jhjkj
>     FROM &bfbd..pear_xxx_xxE
>  where (afdfe(kkffk_ix as format 'xxxxd-xx') gdaff &bcdadabcb_hdahaha.) and
>   (axx(xx_ix as format 'xxxx-xx') lec &jgjsdfdf_vnv.)
>  )

-- 
Alan G
Author of the Learn to Program web site
http://www.alan-g.me.uk/
http://www.amazon.com/author/alan_gauld
Follow my photo-blog on Flickr at:
http://www.flickr.com/photos/alangauldphotos



More information about the Tutor mailing list