Strange use of Lambda arrow

edmondo.giovannozzi at gmail.com edmondo.giovannozzi at gmail.com
Sat Jun 6 06:52:51 EDT 2020


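Have a look at the documentation on type hints (the "->" in a function
signature is a return-type annotation, not a lambda):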

https://docs.python.org/3/library/typing.html


Il giorno venerdì 5 giugno 2020 18:35:10 UTC+2, Agnese Camellini ha scritto:
> Hello to everyone, lately I have been building up an open source project
> with some collaborators, but one of them cannot contribute any more. He is
> a solution architect so he is very skilled (much more than me!). I am now
> analysing his code to finish the job but I don't get this use of the lambda
> arrow; it's like he is declaring the returned type in the function
> signature (as you would do in Java). I have never seen something like this
> in Python.
> 
> Can someone please explain this usage to me (the part the question is
> about, the "-> DocumentData" in the signature, was highlighted in yellow):
> 
>     @classmethod
>     def extract_document_data(cls, file_path : str) -> DocumentData:
>         """
>         Entry point of the module, it extracts the data from the document
>         whose path is passed as input.
>         The extraction strategy is automatically chosen based on the MIME
> type
>         of the file.
> 
>         @type file_path: str
>         @param file_path: The path of the document to be parsed.
>         @rtype: DocumentData
>         @returns: An object containing the data of the parsed document.
>         """
> 
>         mime = magic.Magic(mime=True)
>         mime_type = mime.from_file(file_path)
>         document_type = DocumentType.get_instance(mime_type)
>         strategy = cls.strategies[document_type]
>         return strategy.extract_document_data(file_path)
> 
> 
> For completeness, this is the whole script:
> 
> from enum import Enum
> import json
> import magic
> 
> import docx
> from pdfminer.converter import PDFPageAggregator
> from pdfminer.layout import LAParams, LTContainer, LTTextContainer
> from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
> from pdfminer.pdfinterp import PDFPageInterpreter
> from pdfminer.pdfinterp import PDFResourceManager
> from pdfminer.pdfpage import PDFPage
> from pdfminer.pdfparser import PDFParser
> 
> 
> class DocumentType(Enum):
>     """
>     Defines the handled document types.
>     Each value is associated to a MIME type.
>     """
> 
>     def __init__(self, mime_type):
>         self.mime_type = mime_type
> 
>     @classmethod
>     def get_instance(cls, mime_type : str):
>         values = [e for e in cls]
>         for value in values:
>             if value.mime_type == mime_type:
>                 return value
>         raise MimeNotValidError(mime_type)
> 
>     PDF = 'application/pdf'
>     DOCX =
> 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
> 
> 
> class MimeNotValidError(Exception):
>     """
>     Exception to be raised when a not valid MIME type is processed.
>     """
> 
>     pass
> 
> 
> class DocumentData:
>     """
>     Wrapper for the extracted document data (TOC and contents).
>     """
> 
>     def __init__(self, toc : list = [], pages : list = [], document_text :
> str = None):
>         self.toc = toc
>         self.pages = pages
>         if document_text is not None:
>             self.document_text = document_text
>         else:
>             self.document_text = ' '.join([page.replace('\n', ' ') for page
> in pages])
> 
>     def toc_as_json(self) -> str:
>         return json.dumps(self.toc)
> 
> 
> class ExtractionStrategy:
>     """
>     Base class for the extraction strategies.
>     """
> 
>     @staticmethod
>     def extract_document_data(file_path : str) -> DocumentData:
>         pass
> 
> 
> class DOCXExtractionStrategy(ExtractionStrategy):
>     """
>     It implements the TOC and contents extraction from a DOCX document.
>     """
> 
>     @staticmethod
>     def extract_document_data(file_path : str) -> DocumentData:
>         document = docx.Document(file_path)
>         body_elements = document._body._body
>         # Selecting only the <w:t> elements from DOCX XML,
>         # as they're the only to contain some text.
>         text_elems = body_elements.xpath('.//w:t')
>         return DocumentData(document_text = ' '.join([elem.text for elem in
> text_elems]))
> 
> 
> class PDFExtractionStrategy(ExtractionStrategy):
>     """
>     It implements the TOC and contents extraction from a PDF document.
>     """
> 
>     @staticmethod
>     def parse_toc(doc : PDFDocument) -> list:
>         raw_toc = []
>         try:
>             outlines = doc.get_outlines()
>             for (level, title, dest, a, se) in outlines:
>                 raw_toc.append((level, title))
>         except PDFNoOutlines:
>             pass
>         return PDFExtractionStrategy.build_toc_tree(raw_toc)
> 
>     @staticmethod
>     def build_toc_tree(items : list) -> list:
>         """
>         Builds the TOC tree from a list of TOC items.
> 
>         @type items: list
>         @param items: The TOC items.
>         Each item must have the following format: (<item depth>, <item
> description>).
>         E.g: [(1, 'Contents'), (2, 'Chapter 1'), (2, 'Chapter 2')]
>         @rtype: list
>         @returns: The TOC tree. The tree hasn't a root element, therefore it
>         actually is a list.
>         """
> 
>         toc = []
>         if items is None or len(items) == 0:
>             return toc
>         current_toc_level = toc
>         # Using an explicit stack containing the lists corresponding to
>         # the various levels of the TOC, to simulate the recursive building
>         # of the TOC tree in a more efficient way
>         toc_levels_stack = []
>         toc_levels_stack.append(current_toc_level)
> 
>         # Each TOC item can be inserted into the current TOC level as
>         # string (just the item description) or as dict, where the key is
>         # the item description and the value is a list containing the
>         # children TOC items.
>         # To correctly determine how to insert the current item into
>         # the current level, a kind of look-ahead is needed, that is
>         # the depth of the next item has to be considered.
> 
>         # Initializing the variables related to the previous item.
>         prev_item_depth, prev_item_desc = items[0]
>         # Adding a fake final item in order to handle all the TOC items
>         # inside the cycle.
>         items.append((-1, ''))
> 
>         for i in range(1, len(items)):
>             # In fact each iteration handles the item of the previous
>             # one, using the current item to determine how to insert
>             # the previous item into the current TOC level,
>             # as explained before.
>             curr_item = items[i]
>             curr_item_depth = curr_item[0]
> 
>             if curr_item_depth == prev_item_depth:
>                 # The depth of the current item is the same
>                 # as the previous one.
>                 # Inserting the previous item into the current TOC level
>                 # as string.
>                 current_toc_level.append(prev_item_desc)
>             elif curr_item_depth == prev_item_depth + 1:
>                 # The depth of the current item is increased by 1 compared
> to
>                 # the previous one.
>                 # Inserting the previous item into the current TOC level
>                 # as dict.
>                 prev_item_dict = { prev_item_desc : [] }
>                 current_toc_level.append(prev_item_dict)
>                 # Updating the current TOC level with the newly created one
>                 # which contains the children of the previous item.
>                 current_toc_level = prev_item_dict[prev_item_desc]
>                 toc_levels_stack.append(current_toc_level)
>             elif curr_item_depth < prev_item_depth:
>                 # The depth of the current item is lesser than
>                 # the previous one.
>                 # Inserting the previous item into the current TOC level
>                 # as string.
>                 current_toc_level.append(prev_item_desc)
>                 if i < len(items)-1:
>                     # Executing these steps for all the items except the
> last one
>                     depth_diff = prev_item_depth - curr_item_depth
>                     # Removing from the stack as many TOC levels as the
> difference
>                     # between the depth of the previous item and the depth
> of the
>                     # current one.
>                     for i in range(0, depth_diff):
>                         toc_levels_stack.pop()
>                     # Updating the current TOC level with the one contained
> in
>                     # the head of the stack.
>                     current_toc_level = toc_levels_stack[-1]
>             # Updating the previous item with the current one
>             prev_item_depth, prev_item_desc = curr_item
> 
>         return toc
> 
>     @staticmethod
>     def from_bytestring(s) -> str:
>         """
>         If the input string is a byte-string, converts it to a string using
>         UTF-8 as encoding.
> 
>         @param s: A string or a byte-string.
>         @rtype: str
>         @returns: The potentially converted string.
>         """
> 
>         if s:
>             if isinstance(s, str):
>                 return s
>             else:
>                 return s.encode('utf-8')
> 
>     @staticmethod
>     def parse_layout_nodes(container : LTContainer) -> str:
>         """
>         Recursively extracts the text from all the nodes contained in the
>         input PDF layout tree/sub-tree.
> 
>         @type container: LTContainer
>         @param container: The PDF layout tree/sub-tree from which to
> extract the text.
>         @rtype: str
>         @returns: A string containing the extracted text.
>         """
> 
>         text_content = []
> 
>         # The iterator returns the children nodes.
>         for node in container:
>             if isinstance(node, LTTextContainer):
>                 # Only nodes of type LTTextContainer contain text.
> 
> text_content.append(PDFExtractionStrategy.from_bytestring(node.get_text()))
>             elif isinstance(node, LTContainer):
>                 # Recursively calling the method on the current node, which
> is a container itself.
> 
> text_content.append(PDFExtractionStrategy.parse_layout_nodes(node))
>             else:
>                 # Ignoring all the other node types.
>                 pass
> 
>         # Joining all the extracted text chunks with a new line character.
>         return "\n".join(text_content)
> 
>     @staticmethod
>     def parse_pages(doc : PDFDocument) -> list:
>         rsrcmgr = PDFResourceManager()
>         laparams = LAParams()
>         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
>         interpreter = PDFPageInterpreter(rsrcmgr, device)
> 
>         text_content = []
>         for i, page in enumerate(PDFPage.create_pages(doc)):
>             interpreter.process_page(page)
>             layout = device.get_result()
>             # Extracts the text from all the nodes of the PDF layout tree
> of each page
> 
> text_content.append(PDFExtractionStrategy.parse_layout_nodes(layout))
> 
>         return text_content
> 
>     @staticmethod
>     def parse_pdf(file_path : str) -> (list, list):
>         toc = []
>         pages = []
>         try:
>             fp = open(file_path, 'rb')
>             parser = PDFParser(fp)
>             doc = PDFDocument(parser)
>             parser.set_document(doc)
> 
>             if doc.is_extractable:
>                 toc = PDFExtractionStrategy.parse_toc(doc)
>                 pages = PDFExtractionStrategy.parse_pages(doc)
> 
>             fp.close()
>         except IOError:
>             pass
>         return (toc, pages)
> 
>     @staticmethod
>     def extract_document_data(file_path : str) -> DocumentData:
>         toc, pages = PDFExtractionStrategy.parse_pdf(file_path)
>         return DocumentData(toc, pages = pages)
> 
> 
> class DocumentDataExtractor:
>     """
>     Main class of the module.
>     It's responsible for actually executing the text extraction.
>     The output is constituted by the following items:
>     -table of contents (TOC);
>     -pages contents.
>     """
> 
>     # Dictionary containing the extraction strategies for the different
>     # document types, indexed by the corresponding DocumentType enum values.
>     strategies = {
>         DocumentType.DOCX : DOCXExtractionStrategy(),
>         DocumentType.PDF : PDFExtractionStrategy()
>                  }
> 
>     @classmethod
>     def extract_document_data(cls, file_path : str) -> DocumentData:
>         """
>         Entry point of the module, it extracts the data from the document
>         whose path is passed as input.
>         The extraction strategy is automatically chosen based on the MIME
> type
>         of the file.
> 
>         @type file_path: str
>         @param file_path: The path of the document to be parsed.
>         @rtype: DocumentData
>         @returns: An object containing the data of the parsed document.
>         """
> 
>         mime = magic.Magic(mime=True)
>         mime_type = mime.from_file(file_path)
>         document_type = DocumentType.get_instance(mime_type)
>         strategy = cls.strategies[document_type]
>         return strategy.extract_document_data(file_path)



More information about the Python-list mailing list