{"id":24701,"library":"textract-trp","title":"Amazon Textract Parser (textract-trp)","description":"A parser for Amazon Textract results that converts the raw JSON response into a structured document model with pages, lines, words, tables, and forms. Version 0.1.3 (latest as of verification) supports Python >=3.6. The library provides high-level abstractions for navigating Textract output, including bounding boxes, confidence scores, and relationships between elements. It is maintained on GitHub by mludvig.","status":"active","version":"0.1.3","language":"python","source_language":"en","source_url":"https://github.com/mludvig/amazon-textract-parser","tags":["amazon-textract","aws","document-parsing","ocr","textract"],"install":[{"cmd":"pip install textract-trp","lang":"bash","label":"Standard pip install"}],"dependencies":[{"reason":"For calling Amazon Textract API and handling responses.","package":"boto3","optional":false}],"imports":[{"note":"Old paths used in early releases; current package is textract_trp.","wrong":"from textract.trp import TextractParser","symbol":"TextractParser","correct":"from textract_trp import TextractParser"},{"note":"TRP is the high-level document model class.","wrong":"","symbol":"TRP","correct":"from textract_trp import TRP"}],"quickstart":{"code":"import os\n\nimport boto3\nfrom textract_trp import TextractParser\n\n# Initialize Textract client\nclient = boto3.client('textract', region_name='us-east-1',\n    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', ''),\n    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', ''))\n\n# Analyze a document from S3\nresponse = client.analyze_document(\n    Document={'S3Object': {'Bucket': 'my-bucket', 'Name': 'document.pdf'}},\n    FeatureTypes=['TABLES', 'FORMS']\n)\n\n# Parse the response\nparser = TextractParser()\ndocument = parser.parse(response)\n\n# Iterate pages and lines\nfor page in document.pages:\n    for line in page.lines:\n        print(line.text)\n\n# Access tables\nfor page 
in document.pages:\n    for table in page.tables:\n        for row in table.rows:\n            print([cell.text for cell in row.cells])","lang":"python","description":"Parse Amazon Textract output into a structured document with pages, lines, tables, and forms."},"warnings":[{"fix":"Loop over responses by passing NextToken from previous response until NextToken is missing.","message":"The library does not handle pagination of Textract responses with multiple pages. You must call Textract with the 'NextToken' yourself and parse each response separately.","severity":"gotcha","affected_versions":"all"},{"fix":"Always use 'pip install textract-trp'. Do not confuse with 'textract' (general OCR) or 'amazon-textract-textractor'.","message":"The package is installed from PyPI with 'pip install textract-trp', while the import name is 'textract_trp'. Some older documentation references 'textract', which is a different library (for OCR).","severity":"deprecated","affected_versions":">=0.1.0"},{"fix":"Ensure you pass the response object directly from the boto3 client call, e.g., result = client.analyze_document(...); document = parser.parse(result).","message":"TextractParser.parse() expects the raw response dictionary from boto3, not the JSON string. Passing a string will cause JSONDecodeError or attribute errors.","severity":"gotcha","affected_versions":"all"},{"fix":"Use the .text property on line and word objects. For a plain list of strings, use a list comprehension: [line.text for line in page.lines].","message":"In version 0.1.3, the TRP class attributes have changed from previous releases. 'page.lines' and 'page.words' are now lists of objects, not lists of strings. Access .text on each object to get its string content.","severity":"breaking","affected_versions":">=0.1.0"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Run 'pip install textract-trp' and import as 'from textract_trp import TextractParser'.","cause":"Installed wrong package. 
'textract' is a different OCR library.","error":"ModuleNotFoundError: No module named 'textract'"},{"fix":"Use parser = TextractParser(); document = parser.parse(response) where response is the raw boto3 client response.","cause":"Accessed .pages on the raw response dictionary instead of first parsing it with TextractParser.parse().","error":"AttributeError: 'dict' object has no attribute 'pages'"},{"fix":"Check that the response has a 'Blocks' key. Call the Textract API correctly and ensure there are no errors in the response.","cause":"The input is not a valid Textract response. Possibly the JSON is malformed or is an error response.","error":"KeyError: 'Blocks' in JSON response"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}