I am calling AWS Textract GetDocumentAnalysisRequest on a PDF document which I sent to Textract, which returns blocks of data. Then I am trying to convert the data into a key value map. The document was analyzed for FORM and TABLES. I was trying to port the example code provided in python into go to convert the blocks into KV Maps (https://github.com/awsdocs/aws-doc-sdk-examples/blob/master/python/example_code/textract/textract-python-kv-parser.py). It was working fine in python. In my code there are random spaces in the keys and values. I am assuming its due to this line in the code text = fmt.Sprintf("%s %s", text, *word.Text)
.
// Contains tells whether a contains x.
func Contains(a []*string, x string) bool {
for _, n := range a {
if x == *n {
return true
}
}
return false
}
type KeyValueBlock struct {
KeyMap map[string]textract.Block
ValueMap map[string]textract.Block
BlockMap map[string]textract.Block
}
func getKeyValueMap(blocks []*textract.Block) KeyValueBlock {
keyMap := map[string]textract.Block{}
valueMap := map[string]textract.Block{}
blockMap := map[string]textract.Block{}
for _, block := range blocks {
blockID := block.Id
blockMap[*blockID] = *block
if *block.BlockType == textract.BlockTypeKeyValueSet {
if Contains(block.EntityTypes, textract.EntityTypeKey) {
keyMap[*blockID] = *block
} else {
valueMap[*blockID] = *block
}
}
}
return KeyValueBlock{
KeyMap: keyMap,
ValueMap: valueMap,
BlockMap: blockMap,
}
}
func findValueBlock(keyBlock textract.Block, valueMap map[string]textract.Block) textract.Block {
var valueBlock textract.Block
for _, relationship := range keyBlock.Relationships {
if *relationship.Type == textract.EntityTypeValue {
for _, valueID := range relationship.Ids {
valueBlock = valueMap[*valueID]
}
}
}
return valueBlock
}
func getText(result textract.Block, blocksMap map[string]textract.Block) string {
var text string
for _, relationship := range result.Relationships {
if *relationship.Type == textract.RelationshipTypeChild {
for _, childID := range relationship.Ids {
word := blocksMap[*childID]
if *word.BlockType == textract.BlockTypeWord {
text = fmt.Sprintf("%s %s", text, *word.Text)
}
}
}
}
return text
}
func getKeyValueRelationship(keyValueBlock KeyValueBlock) {
keyValueMap := map[string]string{}
for _, keyBlock := range keyValueBlock.KeyMap {
valueBlock := findValueBlock(keyBlock, keyValueBlock.ValueMap)
key := getText(keyBlock, keyValueBlock.BlockMap)
val := getText(valueBlock, keyValueBlock.BlockMap)
keyValueMap[key] = val
}
log.Info(keyValueMap)
}
// Do takes a GetDocumentAnalysisInput and sends to textract
func Do(docAnalysisInput *textract.GetDocumentAnalysisInput, svc textractiface.TextractAPI) error {
// Sending a request using the StartDocumentAnalysisRequest method.
req, resp := svc.GetDocumentAnalysisRequest(docAnalysisInput)
err := req.Send()
if err == nil { // resp is now filled
log.Infof("Job completeted with status: %s", *resp.JobStatus)
}
data := getKeyValueMap(resp.Blocks)
getKeyValueRelationship(data)
return err
}
Getting: {"level":"info","msg":"map[ +: Test Key1 (total amount provided): 6002.00 A santance key name:: Some Text Some Key3: 0.00 Some Char ge: $ 5000.00 Som e Key6: 5.88 % Some sentence key: value word Some Long Key: Some Key2: # 552242]","time":"2019-04-16T19:06:18Z"}
Expected {"level":"info","msg":"map[Test Key1 (total amount provided): 6002.00 A santance key name: Some Text Some Key3: 0.00 Some Charge: $ 5000.00 Some Key6: 5.88 % Some sentence key: value word Some Long Key: Some Key2: # 552242]","time":"2019-04-16T19:06:18Z"}