Parse

yaml
# Fully qualified task type. Spelled with a capital "P" so it matches the
# `type` value used by the example flows and the definition names on this page.
type: "io.kestra.plugin.tika.Parse"

Examples

yaml
# Example flow: run the Tika Parse task on a file passed in as a flow input,
# with `extractEmbedded` enabled.
id: tika_parse
namespace: company.team

inputs:
  # Input of type FILE; referenced below as `inputs.file`.
  - id: file
    type: FILE

tasks:
  - id: parse
    type: io.kestra.plugin.tika.Parse
    # Source to parse: the `file` input declared above.
    from: '{{ inputs.file }}'
    # Overrides the documented default (false).
    extractEmbedded: true
    # Overrides the documented default (true).
    store: false

yaml
# Example flow: run the Tika Parse task on a file passed in as a flow input,
# with an OCR strategy selected.
id: tika_parse
namespace: company.team

inputs:
  # Input of type FILE; referenced below as `inputs.file`.
  - id: file
    type: FILE

tasks:
  - id: parse
    type: io.kestra.plugin.tika.Parse
    # Source to parse: the `file` input declared above.
    from: '{{ inputs.file }}'
    ocrOptions:
      # Overrides the documented default strategy (NO_OCR).
      strategy: OCR_AND_TEXT_EXTRACTION
    # Same as the documented default (true); stated explicitly here.
    store: true

yaml
# Example flow: download an image over HTTP, then run the Tika Parse task on
# the downloaded file with an OCR strategy selected.
id: parse-image-metadata-using-apache-tika
namespace: company.team

tasks:
  # Step 1: fetch the image; its `uri` output is consumed by the next task.
  - id: get_image
    type: io.kestra.plugin.core.http.Download
    uri: https://kestra.io/blogs/2023-05-31-beginner-guide-kestra.jpg

  # Step 2: parse the file produced by `get_image`.
  - id: tika
    type: io.kestra.plugin.tika.Parse
    from: "{{ outputs.get_image.uri }}"
    # Overrides the documented default (true).
    store: false
    # Overrides the documented default content type (XHTML).
    contentType: TEXT
    ocrOptions:
      # Overrides the documented default strategy (NO_OCR).
      strategy: OCR_AND_TEXT_EXTRACTION

yaml
# Example flow: download a PDF over HTTP, parse it with the Tika Parse task,
# and log the `content` field of the parse result.
id: parse-pdf
namespace: company.team

tasks:
  # Step 1: fetch the PDF; its `uri` output is consumed by the next task.
  - id: download_pdf
    type: io.kestra.plugin.core.http.Download
    uri: https://huggingface.co/datasets/kestra/datasets/resolve/main/pdf/app_store.pdf

  # Step 2: parse the file produced by `download_pdf`.
  - id: parse_text
    type: io.kestra.plugin.tika.Parse
    from: "{{ outputs.download_pdf.uri }}"
    # Overrides the documented default content type (XHTML).
    contentType: TEXT
    # Overrides the documented default (true).
    # NOTE(review): presumably required so that `result` is populated inline
    # for the log task below — confirm against the plugin documentation.
    store: false

  # Step 3: log the `content` string of the `result` output from `parse_text`.
  - id: log_extracted_text
    type: io.kestra.plugin.core.log.Log
    message: "{{ outputs.parse_text.result.content }}"

Properties

charactersLimit integer | string

contentType string

Default XHTML

Possible Values

TEXT, XHTML, XHTML_NO_HEADER

extractEmbedded boolean | string

Default false

from string

ocrOptions Parse-OcrOptions

Default

{
  "strategy": "NO_OCR"
}

store boolean | string

Default true

Outputs

result Parse-Parsed

uri string

Format uri

Definitions

io.kestra.plugin.tika.Parse-OcrOptions

enableImagePreprocessing boolean | string

language string

strategy string

Default NO_OCR

Possible Values

AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION

io.kestra.plugin.tika.Parse-Parsed

content string

embedded object

SubType string

metadata object

Parse

Parse