PythonSubmit

yaml
type: "io.kestra.plugin.spark.PythonSubmit"
yaml
id: spark_python_submit
namespace: company.team

tasks:
  # Submits the inline PySpark script in `mainScript` to the Spark master
  # configured below.
  - id: python_submit
    type: io.kestra.plugin.spark.PythonSubmit
    containerImage: bitnami/spark
    taskRunner:
      type: io.kestra.plugin.scripts.runner.docker.Docker
      # NOTE(review): host networking is presumably what lets the container
      # reach the master at localhost:7077 -- confirm for your environment.
      networkMode: host
      user: root
    master: spark://localhost:7077
    # Script arguments; the script reads the first one (sys.argv[1]) as the
    # partition count.
    args:
      - "10"
    mainScript: |
      import sys
      from random import random
      from operator import add
      from pyspark.sql import SparkSession


      if __name__ == "__main__":
          spark = SparkSession.builder.appName("PythonPi").getOrCreate()

          # Partition count comes from the first CLI argument; defaults to 2.
          partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
          n = 100000 * partitions

          def f(_: int) -> int:
              """Return 1 if a random point in the [-1, 1] square falls
              inside the unit circle, else 0."""
              x = random() * 2 - 1
              y = random() * 2 - 1
              return 1 if x ** 2 + y ** 2 <= 1 else 0

          # Monte Carlo estimate: the fraction of hits approximates pi / 4.
          count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
          print("Pi is roughly %f" % (4.0 * count / n))

          spark.stop()
Properties
SubType string
SubType string
SubType string
Default bitnami/spark
Default CLIENT
Possible Values
CLIENT, CLUSTER
SubType string
SubType string
Possible Values
PROCESS, DOCKER
Default spark-submit
Default false
Default 0
SubType string
Default busybox
Default true
Default { "image": "busybox" }
Default default
Default ALWAYS
Possible Values
IF_NOT_PRESENT, ALWAYS, NEVER
Default true
Validation RegExp \d+\.\d+\.\d+(-[a-zA-Z0-9-]+)?|([a-zA-Z0-9]+)
Default PT5S
Format duration
Default PT1H
Format duration
Default PT10M
Format duration
Default PT5S
Format duration
Default true
Default true
Default false
Validation RegExp \d+\.\d+\.\d+(-[a-zA-Z0-9-]+)?|([a-zA-Z0-9]+)
Default PT1H
Format duration
SubType integer
Default PT5S
Format duration
Default true
Default true
SubType string
Default ["https://www.googleapis.com/auth/cloud-platform"]
Validation RegExp \d+\.\d+\.\d+(-[a-zA-Z0-9-]+)?|([a-zA-Z0-9]+)
Default PT5S
Format duration
Default PT1H
Format duration
Possible Values
ACTION_UNSPECIFIED, RETRY_TASK, FAIL_TASK, UNRECOGNIZED
Default v1
Default RSA
Default https://kubernetes.default.svc
Validation RegExp \d+\.\d+\.\d+(-[a-zA-Z0-9-]+)?|([a-zA-Z0-9]+)
Default PT5S
Format duration
Default true
SubType string
Default e2-medium
Default 2
Minimum >= 0
Maximum <= 10
Default true
SubType string
Default ["https://www.googleapis.com/auth/cloud-platform"]
Validation RegExp \d+\.\d+\.\d+(-[a-zA-Z0-9-]+)?|([a-zA-Z0-9]+)
Default PT5S
Format duration
Default PT1H
Format duration
Min length 1
SubType
SubType string
SubType string
Default IF_NOT_PRESENT
Possible Values
IF_NOT_PRESENT, ALWAYS, NEVER
SubType string
Default true
SubType
SubType string
Default [ "" ]
SubType string
Default VOLUME
Possible Values
MOUNT, VOLUME
Default PT0S
Format duration
SubType string
Default IF_NOT_PRESENT
Possible Values
IF_NOT_PRESENT, ALWAYS, NEVER
Validation RegExp \d+\.\d+\.\d+(-[a-zA-Z0-9-]+)?|([a-zA-Z0-9]+)
SubType string
Default true
SubType array
SubType string
SubType string
Default PT5S
Format duration
Default true
Default { "request": { "memory": "2048", "cpu": "1" } }
Default true
Default PT15M
Format duration
Validation RegExp \d+\.\d+\.\d+(-[a-zA-Z0-9-]+)?|([a-zA-Z0-9]+)
Default PT1H
Format duration