{"library":"soda-core-spark-df","title":"Soda Core Spark DataFrame Connector","type":"library","description":"Soda Core Spark DF is a Soda Core package that enables Soda Core to connect to Spark DataFrames as a data source. It allows users to define and run data quality checks directly on Spark DataFrames, making it suitable for data pipelines that operate within a Spark environment. Current version is 3.5.6. Releases follow Soda Core's cadence, typically monthly or bi-monthly for minor versions.","language":"python","status":"active","last_verified":"2026-05-21","install":{"commands":["pip install soda-core-spark-df pyspark"],"cli":{"name":"soda","version":""}},"imports":["from soda.scan import Scan","from soda.spark_df_data_source import SparkDfDataSource"],"auth":{"required":false,"env_vars":[]},"links":{"homepage":"https://www.soda.io","github":null,"docs":null,"changelog":null,"pypi":"https://pypi.org/project/soda-core-spark-df/","npm":null,"openapi_spec":null,"status_page":null,"smithery":null},"quickstart":{"code":"from pyspark.sql import SparkSession\nfrom soda.scan import Scan\nfrom soda.spark_df_data_source import SparkDfDataSource\n\n# 1. Prepare your Spark DataFrame\nspark = SparkSession.builder.appName(\"SodaSparkTest\").getOrCreate()\ndata = [(\"Alice\", 1), (\"Bob\", 2), (\"Charlie\", None)]\ncolumns = [\"name\", \"id\"]\ndf = spark.createDataFrame(data, columns)\n\n# 2. Configure Soda Core and register the Spark DataFrame\nscan = Scan()\nscan.add_configuration_yaml_str(\n    f\"\"\"\ndata_source spark_df_source:\n  type: spark_df\n\"\"\"\n)\nscan.add_spark_session(spark) # Pass the active SparkSession\n\n# 3. Add the DataFrame to the scan as a data source\nspark_df_data_source = SparkDfDataSource(spark=spark, data_frame=df, data_source_name=\"spark_df_source\", table_name=\"my_spark_df_table\")\nscan.add_data_source(spark_df_data_source)\n\n# 4. 
Define your checks (e.g., in a checks.yaml or programmatically)\nscan.add_sodacl_yaml_str(\n    \"\"\"\nchecks_for my_spark_df_table:\n  - row_count > 0\n  - missing_count(id) = 1\n  - column_count = 2\n\"\"\"\n)\n\n# 5. Execute the scan\nscan.execute()\n\n# 6. Print scan results\nprint(scan.get_logs_text())\nif scan.has_failures():\n    print(\"Scan finished with failures.\")\n    exit(1)\nelif scan.has_warnings():\n    print(\"Scan finished with warnings.\")\n    exit(0)\nelse:\n    print(\"Scan finished successfully.\")\n","lang":"python","description":"This quickstart demonstrates how to set up a Spark Session, create a DataFrame, initialize a Soda `Scan` object, register the DataFrame as a data source using `SparkDfDataSource`, define basic data quality checks, and execute the scan to get results.","tag":null,"tag_description":null,"last_tested":"2026-04-25","results":[{"runtime":"python:3.10-alpine","exit_code":-1},{"runtime":"python:3.10-slim","exit_code":-1},{"runtime":"python:3.11-alpine","exit_code":-1},{"runtime":"python:3.11-slim","exit_code":-1},{"runtime":"python:3.12-alpine","exit_code":-1},{"runtime":"python:3.12-slim","exit_code":-1},{"runtime":"python:3.13-alpine","exit_code":-1},{"runtime":"python:3.13-slim","exit_code":-1},{"runtime":"python:3.9-alpine","exit_code":-1},{"runtime":"python:3.9-slim","exit_code":-1}]},"compatibility":{"tag":null,"tag_description":null,"last_tested":"2026-05-21","installed_version":"3.5.6","pypi_latest":"3.5.6","is_stale":false,"summary":{"python_range":"3.9–3.13","success_rate":50,"avg_install_s":37.6,"avg_import_s":1.61,"wheel_type":"sdist"},"results":[{"runtime":"python:3.10-alpine","python_version":"3.10","os_libc":"alpine (musl)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"clean","install_time_s":null,"import_time_s":1.55,"mem_mb":27.5,"disk_size":"393.6M"},{"runtime":"python:3.10-alpine","python_version":"3.10","os_libc":"alpine 
(musl)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.10-slim","python_version":"3.10","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"clean","install_time_s":38.9,"import_time_s":1.02,"mem_mb":26.2,"disk_size":"394M"},{"runtime":"python:3.10-slim","python_version":"3.10","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.11-alpine","python_version":"3.11","os_libc":"alpine (musl)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"clean","install_time_s":null,"import_time_s":2.71,"mem_mb":30.1,"disk_size":"402.6M"},{"runtime":"python:3.11-alpine","python_version":"3.11","os_libc":"alpine (musl)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.11-slim","python_version":"3.11","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"clean","install_time_s":36.1,"import_time_s":1.64,"mem_mb":28.9,"disk_size":"403M"},{"runtime":"python:3.11-slim","python_version":"3.11","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.12-alpine","python_version":"3.12","os_libc":"alpine 
(musl)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"broken","install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":"391.1M"},{"runtime":"python:3.12-alpine","python_version":"3.12","os_libc":"alpine (musl)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.12-slim","python_version":"3.12","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"broken","install_time_s":36.1,"import_time_s":null,"mem_mb":null,"disk_size":"392M"},{"runtime":"python:3.12-slim","python_version":"3.12","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.13-alpine","python_version":"3.13","os_libc":"alpine (musl)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"broken","install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":"388.7M"},{"runtime":"python:3.13-alpine","python_version":"3.13","os_libc":"alpine (musl)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.13-slim","python_version":"3.13","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"broken","install_time_s":35.8,"import_time_s":null,"mem_mb":null,"disk_size":"389M"},{"runtime":"python:3.13-slim","python_version":"3.13","os_libc":"slim 
(glibc)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.9-alpine","python_version":"3.9","os_libc":"alpine (musl)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"clean","install_time_s":null,"import_time_s":1.52,"mem_mb":27.7,"disk_size":"393.2M"},{"runtime":"python:3.9-alpine","python_version":"3.9","os_libc":"alpine (musl)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null},{"runtime":"python:3.9-slim","python_version":"3.9","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":0,"wheel_type":"sdist","failure_reason":null,"import_side_effects":"clean","install_time_s":41.1,"import_time_s":1.24,"mem_mb":26.4,"disk_size":"394M"},{"runtime":"python:3.9-slim","python_version":"3.9","os_libc":"slim (glibc)","variant":"soda-core-spark-df","exit_code":1,"wheel_type":null,"failure_reason":null,"import_side_effects":null,"install_time_s":null,"import_time_s":null,"mem_mb":null,"disk_size":null}]}}