{"id":24402,"library":"pyspark-extension","title":"pyspark-extension","description":"A library providing useful extensions to Apache Spark, including DataFrame diff, column transformation utilities, Parquet metadata reading, Spark Connect support, and dependency installation helpers. Current version 2.15.0.4.1, supports Spark 3.2+, 4.0; release cadence is irregular with multiple releases per year.","status":"active","version":"2.15.0.4.1","language":"python","source_language":"en","source_url":"https://github.com/G-Research/spark-extension","tags":["apache-spark","pyspark","data-diff","dataframe","spark-extension"],"install":[{"cmd":"pip install pyspark-extension","lang":"bash","label":"Standard install"},{"cmd":"pip install pyspark-extension[scala]","lang":"bash","label":"With Scala/Java support"}],"dependencies":[{"reason":"Core dependency for Spark functionality","package":"pyspark","optional":false},{"reason":"Used for Java bridge","package":"py4j","optional":false}],"imports":[{"note":"Module is named 'spark_extension' not 'pyspark_extension'","wrong":"from pyspark_extension import diff","symbol":"diff","correct":"from spark_extension import diff"},{"note":"comparators is a submodule, not a top-level object","wrong":"from spark_extension import comparators","symbol":"comparators","correct":"from spark_extension.comparators import default_comparator"},{"note":"Encrypted parquet functions are in the parquet submodule","wrong":"from spark_extension import read_encrypted_parquet","symbol":"encrypted parquet support","correct":"from spark_extension.parquet import read_encrypted_parquet"}],"quickstart":{"code":"from pyspark.sql import SparkSession\nfrom spark_extension import diff\n\nspark = SparkSession.builder.appName(\"test\").getOrCreate()\ndf1 = spark.createDataFrame([(1, \"a\"), (2, \"b\")], [\"id\", \"val\"])\ndf2 = spark.createDataFrame([(1, \"a\"), (3, \"c\")], [\"id\", \"val\"])\nresult = diff(df1, df2)\nresult.show()","lang":"python","description":"Basic usage: diff two DataFrames"},"warnings":[{"fix":"Use Spark 3.2+ or install pyspark-extension==2.14.2","message":"Removed support for Spark 3.0 and 3.1 starting from version 2.15.0. Upgrade to Spark 3.2+ or pin pyspark-extension <2.15.0.","severity":"breaking","affected_versions":">=2.15.0"},{"fix":"Only use documented public API; check release notes for symbols that became private.","message":"All undocumented unintended public API parts were made private in version 2.15.0. Any use of internal symbols (e.g., _internal methods) will break.","severity":"breaking","affected_versions":">=2.15.0"},{"fix":"Install with [scala] extra or configure spark.jars","message":"The Java/Scala jar is required for some features (e.g., diff on DataFrames with complex types). Install with pip install pyspark-extension[scala] or manually add the jar to Spark session.","severity":"gotcha","affected_versions":"all"},{"fix":"Review column name quoting in any custom SQL using column names from this library.","message":"Backticks handling: In version 2.14.0, columns with special characters are quoted with backticks; columns with only alphanumerics and underscores are no longer quoted. This may break comparisons relying on quoted column names in SQL expressions.","severity":"deprecated","affected_versions":">=2.14.0"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Use 'import spark_extension' instead of 'import pyspark_extension'.","cause":"Incorrect import path; the module is named 'spark_extension' not 'pyspark_extension'.","error":"ModuleNotFoundError: No module named 'pyspark_extension'"},{"fix":"Install with scala extra: pip install pyspark-extension[scala] or add the jar manually via spark.jars.","cause":"Missing Java/Scala jar required for diff operation on certain DataFrame types.","error":"Py4JJavaError: An error occurred while calling o123.diff."},{"fix":"Use 'from spark_extension.comparators import ...' instead of 'spark_extension.comparators'.","cause":"Trying to import comparators as a top-level attribute.","error":"AttributeError: module 'spark_extension' has no attribute 'comparators'"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}