{"id":28104,"library":"pyspark-regression","title":"PySpark Regression","description":"PySpark Regression is a Python library for regression testing Spark DataFrames, enabling comparison of outputs between different runs or environments. Version 4.2.4 is current; release cadence is irregular but active.","status":"active","version":"4.2.4","language":"python","source_language":"en","source_url":"https://github.com/forrest-bajbek/pyspark-regression","tags":["spark","pyspark","testing","regression-testing"],"install":[{"cmd":"pip install pyspark-regression","lang":"bash","label":"Install via pip"}],"dependencies":[{"reason":"Core dependency for Spark DataFrame operations.","package":"pyspark","optional":false}],"imports":[{"note":"Common mistake: assuming RegressionTest is in a submodule; it's directly in the top-level package.","wrong":"from pyspark_regression.testing import RegressionTest","symbol":"RegressionTest","correct":"from pyspark_regression import RegressionTest"}],"quickstart":{"code":"from pyspark.sql import SparkSession\nfrom pyspark_regression import RegressionTest\n\nspark = SparkSession.builder.appName('test').getOrCreate()\ndf1 = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'val'])\ndf2 = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'val'])\n\ntester = RegressionTest()\nresult = tester.compare(df1, df2, key='id')\nprint(result)  # Should indicate equality","lang":"python","description":"Basic usage: compare two Spark DataFrames with a key column."},"warnings":[{"fix":"Replace `compare_dataframes(df1, df2, key)` with `RegressionTest().compare(df1, df2, key=key)`.","message":"In version 2.x, the API used `compare_dataframes()`; this was removed in 3.0+. Use `RegressionTest.compare()` instead.","severity":"breaking","affected_versions":">=3.0.0"},{"fix":"Ensure key column contains unique values, or use additional columns to form a composite key.","message":"The key column must have unique values in each DataFrame; non-unique keys will raise an error.","severity":"gotcha","affected_versions":"all"},{"fix":"Use `df.select(*ordered_columns)` before comparing to enforce column order.","message":"Both DataFrames must have identical column order; differences in column order cause match failures.","severity":"gotcha","affected_versions":"all"},{"fix":"Replace `ignore_nulls=True` with `allow_null_mismatch=True`.","message":"The `ignore_nulls` parameter is deprecated starting 4.0.0 and will be removed; use `allow_null_mismatch` instead.","severity":"deprecated","affected_versions":">=4.0.0"}],"env_vars":null,"last_verified":"2026-05-09T00:00:00.000Z","next_check":"2026-08-07T00:00:00.000Z","problems":[{"fix":"Install with pip install pyspark-regression and import using from pyspark_regression import RegressionTest.","cause":"The package is not installed in the active environment. It is installed as pyspark-regression (hyphen) but imported as pyspark_regression (underscore).","error":"ModuleNotFoundError: No module named 'pyspark_regression'"},{"fix":"Use from pyspark_regression import RegressionTest directly.","cause":"Using wrong import path; RegressionTest is not in a submodule.","error":"AttributeError: module 'pyspark_regression' has no attribute 'RegressionTest'"},{"fix":"Drop duplicates or use a different key/combination of columns that yields unique rows.","cause":"Key column contains non-unique values.","error":"pyspark.errors.exceptions.capture.SparkException: Found duplicate keys in DataFrame"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}