{"id":27275,"library":"pyddq","title":"pyddq","description":"Python API for Drunken Data Quality (DDQ), a data quality validation library for Apache Spark DataFrames. Current version 5.0.0, supports Spark 2.2.1 and Python 3. Last release in 2017; project appears stable with no recent updates.","status":"deprecated","version":"5.0.0","language":"python","source_language":"en","source_url":"https://github.com/FRosner/drunken-data-quality","tags":["data-quality","spark","validation"],"install":[{"cmd":"pip install pyddq","lang":"bash","label":"pip install"}],"dependencies":[],"imports":[{"note":"Direct import from pyddq is the standard method.","wrong":"import pyddq; pyddq.Constraint","symbol":"Constraint","correct":"from pyddq import Constraint"},{"note":"Check class is inside the runner module, not top-level.","wrong":"from pyddq import Check","symbol":"Check","correct":"from pyddq.runner import Check"},{"note":"Runner must be imported from the runner submodule.","wrong":"import pyddq; pyddq.Runner","symbol":"Runner","correct":"from pyddq.runner import Runner"}],"quickstart":{"code":"from pyddq import Constraint\nfrom pyddq.runner import Runner, Check\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.appName('ddq_example').getOrCreate()\ndf = spark.createDataFrame([('Alice', 34), ('Bob', 45), ('Charlie', 28)], ['name', 'age'])\n\nconstraint = Constraint(name='age_not_null', condition=\"age IS NOT NULL\")\ncheck = Check(df, [constraint])\nrunner = Runner()\nresults = runner.run(check)\nfor r in results:\n    print(r.result, r.constraint_name)","lang":"python","description":"Quickstart: create a Spark DataFrame, define a constraint, run a check with Runner."},"warnings":[{"fix":"Ensure SparkSession is created before using pyddq functions.","message":"pyddq requires a running SparkSession. Without initializing Spark, imports will fail or hang.","severity":"gotcha","affected_versions":"all"},{"fix":"Use SparkSession instead of SparkContext when creating DataFrame.","message":"The API uses Spark 2.0+ SparkSession; older SparkContext-based code will not work.","severity":"breaking","affected_versions":"4.0.0+"},{"fix":"Consider alternatives like great_expectations or Deequ for modern Spark setups.","message":"The library is no longer actively maintained. No updates since 2017; may not work with newer Spark versions.","severity":"deprecated","affected_versions":"5.0.0"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Run 'pip install pyddq' and ensure it's in the correct Python environment.","cause":"pyddq is not installed or installed incorrectly.","error":"ImportError: cannot import name 'Constraint' from 'pyddq'"},{"fix":"Create a SparkSession before running checks: spark = SparkSession.builder.appName('test').getOrCreate()","cause":"Spark session not initialized or incompatible Spark version.","error":"Py4JJavaError: An error occurred while calling o123.run."},{"fix":"Import as: from pyddq.runner import Runner","cause":"Wrong import path; Runner is in the runner submodule.","error":"AttributeError: module 'pyddq' has no attribute 'Runner'"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}