{"id":24401,"library":"pyspark-data-sources","title":"PySpark Data Sources","description":"Custom Spark data sources for reading and writing data in Apache Spark, using the Python Data Source API. Current version: 0.1.11. Release cadence: irregular, low activity.","status":"active","version":"0.1.11","language":"python","source_language":"en","source_url":"https://github.com/jcrobak/pyspark-data-sources","tags":["spark","data-source","pyspark"],"install":[{"cmd":"pip install pyspark-data-sources","lang":"bash","label":"Install from PyPI"}],"dependencies":[{"reason":"Required to use Spark DataFrame API and DataSource API","package":"pyspark","optional":false}],"imports":[{"note":"The top-level module is `pyspark_datasources`, not `pyspark-data-sources` or `pyspark_data_sources` as a standalone module.","wrong":"import pyspark_data_sources","symbol":"DataSource","correct":"from pyspark_datasources import DataSource"}],"quickstart":{"code":"from pyspark.sql import SparkSession\nfrom pyspark_datasources import DataSource\n\nspark = SparkSession.builder.appName('example').getOrCreate()\ndf = spark.read.format('custom_source').load('path/to/data')\ndf.show()","lang":"python","description":"Create a Spark session and use a custom data source via format(). Replace 'custom_source' with the actual source name."},"warnings":[{"fix":"Pin to a specific version and test upgrades carefully.","message":"The library is in early development (0.1.x). API may change without notice.","severity":"gotcha","affected_versions":">=0.1.0,<1.0.0"},{"fix":"Use Python 3.9, 3.10, 3.11, or 3.12.","message":"Python 3.13 is not supported (requires <3.13). Users on 3.13 must downgrade or use alternatives.","severity":"deprecated","affected_versions":">=0.1.0"},{"fix":"Use `import pyspark_datasources` (note: singular 'datasources' in import).","message":"The package name uses hyphens (pyspark-data-sources) but the import uses underscores (pyspark_datasources). This mismatch can cause ImportError.","severity":"gotcha","affected_versions":"all"},{"fix":"Ensure Java 8/11 and Spark are installed and SPARK_HOME is set.","message":"The library requires Java and a Spark installation. Not a pure Python solution.","severity":"gotcha","affected_versions":"all"},{"fix":"Check the source code for the current DataSource abstract methods before implementing.","message":"The DataSource class API may change; custom data source implementations depend on internal interfaces.","severity":"breaking","affected_versions":">=0.1.0"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Use `from pyspark_datasources import DataSource` (note: no hyphen, 'datasources' not 'data_sources').","cause":"Using wrong import path or package name.","error":"ImportError: cannot import name 'DataSource' from 'pyspark_data_sources'"},{"fix":"Install PySpark: `pip install pyspark`.","cause":"PySpark is not installed.","error":"ModuleNotFoundError: No module named 'pyspark'"},{"fix":"Install Java 8 or 11 and set JAVA_HOME accordingly.","cause":"Java version mismatch with Spark installation.","error":"java.lang.UnsupportedClassVersionError: ... Unsupported major.minor version"},{"fix":"Ensure SparkSession remains active until all DataFrame operations complete.","cause":"SparkSession was stopped before data source operations.","error":"PySparkException: [INVALID_HANDLE.STATE] Cannot call methods on a stopped SparkSession."}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}