{"id":27411,"library":"sparkxgb","title":"sparkxgb","description":"sparkxgb is a Python wrapper for XGBoost on Apache Spark, providing integration utilities for distributed training and prediction on Spark DataFrames. Version 0.4 is stable but infrequently updated; rely on the official XGBoost Spark integration for newer APIs.","status":"maintenance","version":"0.4","language":"python","source_language":"en","source_url":"https://github.com/dmlc/sparkxgb","tags":["xgboost","spark","machine-learning","distributed"],"install":[{"cmd":"pip install sparkxgb","lang":"bash","label":"PyPI install"}],"dependencies":[{"reason":"Required for Spark DataFrame operations","package":"pyspark","optional":false},{"reason":"Core XGBoost library","package":"xgboost","optional":false}],"imports":[{"note":"Standard import for classification/regression estimator","symbol":"XGBoostEstimator","correct":"from sparkxgb import XGBoostEstimator"},{"note":"Use for loaded model inference","symbol":"XGBoostClassificationModel","correct":"from sparkxgb import XGBoostClassificationModel"},{"note":"Newer versions of XGBoost have moved Spark integration to xgboost.spark; sparkxgb remains separate.","wrong":"from xgboost.spark import SparkXGBRegressor (newer API)","symbol":"XGBoostRegressor"}],"quickstart":{"code":"from pyspark.sql import SparkSession\nfrom sparkxgb import XGBoostClassifier\n\nspark = SparkSession.builder.appName('sparkxgb-example').getOrCreate()\ndf = spark.createDataFrame([(1.0, [0.1, 0.2]), (0.0, [0.3, 0.4])], ['label', 'features'])\nclassifier = XGBoostClassifier(max_depth=3, num_round=10)\nmodel = classifier.fit(df, params={'eval_metric': 'logloss'})\npredictions = model.transform(df)\npredictions.show()","lang":"python","description":"Train an XGBoost classifier on a Spark DataFrame with a feature vector column."},"warnings":[{"fix":"Downgrade PySpark to 2.4.x or use XGBoost's built-in Spark integration (xgboost >=1.7).","message":"sparkxgb 0.4 requires PySpark <3.0 due to API changes in Spark 3.0+. Check your Spark version.","severity":"breaking","affected_versions":"0.4"},{"fix":"Replace imports: from xgboost.spark import SparkXGBClassifier","message":"sparkxgb is no longer actively maintained; consider migrating to xgboost.spark (available from XGBoost 1.7+).","severity":"deprecated","affected_versions":"0.4"},{"fix":"Always use VectorAssembler to create the 'features' column: from pyspark.ml.feature import VectorAssembler; assembler = VectorAssembler(inputCols=[...], outputCol='features')","message":"Feature column must be of type VectorUDT (e.g., from VectorAssembler). Passing raw arrays fails silently with wrong predictions.","severity":"gotcha","affected_versions":"all"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Run: pip install sparkxgb","cause":"Package not installed or virtual environment not activated.","error":"ImportError: No module named sparkxgb"},{"fix":"Install PySpark 2.4.x: pip install pyspark==2.4.8","cause":"Incompatible PySpark version (sparkxgb 0.4 requires PySpark <3.0).","error":"Py4JJavaError: An error occurred while calling o1234.fit.\n: java.lang.NoSuchMethodError"},{"fix":"Use pyspark.ml.linalg.Vector and VectorAssembler from pyspark.ml.feature.","cause":"Using old mllib Vector instead of new ml Vector.","error":"IllegalArgumentException: requirement failed: Column features must be of type org.apache.spark.ml.linalg.VectorUDT but was actually org.apache.spark.mllib.linalg.VectorUDT."}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}