{"id":21098,"library":"dask-ml","title":"dask-ml","description":"A library for distributed and parallel machine learning built on top of Dask and scikit-learn. Current version is 2025.1.0, with releases roughly a few times a year.","status":"active","version":"2025.1.0","language":"python","source_language":"en","source_url":"https://github.com/dask/dask-ml","tags":["machine learning","distributed","dask","scikit-learn","scalable"],"install":[{"cmd":"pip install dask-ml","lang":"bash","label":"default"},{"cmd":"conda install -c conda-forge dask-ml","lang":"bash","label":"conda"}],"dependencies":[{"reason":"core distributed computing framework","package":"dask","optional":false},{"reason":"API compatibility and underlying algorithms","package":"scikit-learn","optional":false},{"reason":"data handling","package":"pandas","optional":false},{"reason":"array operations","package":"numpy","optional":false},{"reason":"optional distributed scheduler","package":"distributed","optional":true}],"imports":[{"note":"wrong module path","wrong":"from dask_ml.models import LogisticRegression","symbol":"LogisticRegression","correct":"from dask_ml.linear_model import LogisticRegression"},{"note":"sklearn's API returns numpy arrays","wrong":"from sklearn.model_selection import train_test_split","symbol":"train_test_split","correct":"from dask_ml.model_selection import train_test_split"},{"note":"sklearn's StandardScaler does not work on Dask arrays","wrong":"from sklearn.preprocessing import StandardScaler","symbol":"StandardScaler","correct":"from dask_ml.preprocessing import StandardScaler"}],"quickstart":{"code":"import dask.dataframe as dd\nfrom dask_ml.linear_model import LogisticRegression\nfrom dask_ml.model_selection import train_test_split\n\n# Create a Dask DataFrame from a CSV\ndf = dd.read_csv('data.csv')\nX = df[['feature1', 'feature2']].to_dask_array(lengths=True)\ny = df['label'].to_dask_array(lengths=True)\n\n# Train-test split\nX_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.2, random_state=42)\n\n# Fit model\nmodel = LogisticRegression()\nmodel.fit(X_train, y_train)\n\n# Score\naccuracy = model.score(X_test, y_test)\nprint(accuracy.compute())","lang":"python","description":"Basic usage: load data with Dask, split, train a logistic regression model, and compute accuracy."},"warnings":[{"fix":"Update dask to latest or pin dask-ml<2024.3.20.","message":"dask-ml 2024.3.20+ requires dask-expr; old dataframes from dask<2024.3 may break.","severity":"breaking","affected_versions":">=2024.3.20"},{"fix":"Consider using sklearn's KMeans on dask arrays via map_blocks.","message":"dask_ml.cluster.KMeans is deprecated and is being replaced by dask array's native k-means.","severity":"deprecated","affected_versions":">=2025.1.0"},{"fix":"Convert the pandas DataFrame to a Dask DataFrame using dask.dataframe.from_pandas() before splitting.","message":"train_test_split from dask_ml.model_selection expects Dask collections (arrays or DataFrames); passing a pandas DataFrame gives unexpected results.","severity":"gotcha","affected_versions":"all"},{"fix":"Call .compute_chunk_sizes() on the Dask array before fitting.","message":"Many estimators do not support Dask arrays with unknown chunk sizes; call .compute_chunk_sizes() first.","severity":"gotcha","affected_versions":"all"}],"env_vars":null,"last_verified":"2026-04-27T00:00:00.000Z","next_check":"2026-07-26T00:00:00.000Z","problems":[{"fix":"Ensure all input Dask arrays have known chunk sizes: X = X.compute_chunk_sizes()","cause":"Unknown or mismatched chunk sizes in the Dask array during fit.","error":"ValueError: could not broadcast input array from shape (X,) into shape (Y,)"},{"fix":"Convert using ddf = dask.dataframe.from_pandas(df, npartitions=...)","cause":"Passed a pandas DataFrame to dask_ml.model_selection functions.","error":"TypeError: Only Dask DataFrames are supported; got pandas DataFrame"},{"fix":"Use correct submodule, e.g., from dask_ml.linear_model 
import LogisticRegression","cause":"Wrong import path for estimators.","error":"ModuleNotFoundError: No module named 'dask_ml.models'"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}