{"id":23530,"library":"dedupe","title":"Dedupe","description":"A Python library for accurate and scalable data deduplication and entity resolution. Version 3.0.3 requires Python >=3.8 and supports fuzzy matching, blocking, and active learning.","status":"active","version":"3.0.3","language":"python","source_language":"en","source_url":"https://github.com/dedupeio/dedupe","tags":["deduplication","entity-resolution","data-cleaning","fuzzy-matching"],"install":[{"cmd":"pip install dedupe","lang":"bash","label":"latest"}],"dependencies":[],"imports":[{"note":"The main class is not directly exported; use dedupe.Dedupe after importing the module.","wrong":"from dedupe import Dedupe","symbol":"Dedupe","correct":"import dedupe"}],"quickstart":{"code":"import dedupe\nimport csv\n\ndata_d = {}\nwith open('input.csv') as f:\n    reader = csv.DictReader(f)\n    for i, row in enumerate(reader):\n        data_d[i] = row\n\n# Initialize deduper\ndeduper = dedupe.Dedupe([{'field': 'name', 'type': 'String'},\n                         {'field': 'address', 'type': 'String'}])\n\n# Training (sample)\ndeduper.sample(data_d, 10000)\n# For labeled examples, use deduper.markPairs or load from file\n\ndeduper.train()\n\n# Cluster\nclustered = deduper.cluster(data_d)\nprint(clustered)","lang":"python","description":"Basic dedupe workflow: load data, define fields, sample, train, and cluster duplicates."},"warnings":[{"fix":"Update imports and method calls to match current API; refer to migration guide.","message":"In dedupe v2, the API changed significantly: 'Dedupe' class replaced old Dedupe function, and training/sampling methods were reworked.","severity":"breaking","affected_versions":"old v1.x code will not work with v2+"},{"fix":"Use 'import dedupe' instead of 'from dedupe import Dedupe'.","message":"The 'Dedupe' class and related functions are not imported as submodules; always use 'import dedupe' then access e.g. dedupe.Dedupe.","severity":"gotcha","affected_versions":"all"},{"fix":"Use deduper.markPairs or provide pre-labeled data.","message":"The 'ConsoleLabel' training method is deprecated in favor of programmatic labeling via 'markPairs'.","severity":"deprecated","affected_versions":">=3.0"}],"env_vars":null,"last_verified":"2026-05-01T00:00:00.000Z","next_check":"2026-07-30T00:00:00.000Z","problems":[{"fix":"Run 'pip install dedupe' and ensure the correct Python environment is activated.","cause":"Library not installed or installed in a different environment.","error":"ModuleNotFoundError: No module named 'dedupe'"},{"fix":"Use 'import dedupe' then access 'dedupe.Dedupe'.","cause":"Incorrect import pattern (e.g., 'from dedupe import Dedupe') or using an older version.","error":"AttributeError: module 'dedupe' has no attribute 'Dedupe'"},{"fix":"Ensure dictionary keys are integers (e.g., use enumerate).","cause":"Passing string keys to data_d but dedupe expects integer keys.","error":"TypeError: 'str' object cannot be interpreted as an integer"}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}