mirror of
https://github.com/open-metadata/OpenMetadata
synced 2026-05-24 09:39:11 +00:00
* feat(ingestion): add QuestDB database connector QuestDB speaks the PostgreSQL wire protocol but implements a minimal pg_catalog, so the default PG dialect queries fail on the CHAR->DOUBLE cast in pg_class.relkind. This connector routes SQLAlchemy inspection through information_schema and short-circuits constraint/index lookups (QuestDB has no PK/FK/unique/indexes), letting CommonDbSourceService handle the rest of the topology unchanged. - Fixed /qdb target in the psycopg2 URL regardless of databaseName (which remains the OpenMetadata display name) - get_database_names defaults to 'qdb' instead of 'default' - 12 unit tests + live-verified against QuestDB 9.3.5 on localhost:8812 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * refactor(questdb): address review feedback — rename to QuestDB, wire UI Code review fixes for PR #27604: Blockers resolved: - Rename Questdb -> QuestDB across schema, enum, Python classes, and all generated TS files. Matches peer connectors (PinotDB, DynamoDB) and the product's actual brand. Changing post-merge would be a breaking migration. - Remove sslConfig from schema. QuestDB's sslConfig was declared but never wired — ssl_manager.check_ssl_and_init is @singledispatch and has no QuestDBConnection registration, so enabling SSL in the UI was a silent no-op. Can be added in a follow-up with an explicit psycopg2 wiring. Warnings resolved: - authType now in schema's required array — was failing with opaque 401. - Delete dead queries.py (QUESTDB_TEST_GET_TABLES was defined but never imported). - Add bytea -> LargeBinary to the type map (verified via live information_schema probe against QuestDB 9.3.5 — all other native types normalize to standard PG names that were already mapped). - Complete type annotations on utils._get_table_names, _get_columns, _information_schema_type. 
- Dialect patch test now uses a real PGDialect_psycopg2 instance instead of a MagicMock dialect, so it catches signature drift against the real SQLAlchemy Inspector contract. Added a separate test that verifies get_table_names emits a query against information_schema.tables (not pg_catalog). - Add ingestion_logger() to utils.py with a debug log on dialect patching. - _empty_view_definition now returns None instead of "" to match how other dialects signal the absence of a DDL. Also fixes UI visibility (QuestDB was missing from the service picker): - Regenerate 15 TS enum files via json2ts.sh -> quicktype so the new DatabaseServiceType.QuestDB value flows through the UI. - Register service-icon-questdb.png in ServiceIconUtils.ts. - Add locales/en-US/Database/QuestDB.md connector form docs. - Add quicktype as a devDependency — json2ts.sh needs it and it wasn't installed. Docs: update skills/connector-building and skills/standards/registration to reflect reality — i18n locale files are not needed, icon + locale MD registration steps are, and Services.constant.ts is deprecated. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * skill * fix(questdb): restore databaseSchema field for test connection test_connection_db_schema_sources reads service_connection.databaseSchema directly with no hasattr guard. Removing it from the schema in the prior review fix broke GetTables and GetViews steps: 'QuestDBConnection' object has no attribute 'databaseSchema' Restored as an optional string with a clearer description (defaults to public when unset). 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * fix owners * add yaml * Update generated TypeScript types * Sync package.json and yarn.lock with main * Fix: ingestion files , Added Lineage for questdb tests and UI changes, Refactored code * FIX: python_checkstyle * Fix: test and unused param * Fix: yield_table enforcing tabletype to partition, Refactored lineage * Fix: Failing test and remove print statement * FIX: python_checkstyle and added error handling * FIX: Resolved comments * FIX: failing tests and schema cleaning * Minor change * Fix: Failing unit tests * Fix: Unit test unrelated changes ignored * FIX: tests * Fix: Failing test due to extra parameter in yaml --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Co-authored-by: Akash Verma <akashverma@Akashs-MacBook-Pro-2.local> Co-authored-by: Akash Verma <138790903+akashverma0786@users.noreply.github.com>
566 lines
19 KiB
Python
566 lines
19 KiB
Python
# Copyright 2025 Collate
|
|
# Licensed under the Collate Community License, Version 1.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Python Dependencies
|
|
"""
|
|
|
|
from typing import Dict, List, Set # noqa: UP035
|
|
|
|
from setuptools import setup
|
|
|
|
# Add here versions required for multiple plugins
# Each key is a short alias used as VERSIONS["<alias>"] by the requirement sets
# defined further down; each value is the pip requirement string for that alias.
VERSIONS = {
    "airflow": "apache-airflow==3.1.7",
    "adlfs": "adlfs>=2023.1.0",
    "aiobotocore": "aiobotocore~=2.26.0",
    "avro": "avro>=1.11.4,<1.12",
    "boto3": "boto3~=1.41.5",
    "cloud-sql-python-connector-pymysql": "cloud-sql-python-connector[pymysql]>=1.0.0,<2.0.0",
    "geoalchemy2": "GeoAlchemy2~=0.12",
    "google-cloud-monitoring": "google-cloud-monitoring>=2.0.0",
    "google-cloud-storage": "google-cloud-storage>=1.43.0",
    "gcsfs": "gcsfs~=2023.12.1",
    "great-expectations": "great-expectations~=0.18.0",
    "great-expectations-1xx": "great-expectations~=1.0",
    "grpc-tools": "grpcio-tools>=1.47.2",
    "ijson": "ijson~=3.4",
    "msal": "msal~=1.2",
    "neo4j": "neo4j~=5.3",
    "pandas": "pandas~=2.1.4",
    "pyarrow": "pyarrow~=16.0",
    "pydantic": "pydantic~=2.0,>=2.7.0,<2.12",  # Pin down to <2.12 due to breaking changes in 2.12.0
    "pydantic-settings": "pydantic-settings~=2.0,>=2.7.0",
    "pydomo": "pydomo~=0.3",
    "pymysql": "pymysql~=1.0",
    "pyodbc": "pyodbc~=5.3.0",
    "numpy": "numpy<2",
    "scikit-learn": "scikit-learn>=1.3,<2",
    "packaging": "packaging",
    "azure-storage-blob": "azure-storage-blob~=12.14",
    "azure-identity": "azure-identity~=1.12",
    "databricks-sdk": "databricks-sdk~=0.20.0",
    "databricks-sql-connector": "databricks-sql-connector>=4.0.0",
    "databricks-sqlalchemy": "databricks-sqlalchemy~=2.0.9",
    "trino": "trino[sqlalchemy]",
    "spacy": "spacy<3.8",
    "looker-sdk": "looker-sdk>=22.20.0,!=24.18.0",
    "lkml": "lkml~=1.3",
    "tableau": "tableauserverclient==0.25",  # higher versions require urllib3>2.0 which conflicts other libs
    "pyhive": "pyhive[hive_pure_sasl]~=0.7",
    "mongo": "pymongo~=4.3",
    "snowflake": "snowflake-sqlalchemy>=1.6.1",
    "snowflake-connector": "snowflake-connector-python~=3.18.0",
    "elasticsearch8": "elasticsearch8~=8.9.0",
    "giturlparse": "giturlparse",
    "validators": "validators~=0.22.0",
    "teradata": "teradatasqlalchemy==20.0.0.2",
    "cockroach": "sqlalchemy-cockroachdb~=2.0",
    "cassandra": "cassandra-driver>=3.28.0",
    "opensearch": "opensearch-py~=2.4.0",
    "starrocks": "pymysql~=1.0",  # same requirement string as the "pymysql" alias
    "google-cloud-bigtable": "google-cloud-bigtable>=2.0.0",
    "google-cloud-pubsub": "google-cloud-pubsub>=2.0.0",
    "pyathena": "pyathena~=3.25.0",
    "s3fs": "s3fs~=2023.12.1",
    "sqlalchemy-bigquery": "sqlalchemy-bigquery>=1.15.0",
    "presidio-analyzer": "presidio-analyzer==2.2.358",
    "asammdf": "asammdf~=7.4.5",
    "kafka-connect": "kafka-connect-py==0.10.11",
    "griffe2md": "griffe2md~=1.2",
    "factory-boy": "factory-boy~=3.3.3",
}
|
|
|
|
# Requirement sets shared by several plugin entries. They are star-unpacked
# (*COMMONS["<name>"]) into the individual plugin definitions below.
COMMONS = {
    "datalake": {
        VERSIONS["asammdf"],
        VERSIONS["avro"],
        VERSIONS["boto3"],
        VERSIONS["ijson"],
        VERSIONS["pandas"],
        VERSIONS["pyarrow"],
        VERSIONS["numpy"],
        # python-snappy does not work well on 3.11 https://github.com/aio-libs/aiokafka/discussions/931
        # Using this as an alternative
        "cramjam~=2.7",
        "fastavro>=1.2.0",
    },
    "hive": {
        "pure-transport==0.2.0",
        "presto-types-parser>=0.0.2",
        VERSIONS["pyhive"],
    },
    "kafka": {
        VERSIONS["avro"],
        "confluent_kafka>=2.1.1,<=2.6.1",
        "fastavro>=1.2.0",
        # Due to https://github.com/grpc/grpc/issues/30843#issuecomment-1303816925
        # use >= v1.47.2 https://github.com/grpc/grpc/blob/v1.47.2/tools/distrib/python/grpcio_tools/grpc_version.py#L17
        VERSIONS["grpc-tools"],  # grpcio-tools already depends on grpcio. No need to add separately
        "protobuf",
    },
    "postgres": {
        VERSIONS["pymysql"],
        "psycopg2-binary",
        VERSIONS["geoalchemy2"],
        VERSIONS["packaging"],
    },  # Adding as Postgres SQL & GreenPlum are using common packages.
}
|
|
|
|
# Maps a driver name to the matching "collate-data-diff[<driver>]" extra.
# data-diff uses different drivers out-of-the-box than OpenMetadata
# the extras are described here:
# https://github.com/open-metadata/collate-data-diff/blob/main/pyproject.toml#L68
# install all data diffs with "pip install collate-data-diff[all-dbs]"
DATA_DIFF = {
    extra: f"collate-data-diff[{extra}]"
    for extra in (
        "clickhouse",
        # "duckdb",  # Not supported by OpenMetadata
        "mssql",
        "mysql",
        "oracle",
        # "postgresql", we dont use this as it installs psycopg2 which interferes with psycopg2-binary
        "presto",
        "redshift",
        "snowflake",
        "trino",
        "vertica",
    )
}
|
|
|
|
# Requirements that are always installed: this set is passed to
# setup(install_requires=...) and is the starting set in filter_requirements().
base_requirements = {
    "antlr4-python3-runtime==4.9.2",
    VERSIONS["azure-identity"],
    "azure-keyvault-secrets",  # Azure Key Vault SM
    VERSIONS["boto3"],  # Required in base for the secrets manager
    "cached-property==1.5.2",  # LineageParser
    "cachetools",  # Used to cache masked queries in ingestion/src/metadata/ingestion/lineage/masker.py
    "chardet==4.0.0",  # Used in the profiler
    "cryptography>=42.0.0",
    "google-cloud-secret-manager==2.24.0",
    "google-crc32c",
    "email-validator>=2.0",  # For the pydantic generated models for Email
    "importlib-metadata>=4.13.0",  # From airflow constraints
    "Jinja2>=2.11.3",
    "jsonpatch<2.0, >=1.24",
    "kubernetes>=21.0.0",  # Kubernetes client for secrets manager
    "memory-profiler",
    "mypy_extensions>=0.4.3",
    VERSIONS["pydantic"],
    VERSIONS["pydantic-settings"],
    VERSIONS["pymysql"],
    "python-dateutil>=2.8.1",
    "python-dotenv>=0.19.0",  # For environment variable support in dbt ingestion
    "PyYAML~=6.0",
    "requests>=2.23",
    "requests-aws4auth~=1.1",  # Only depends on requests as external package. Leaving as base.
    "sqlalchemy>=2.0.0,<3",
    "collate-sqllineage>=2.1.1",
    "tabulate==0.9.0",
    "typing-inspect",
    "packaging",  # For version parsing
    "setuptools>=78.1.1,<81",  # <81 required: pkg_resources removed in setuptools 81+
    "shapely",
    "collate-data-diff>=0.11.9",
    # Floor on dbt-extractor (transitive via collate-data-diff -> dbt-core).
    # Pre-0.5 versions ship no cp310-manylinux_2_17_aarch64 wheel, forcing a
    # Rust/Cargo source build on ARM runners. 0.5+ uses cp38-abi3 wheels.
    "dbt-extractor>=0.5.0",
    "jaraco.functools<4.2.0",  # above 4.2 breaks the build
    "jaraco.context==6.0.1",
    # TODO: Remove one once we have updated datadiff version
    VERSIONS["snowflake-connector"],
    "mysql-connector-python>=9.1",
    "httpx~=0.28.0",
}
|
|
|
|
# Optional requirement sets, keyed by extra name. Every entry is exposed as a
# pip extra by setup() and is merged into the aggregate extras by
# filter_requirements(). Plugins without extra requirements use set() so that
# every value really is a set, as the annotation states ({} would be a dict).
plugins: Dict[str, Set[str]] = {  # noqa: UP006
    "airflow": {
        "opentelemetry-exporter-otlp==1.37.0",
        "attrs",
        VERSIONS["airflow"],
    },  # Same as ingestion container. For development.
    "amundsen": {VERSIONS["neo4j"]},
    "athena": {VERSIONS["pyathena"]},
    "atlas": set(),
    "azuresql": {VERSIONS["pyodbc"]},
    "azure-sso": {VERSIONS["msal"]},
    "microsoftfabric": {VERSIONS["pyodbc"], VERSIONS["msal"]},
    "microsoftfabricpipeline": {VERSIONS["msal"]},
    "backup": {VERSIONS["boto3"], VERSIONS["azure-identity"], "azure-storage-blob"},
    "googledrive": {
        "google-api-python-client>=2.0.0",
    },
    "bigquery": {
        "google-cloud-datacatalog>=3.6.2",
        "google-cloud-logging",
        VERSIONS["pyarrow"],
        VERSIONS["numpy"],
        VERSIONS["sqlalchemy-bigquery"],
    },
    "bigtable": {
        VERSIONS["google-cloud-bigtable"],
        VERSIONS["pandas"],
        VERSIONS["numpy"],
    },
    "clickhouse": {
        "clickhouse-driver~=0.2",
        "clickhouse-sqlalchemy>=0.3",
        DATA_DIFF["clickhouse"],
    },
    "dagster": {
        "croniter<3",
        VERSIONS["pymysql"],
        "psycopg2-binary",
        VERSIONS["geoalchemy2"],
        "dagster_graphql>=1.8.0",
    },
    "dbt": {
        "google-cloud",
        VERSIONS["boto3"],
        VERSIONS["google-cloud-storage"],
        "collate-dbt-artifacts-parser",
        VERSIONS["azure-storage-blob"],
        VERSIONS["azure-identity"],
    },
    "db2": {"ibm-db-sa~=0.4.1", "ibm-db>=3.2.6"},
    # sqlalchemy-ibmi is pre-installed with --no-deps (SA<2 metadata conflict)
    "db2-ibmi": set(),
    "databricks": {
        VERSIONS["databricks-sqlalchemy"],
        VERSIONS["databricks-sdk"],
        VERSIONS["databricks-sql-connector"],
        "ndg-httpsclient~=0.5.1",
        "pyOpenSSL~=24.1.0",
        "pyasn1~=0.6.0",
    },
    "datalake-azure": {
        VERSIONS["azure-storage-blob"],
        VERSIONS["azure-identity"],
        VERSIONS["adlfs"],
        VERSIONS["aiobotocore"],
        *COMMONS["datalake"],
    },
    "datalake-gcs": {
        VERSIONS["google-cloud-monitoring"],
        VERSIONS["google-cloud-storage"],
        VERSIONS["gcsfs"],
        VERSIONS["aiobotocore"],
        *COMMONS["datalake"],
    },
    "datalake-s3": {
        VERSIONS["s3fs"],
        VERSIONS["aiobotocore"],
        *COMMONS["datalake"],
    },
    "deltalake": {
        "delta-spark>=3.0.0,<4.0.0",
        "deltalake>=0.19.0,<0.20",
        "pyspark==3.5.6",
    },  # TODO: remove pinning to under 0.20 after https://github.com/open-metadata/OpenMetadata/issues/17909
    "deltalake-storage": {"deltalake>=0.19.0,<0.20"},
    "deltalake-spark": {"delta-spark>=3.0.0,<4.0.0", "pyspark==3.5.6"},
    "domo": {VERSIONS["pydomo"]},
    # pydoris-custom declares sqlalchemy<2 but works at runtime with SA 2.0.
    # Pre-installed with --no-deps in Dockerfiles.
    "doris": set(),
    "starrocks": {VERSIONS["pymysql"]},
    "druid": {"pydruid>=0.6.5"},
    "dynamodb": {VERSIONS["boto3"]},
    "elasticsearch": {
        VERSIONS["elasticsearch8"],
    },  # also requires requests-aws4auth which is in base
    "opensearch": {VERSIONS["opensearch"]},
    "exasol": {
        "sqlalchemy_exasol>=6,<7",
        "exasol-integration-test-docker-environment>=6.0.0,<7",
    },
    "glue": {VERSIONS["boto3"]},
    "great-expectations": {VERSIONS["great-expectations"]},
    "great-expectations-1xx": {VERSIONS["great-expectations-1xx"]},
    "greenplum": {*COMMONS["postgres"]},
    "cockroach": {
        VERSIONS["cockroach"],
        "psycopg2-binary",
    },
    "hive": {
        *COMMONS["hive"],
        "thrift>=0.13,<1",
        # Replacing sasl with pure-sasl based on https://github.com/cloudera/python-sasl/issues/30 for py 3.11
        "pure-sasl",
        "thrift-sasl~=0.4",
        "impyla~=0.18.0",
    },
    "iomete": {
        "iomete-sqlalchemy>=1.0.22",
        "adbc-driver-flightsql",
        "adbc-driver-manager",
    },
    "impala": {
        "presto-types-parser>=0.0.2",
        "impyla[kerberos]~=0.18.0",
        "thrift>=0.13,<1",
        "pure-sasl",
        "thrift-sasl~=0.4",
    },
    "kafka": {*COMMONS["kafka"]},
    "kafkaconnect": {VERSIONS["kafka-connect"]},
    "kinesis": {VERSIONS["boto3"]},
    "pubsub": {VERSIONS["google-cloud-pubsub"]},
    "looker": {
        VERSIONS["looker-sdk"],
        VERSIONS["lkml"],
        "gitpython~=3.1.34",
        VERSIONS["giturlparse"],
        "python-liquid",
    },
    "mlflow": {"mlflow-skinny~=3.6.0"},
    "mongo": {VERSIONS["mongo"], VERSIONS["pandas"], VERSIONS["numpy"]},
    "cassandra": {VERSIONS["cassandra"]},
    "couchbase": {"couchbase~=4.1"},
    "mssql": {
        "sqlalchemy-pytds~=0.3",
        DATA_DIFF["mssql"],
    },
    "mssql-odbc": {
        VERSIONS["pyodbc"],
        DATA_DIFF["mssql"],
    },
    "mysql": {
        VERSIONS["pymysql"],
        VERSIONS["cloud-sql-python-connector-pymysql"],
        DATA_DIFF["mysql"],
    },
    "nifi": set(),  # uses requests
    "openlineage": {*COMMONS["kafka"]},
    "oracle": {"cx_Oracle>=8.3.0,<9", "oracledb~=1.2", DATA_DIFF["oracle"]},
    "pgspider": {"psycopg2-binary", "sqlalchemy-pgspider"},
    "pinotdb": {"pinotdb~=5.0"},
    "postgres": {*COMMONS["postgres"]},
    "powerbi": {
        VERSIONS["msal"],
        VERSIONS["boto3"],
        VERSIONS["google-cloud-storage"],
        VERSIONS["azure-storage-blob"],
        VERSIONS["azure-identity"],
    },
    "qliksense": {"websocket-client~=1.6.1"},
    "presto": {*COMMONS["hive"], DATA_DIFF["presto"]},
    "pymssql": {"pymssql~=2.3.9"},
    "questdb": {"psycopg2-binary"},
    "quicksight": {VERSIONS["boto3"]},
    "redash": {VERSIONS["packaging"]},
    "redpanda": {*COMMONS["kafka"]},
    "redshift": {
        "sqlalchemy-redshift~=1.0.0",
        "psycopg2-binary",
        VERSIONS["geoalchemy2"],
    },
    "sagemaker": {VERSIONS["boto3"]},
    "salesforce": {"simple_salesforce~=1.11", "authlib>=1.3.1"},
    "sample-data": {
        VERSIONS["avro"],
        VERSIONS["grpc-tools"],
        VERSIONS["sqlalchemy-bigquery"],
        VERSIONS["spacy"],
        VERSIONS["presidio-analyzer"],
    },
    "sap-hana": {"hdbcli", "sqlalchemy-hana"},
    "sas": set(),
    "singlestore": {VERSIONS["pymysql"]},
    "sklearn": {VERSIONS["scikit-learn"]},
    "snowflake": {VERSIONS["snowflake"], DATA_DIFF["snowflake"]},
    "ssrs": {"requests-ntlm"},
    "superset": set(),  # uses requests
    "tableau": {VERSIONS["tableau"], VERSIONS["validators"], VERSIONS["packaging"]},
    "teradata": {VERSIONS["teradata"]},
    "trino": {VERSIONS["trino"], DATA_DIFF["trino"]},
    "vertica": {"sqlalchemy-vertica[vertica-python]>=0.0.5", DATA_DIFF["vertica"]},
    # SDK Data Quality: Required for DataFrame validation (DataFrameValidator)
    # Install with: pip install 'openmetadata-ingestion[pandas]'
    "pandas": {VERSIONS["pandas"], VERSIONS["numpy"]},
    "pyarrow": {VERSIONS["pyarrow"]},
    "pii-processor": {
        VERSIONS["spacy"],
        VERSIONS["pandas"],
        VERSIONS["numpy"],
        VERSIONS["presidio-analyzer"],
    },
    "presidio-analyzer": {VERSIONS["presidio-analyzer"]},
}
|
|
|
|
# Development tooling, combined with everything the "sample-data" plugin needs.
dev = {
    "ruff~=0.15.12",
    "uvloop==0.21.0",
    "datamodel-code-generator==0.25.6",
    "boto3-stubs",
    "mypy-boto3-glue",
    "nox",
    "pre-commit",
    "basedpyright==1.39.3",
    # For publishing
    "twine",
    "build",
} | set(plugins["sample-data"])
|
|
|
|
# Dependencies for unit testing in addition to dev dependencies and plugins
test_unit = {
    "pytest==7.0.1",
    "pytest-cov",
    "pytest-order",
    "pytest-rerunfailures",
    "dirty-equals",
    "faker==37.1.0",  # The version needs to be fixed to prevent flaky tests!
    # TODO: Remove once no unit test requires testcontainers
    "testcontainers",
    VERSIONS["factory-boy"],
}.union(plugins["exasol"], plugins["teradata"])
|
|
|
|
# Requirements for the full test suite: pinned test tooling, individual
# connector pins, and the complete requirement sets of several plugins.
test = {
    # Install Airflow as it's not part of `all` plugin
    "opentelemetry-exporter-otlp==1.37.0",
    VERSIONS["airflow"],
    "boto3-stubs",
    "mypy-boto3-glue",
    "coverage",
    # Install GE because it's not in the `all` plugin
    VERSIONS["great-expectations"],
    "pytest==7.0.1",
    "pytest-cov",
    "pytest-xdist~=3.5",
    "pytest-order",
    "dirty-equals",
    # install dbt dependency
    "collate-dbt-artifacts-parser",
    "freezegun",
    VERSIONS["databricks-sdk"],
    VERSIONS["databricks-sql-connector"],
    VERSIONS["scikit-learn"],
    VERSIONS["pyarrow"],
    VERSIONS["trino"],
    VERSIONS["spacy"],
    VERSIONS["pydomo"],
    VERSIONS["looker-sdk"],
    VERSIONS["lkml"],
    VERSIONS["tableau"],
    VERSIONS["pyhive"],
    VERSIONS["mongo"],
    VERSIONS["cassandra"],
    VERSIONS["snowflake"],
    VERSIONS["elasticsearch8"],
    VERSIONS["giturlparse"],
    VERSIONS["avro"],  # Sample Data
    VERSIONS["grpc-tools"],
    VERSIONS["neo4j"],
    VERSIONS["cockroach"],
    # pydoris-custom pre-installed with --no-deps in Dockerfiles (SA<2 metadata constraint).
    VERSIONS["starrocks"],
    "testcontainers~=4.8.0",
    "minio==7.2.5",
    *plugins["mlflow"],
    *plugins["datalake-s3"],
    *plugins["kafka"],
    "kafka-python==2.0.2",
    *plugins["pii-processor"],
    "requests>=2.31.0,<3",
    DATA_DIFF["mysql"],  # already a plain string; no f-string wrapper needed
    *plugins["deltalake"],
    *plugins["datalake-gcs"],
    *plugins["pgspider"],
    *plugins["clickhouse"],
    *plugins["dagster"],
    *plugins["oracle"],
    *plugins["mssql"],
    VERSIONS["validators"],
    VERSIONS["pyathena"],
    "python-liquid",
    VERSIONS["google-cloud-bigtable"],
    *plugins["bigquery"],
    "faker==37.1.0",  # The version needs to be fixed to prevent flaky tests!
    *plugins["exasol"],  # listed once; a second unpack adds nothing to a set
    VERSIONS["opensearch"],
    VERSIONS["kafka-connect"],
    VERSIONS["factory-boy"],
    "locust~=2.32.0",
    *plugins["teradata"],
}
|
|
|
|
# Requirements of the "docs" extra.
docs = {VERSIONS["griffe2md"]}
|
|
|
|
# playwright dependencies
e2e_test = {"pytest-playwright", "pytest-base-url"}
|
|
|
|
# Define playwright_dependencies as a set of packages required for Playwright tests
# These packages correspond to the ingestion connectors used in Playwright tests
playwright_dependencies = set(e2e_test).union(
    *(
        plugins[connector]
        for connector in (
            "mysql",
            "bigquery",
            "kafka",
            "mlflow",
            "snowflake",
            "superset",
            "postgres",
            "redshift",
            "airflow",
            "datalake-s3",
            "dbt",
            "presidio-analyzer",
            # Add other plugins as needed for Playwright tests
        )
    )
)
|
|
|
|
|
|
def filter_requirements(filtered: Set[str]) -> List[str]:  # noqa: UP006
    """Return the base requirements plus those of every plugin not in ``filtered``.

    Args:
        filtered: Plugin names (keys of ``plugins``) whose requirement sets are
            skipped. A skipped plugin's requirement can still appear in the
            result when ``base_requirements`` or another plugin also lists it.

    Returns:
        The de-duplicated requirement strings in sorted order, so the result
        does not depend on set iteration order.
    """
    kept = (requirements for plugin, requirements in plugins.items() if plugin not in filtered)
    # list(set) ordering can differ between interpreter runs; sorting keeps the
    # generated extras identical from one build to the next.
    return sorted(base_requirements.union(*kept))
|
|
|
|
|
|
# Package definition: base requirements are always installed; every other
# requirement set is exposed as a pip extra. All extras are passed as lists.
setup(
    install_requires=list(base_requirements),
    extras_require={
        "dev": list(dev),
        "test": list(test),
        "test-unit": list(test_unit),
        "e2e_test": list(e2e_test),
        "data-insight": list(plugins["elasticsearch"]),
        # One extra per plugin, named after its key in `plugins`.
        **{plugin: list(dependencies) for (plugin, dependencies) in plugins.items()},
        # FIXME: all-dev-env is a temporary solution to install all dependencies except
        # those that might conflict with each other or cause issues in the dev environment
        # This covers all development cases where none of the plugins are used
        "all-dev-env": filter_requirements({"airflow", "db2", "great-expectations", "pymssql"}),
        # end-of-fixme
        "all": filter_requirements({"airflow", "db2", "great-expectations"}),
        "playwright": list(playwright_dependencies),
        "slim": filter_requirements(
            {
                "airflow",
                "db2",
                "great-expectations",
                "deltalake",
                "deltalake-spark",
                "sklearn",
            }
        ),
        # Converted to a list like every other extra (it was passed as a raw set).
        "docs": list(docs),
    },
)
|