Migrate Databricks from sqlalchemy-databricks to databricks-sqlalchemy (#26896)
Some checks are pending
Integration Tests - MySQL + Elasticsearch / Detect Changes (push) Waiting to run
Integration Tests - MySQL + Elasticsearch / integration-tests-mysql-elasticsearch (push) Blocked by required conditions
Integration Tests - PostgreSQL + OpenSearch / Detect Changes (push) Waiting to run
Maven Collate Tests / maven-collate-ci (push) Waiting to run
Integration Tests - PostgreSQL + OpenSearch / integration-tests-postgres-opensearch (push) Blocked by required conditions
Java Checkstyle / java-checkstyle (push) Waiting to run
OpenMetadata Service Unit Tests / Detect Changes (push) Waiting to run
OpenMetadata Service Unit Tests / openmetadata-service-unit-tests (mysql) (push) Blocked by required conditions
OpenMetadata Service Unit Tests / openmetadata-service-unit-tests (postgresql) (push) Blocked by required conditions
OpenMetadata Service Unit Tests / k8s_operator-unit-tests (push) Blocked by required conditions
OpenMetadata Service Unit Tests / openmetadata-service-unit-tests-status (push) Blocked by required conditions
Publish Package to Maven Central Repository / publish-maven-packages (push) Waiting to run

* Update Databricks Dependency to databricks-sqlalchemy

* Update generated TypeScript types

* address comments and pyformat

* pyformat

* fix log filtering

* address comments

* fix static unit tests

* fix rule for static type

* pyformat

* update baseline

* revert basepyright changes

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Aniket Katkar <aniketkatkar97@gmail.com>
This commit is contained in:
Mayur Singal 2026-05-04 18:53:24 +05:30 committed by GitHub
parent 8cec97b52c
commit 60a2e6546e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 494 additions and 86 deletions

View file

@ -84,7 +84,7 @@ runs:
source env/bin/activate
uv pip install "setuptools<81"
uv pip install --no-build-isolation "cx_Oracle>=8.3.0,<9"
uv pip install --no-deps "sqlalchemy-databricks==0.2.0" "sqlalchemy-ibmi==0.9.3" "pydoris-custom==1.1.0"
uv pip install --no-deps "sqlalchemy-redshift==0.8.14" "sqlalchemy-ibmi==0.9.3" "pydoris-custom==1.1.0"
uv pip install "${{ github.workspace }}/ingestion[all]"
uv pip install "${{ github.workspace }}/ingestion[test]"
uv pip install nox

View file

@ -130,6 +130,12 @@ FROM user_entity ue, role_entity re
WHERE ue.name = 'mcpapplicationbot'
AND re.name = 'ApplicationBotImpersonationRole';
-- Update Databricks and Unity Catalog connection schemes from 'databricks+connector' to 'databricks'
-- as part of migration from sqlalchemy-databricks to databricks-sqlalchemy package
UPDATE dbservice_entity
SET json = JSON_SET(json, '$.connection.config.scheme', 'databricks')
WHERE serviceType IN ('Databricks', 'UnityCatalog')
AND JSON_UNQUOTE(JSON_EXTRACT(json, '$.connection.config.scheme')) = 'databricks+connector';
UPDATE entity_extension
SET json = JSON_SET(

View file

@ -151,6 +151,13 @@ WHERE ue.name = 'mcpapplicationbot'
AND re.name = 'ApplicationBotImpersonationRole'
ON CONFLICT DO NOTHING;
-- Update Databricks and Unity Catalog connection schemes from 'databricks+connector' to 'databricks'
-- as part of migration from sqlalchemy-databricks to databricks-sqlalchemy package
UPDATE dbservice_entity
SET json = jsonb_set(json, '{connection,config,scheme}', '"databricks"')
WHERE serviceType IN ('Databricks', 'UnityCatalog')
AND json #>> '{connection,config,scheme}' = 'databricks+connector';
-- Migrate profiler sampling config: move flat profileSample/profileSampleType/samplingMethodType
-- into the new profileSampleConfig structure. Default to STATIC since DYNAMIC is new.

View file

@ -108,7 +108,7 @@ WORKDIR /home/airflow/ingestion
# Pre-install dialect packages that declare SQLAlchemy<2 in their metadata
# but work fine at runtime with SQLAlchemy 2.0 (unmaintained packages).
RUN pip install --no-deps "sqlalchemy-databricks==0.2.0" "sqlalchemy-ibmi==0.9.3" "pydoris-custom==1.1.0"
RUN pip install --no-deps "sqlalchemy-redshift==0.8.14" "sqlalchemy-ibmi==0.9.3" "pydoris-custom==1.1.0"
RUN pip install "datamodel-code-generator==0.25.6"
RUN mkdir -p /home/airflow/ingestion/src/metadata/generated
RUN python /home/airflow/scripts/datamodel_generation.py

View file

@ -109,7 +109,7 @@ RUN pip install --no-build-isolation "cx_Oracle>=8.3.0,<9"
ARG INGESTION_DEPENDENCY="all"
# Pre-install dialect packages that declare SQLAlchemy<2 in their metadata
# but work fine at runtime with SQLAlchemy 2.0 (unmaintained packages).
RUN pip install --no-deps "sqlalchemy-databricks==0.2.0" "sqlalchemy-ibmi==0.9.3" "pydoris-custom==1.1.0"
RUN pip install --no-deps "sqlalchemy-redshift==0.8.14" "sqlalchemy-ibmi==0.9.3" "pydoris-custom==1.1.0"
RUN pip install "datamodel-code-generator==0.25.6"
RUN mkdir -p /ingestion/src/metadata/generated
RUN python /scripts/datamodel_generation.py

View file

@ -48,7 +48,8 @@ VERSIONS = {
"azure-storage-blob": "azure-storage-blob~=12.14",
"azure-identity": "azure-identity~=1.12",
"databricks-sdk": "databricks-sdk~=0.20.0",
"databricks-sql-connector": "databricks-sql-connector>=2.0",
"databricks-sql-connector": "databricks-sql-connector>=4.0.0",
"databricks-sqlalchemy": "databricks-sqlalchemy~=2.0.9",
"trino": "trino[sqlalchemy]",
"spacy": "spacy<3.8",
"looker-sdk": "looker-sdk>=22.20.0,!=24.18.0",
@ -235,14 +236,12 @@ plugins: Dict[str, Set[str]] = { # noqa: UP006
# sqlalchemy-ibmi is pre-installed with --no-deps (SA<2 metadata conflict)
},
"databricks": {
# sqlalchemy-databricks is pre-installed with --no-deps (SA<2 metadata conflict)
VERSIONS["databricks-sqlalchemy"],
VERSIONS["databricks-sdk"],
VERSIONS["databricks-sql-connector"],
"ndg-httpsclient~=0.5.1",
"pyOpenSSL~=24.1.0",
"pyasn1~=0.6.0",
# databricks has a dependency on pyhive for metadata as well as profiler
VERSIONS["pyhive"],
},
"datalake-azure": {
VERSIONS["azure-storage-blob"],

View file

@ -19,7 +19,7 @@ class DatabricksBaseTableParameter(BaseTableParameter):
if not service_connection_config:
return None
scheme = getattr(service_connection_config, "scheme", "databricks+connector")
scheme = getattr(service_connection_config, "scheme", "databricks")
# Handle enum values properly
if hasattr(scheme, "value"):
scheme = scheme.value

View file

@ -16,6 +16,7 @@ Source connection handler
from copy import deepcopy
from functools import partial
from typing import Optional
from urllib.parse import quote_plus
from sqlalchemy import text
from sqlalchemy.engine import Engine
@ -42,6 +43,9 @@ from metadata.ingestion.connections.test_connections import (
)
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.database.databricks.auth import get_auth_config
from metadata.ingestion.source.database.databricks.log_filters import (
suppress_user_agent_entry_deprecation_log,
)
from metadata.ingestion.source.database.databricks.queries import (
DATABRICKS_GET_CATALOGS,
DATABRICKS_SQL_STATEMENT_TEST,
@ -58,6 +62,8 @@ from metadata.utils.logger import ingestion_logger
logger = ingestion_logger()
suppress_user_agent_entry_deprecation_log()
class DatabricksEngineWrapper:
"""Wrapper to store engine and schemas to avoid multiple calls"""
@ -129,7 +135,11 @@ class DatabricksEngineWrapper:
def get_connection_url(connection: DatabricksConnection) -> str:
return f"{connection.scheme.value}://{connection.hostPort}"
scheme = connection.scheme.value if connection.scheme else "databricks"
url = f"{scheme}://{connection.hostPort}"
if connection.catalog:
url = f"{url}?catalog={quote_plus(connection.catalog)}"
return url
def get_connection(connection: DatabricksConnection) -> Engine:

View file

@ -0,0 +1,43 @@
# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Logging filters for Databricks SQL connector noise.
"""
import logging
# Name of the logger the suppression filter is attached to.
_DATABRICKS_SESSION_LOGGER = "databricks.sql.session"
# Substring that marks a record as the deprecation notice to drop.
_DEPRECATED_PARAM_FRAGMENT = "_user_agent_entry"
# Attribute set on the logger object once the filter has been installed.
_FILTER_INSTALLED_FLAG = "_om_user_agent_entry_filter_installed"


class _UserAgentEntryDeprecationFilter(logging.Filter):
    """Logging filter that rejects records mentioning `_user_agent_entry`."""

    def filter(self, record: logging.LogRecord) -> bool:
        """
        Decide whether a record is kept.

        Returns False when the rendered message contains the deprecated
        parameter fragment, True otherwise.
        """
        try:
            rendered = record.getMessage()
        except Exception:
            # A record whose message cannot be rendered is let through untouched.
            return True
        else:
            return _DEPRECATED_PARAM_FRAGMENT not in rendered


def suppress_user_agent_entry_deprecation_log() -> None:
    """
    Install a filter on the `databricks.sql.session` logger that drops the
    `_user_agent_entry` deprecation message.

    Only a filter is added; the logger level is never touched, so every other
    record keeps flowing according to the user's logging configuration.

    The call is idempotent: a marker attribute on the logger records that the
    filter is already in place, so repeated calls (for example from several
    connector modules at import time) add it only once.
    """
    session_logger = logging.getLogger(_DATABRICKS_SESSION_LOGGER)
    if not getattr(session_logger, _FILTER_INSTALLED_FLAG, False):
        session_logger.addFilter(_UserAgentEntryDeprecationFilter())
        setattr(session_logger, _FILTER_INSTALLED_FLAG, True)

View file

@ -13,18 +13,17 @@
import re
import traceback
from copy import deepcopy
from typing import Iterable, Optional, Tuple, Union # noqa: UP035
from typing import Any, Iterable, Optional, Tuple, Union # noqa: UP035
from pydantic import EmailStr
from pydantic_core import PydanticCustomError
from pyhive.sqlalchemy_hive import _type_map
from sqlalchemy import exc, text, types, util
from sqlalchemy.engine import reflection
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.exc import DatabaseError
from sqlalchemy.sql.sqltypes import String
from sqlalchemy_databricks._dialect import DatabricksDialect
from databricks.sqlalchemy.base import DatabricksDialect
from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema
from metadata.generated.schema.entity.data.table import Column, Table, TableType
@ -105,18 +104,28 @@ class MAP(String):
__visit_name__ = "MAP"
# overriding pyhive.sqlalchemy_hive._type_map
# mapping struct, array & map to custom classes instead of sqltypes.String
_type_map.update(
{
"struct": STRUCT,
"array": ARRAY,
"map": MAP,
"void": create_sqlalchemy_type("VOID"),
"interval": create_sqlalchemy_type("INTERVAL"),
"binary": create_sqlalchemy_type("BINARY"),
}
)
_type_map = {
"boolean": types.Boolean,
"tinyint": types.SmallInteger,
"smallint": types.SmallInteger,
"int": types.Integer,
"bigint": types.BigInteger,
"float": types.Float,
"double": types.Float,
"string": types.String,
"varchar": types.String,
"char": types.String,
"date": types.Date,
"timestamp": types.DateTime,
"decimal": types.Numeric,
"binary": create_sqlalchemy_type("BINARY"),
"struct": STRUCT,
"array": ARRAY,
"map": MAP,
"void": create_sqlalchemy_type("VOID"),
"interval": create_sqlalchemy_type("INTERVAL"),
"uniontype": types.String,
}
# This method is from hive dialect originally but
@ -168,7 +177,7 @@ def _get_column_rows(self, connection, table_name, schema, db_name):
@reflection.cache
def get_columns(self, connection, table_name, schema=None, **kw):
"""
This function overrides the sqlalchemy_databricks._dialect.DatabricksDialect.get_columns
This function overrides the DatabricksDialect.get_columns
to add support for struct, array & map datatype
Extract the Database Name from the keyword arguments parameter if it is present. This
@ -274,7 +283,14 @@ def get_view_names_reflection(self, schema=None, **kw):
return []
def get_view_names(self, connection, schema=None, **kw): # pylint: disable=unused-argument
def get_view_names( # pylint: disable=unused-argument
self: Any,
connection: Any,
schema: str | None = None,
only_materialized: bool = False, # pyright: ignore[reportUnusedParameter]
only_temp: bool = False, # pyright: ignore[reportUnusedParameter]
**kw: Any,
) -> list[str]:
if kw.get("db_name"):
connection.execute(text(f"USE CATALOG {self.identifier_preparer.quote_identifier(kw.get('db_name'))}"))
query = "SHOW VIEWS"
@ -315,7 +331,7 @@ def get_table_comment( # pylint: disable=unused-argument
)
try:
for result in list(cursor):
data = result.values()
data = tuple(result) # pyright: ignore[reportUnknownArgumentType]
if data[0] and data[0].strip() == "Comment":
return {"text": data[1] if data and data[1] else None}
except Exception:
@ -847,7 +863,7 @@ class DatabricksSource(ExternalTableLineageMixin, CommonDbSourceService, MultiDB
schema=schema_name,
)
for result in list(cursor):
data = result.values()
data = tuple(result) # pyright: ignore[reportUnknownArgumentType]
if data[0] and data[0].strip() == "Comment":
description = data[1] if data and data[1] else None
return description # noqa: RET504
@ -874,7 +890,7 @@ class DatabricksSource(ExternalTableLineageMixin, CommonDbSourceService, MultiDB
schema=schema_name,
)
for result in list(cursor):
data = result.values()
data = tuple(result) # pyright: ignore[reportUnknownArgumentType]
if data[0] and data[0].strip() == "Comment":
description = data[1] if data and data[1] else None
elif data[0] and data[0].strip() == "Location":

View file

@ -16,6 +16,7 @@ Source connection handler
from copy import deepcopy
from functools import partial
from typing import Optional
from urllib.parse import quote_plus
from databricks.sdk import WorkspaceClient
from sqlalchemy import text
@ -48,6 +49,9 @@ from metadata.ingestion.connections.builders import (
from metadata.ingestion.connections.test_connections import test_connection_steps
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.database.databricks.auth import get_auth_config
from metadata.ingestion.source.database.databricks.log_filters import (
suppress_user_agent_entry_deprecation_log,
)
from metadata.ingestion.source.database.unitycatalog.models import DatabricksTable
from metadata.ingestion.source.database.unitycatalog.queries import (
UNITY_CATALOG_GET_ALL_SCHEMA_TAGS,
@ -63,10 +67,14 @@ from metadata.utils.logger import ingestion_logger
logger = ingestion_logger()
suppress_user_agent_entry_deprecation_log()
def get_connection_url(connection: UnityCatalogConnection) -> str:
url = f"{connection.scheme.value}://{connection.hostPort}"
return url # noqa: RET504
if connection.catalog:
url = f"{url}?catalog={quote_plus(connection.catalog)}"
return url
def get_connection(connection: UnityCatalogConnection) -> WorkspaceClient:

View file

@ -32,11 +32,16 @@ from metadata.ingestion.connections.builders import (
from metadata.ingestion.connections.test_connections import test_connection_steps
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.source.database.databricks.client import DatabricksClient
from metadata.ingestion.source.database.databricks.log_filters import (
suppress_user_agent_entry_deprecation_log,
)
from metadata.utils.constants import THREE_MIN
suppress_user_agent_entry_deprecation_log()
def get_connection_url(connection: DatabricksPipelineConnection) -> str:
url = f"databricks+connector://token:{connection.token.get_secret_value()}@{connection.hostPort}"
url = f"databricks://token:{connection.token.get_secret_value()}@{connection.hostPort}"
return url # noqa: RET504

View file

@ -88,10 +88,10 @@ class SQAInterfaceMixin(Root):
self.service_connection_config,
(UnityCatalogConnection, DatabricksConnection),
):
session.execute(
text("USE CATALOG :catalog"),
{"catalog": self.service_connection_config.catalog},
).first()
catalog = self.service_connection_config.catalog # pyright: ignore[reportAttributeAccessIssue]
if catalog:
quoted_catalog = session.connection().dialect.identifier_preparer.quote(catalog) # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType]
session.execute(text(f"USE CATALOG {quoted_catalog}")) # pyright: ignore[reportUnknownMemberType]
if isinstance(self.service_connection_config, (MysqlConnection, MariaDBConnection)):
session.execute(

View file

@ -16,8 +16,8 @@ supporting sqlalchemy abstraction layer
from typing import List, Type, cast # noqa: UP035
from pyhive.sqlalchemy_hive import HiveCompiler
from sqlalchemy import Column
from sqlalchemy.sql.compiler import SQLCompiler
from metadata.generated.schema.entity.data.table import Column as OMColumn
from metadata.generated.schema.entity.data.table import (
@ -66,10 +66,7 @@ class DatabricksProfilerInterface(SQAProfilerInterface):
return instance.get_system_metrics()
def visit_column(self, *args, **kwargs):
result = super( # pylint: disable=bad-super-call
HiveCompiler, self
).visit_column(*args, **kwargs)
# Here the databricks uses HiveCompiler.
result = SQLCompiler.visit_column(self, *args, **kwargs) # pyright: ignore[reportArgumentType, reportUnknownArgumentType]
# the `result` here would be `db.schema.table` or `db.schema.table.column`
# for struct it will be `db.schema.table.column.nestedchild.nestedchild` etc
# the logic is to add the backticks to nested children.
@ -83,9 +80,7 @@ class DatabricksProfilerInterface(SQAProfilerInterface):
return result
def visit_table(self, *args, **kwargs):
result = super( # pylint: disable=bad-super-call
HiveCompiler, self
).visit_table(*args, **kwargs)
result = SQLCompiler.visit_table(self, *args, **kwargs) # pyright: ignore[reportArgumentType, reportUnknownMemberType, reportUnknownArgumentType]
# Handle table references with hyphens in database/schema names
# Format: `database`.`schema`.`table` for Unity Catalog/Databricks
if "." in result and not result.startswith("`"):
@ -102,8 +97,31 @@ class DatabricksProfilerInterface(SQAProfilerInterface):
def __init__(self, service_connection_config, **kwargs):
super().__init__(service_connection_config=service_connection_config, **kwargs)
self.set_catalog(self.session)
HiveCompiler.visit_column = DatabricksProfilerInterface.visit_column
HiveCompiler.visit_table = DatabricksProfilerInterface.visit_table
self._patch_databricks_statement_compiler()
@staticmethod
def _patch_databricks_statement_compiler():
"""Override visit_column/visit_table on the Databricks statement compiler.
Resolve the compiler via the public `DatabricksDialect.statement_compiler`
attribute rather than importing from `databricks.sqlalchemy._ddl`, which is a
private module that can move between databricks-sqlalchemy releases. Failures
are logged and swallowed so a packaging change cannot break profiler startup.
"""
try:
from databricks.sqlalchemy.base import DatabricksDialect # noqa: PLC0415
statement_compiler = getattr(DatabricksDialect, "statement_compiler", None)
if statement_compiler is None:
logger.warning("DatabricksDialect.statement_compiler not found; skipping Databricks compiler patches.")
return
statement_compiler.visit_column = DatabricksProfilerInterface.visit_column # pyright: ignore[reportUnknownMemberType]
statement_compiler.visit_table = DatabricksProfilerInterface.visit_table # pyright: ignore[reportUnknownMemberType]
except Exception as exc:
logger.warning(
"Failed to patch Databricks statement compiler: %s. Profiling will continue without struct/hyphen quoting overrides.",
exc,
)
def _get_struct_columns(self, columns: List[OMColumn], parent: str): # noqa: UP006
"""Get struct columns"""
@ -122,8 +140,10 @@ class DatabricksProfilerInterface(SQAProfilerInterface):
table_service_type=DatabaseServiceType.Databricks,
_quote=False,
)
sqa_col._set_parent( # pylint: disable=protected-access
self.table.__table__
sqa_col._set_parent( # pylint: disable=protected-access # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType, reportUnknownVariableType]
self.table.__table__, # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType]
all_names={c.name: c for c in self.table.__table__.columns}, # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
allow_replacements=True,
)
columns_list.append(sqa_col)
else:
@ -139,8 +159,10 @@ class DatabricksProfilerInterface(SQAProfilerInterface):
columns.extend(self._get_struct_columns(column_obj.children, column_obj.name.root))
else:
col = build_orm_col(idx, column_obj, DatabaseServiceType.Databricks)
col._set_parent( # pylint: disable=protected-access
self.table.__table__
col._set_parent( # pylint: disable=protected-access # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType, reportUnknownVariableType]
self.table.__table__, # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType]
all_names={c.name: c for c in self.table.__table__.columns}, # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
allow_replacements=True,
)
columns.append(col)
return columns

View file

@ -35,6 +35,6 @@ class DataBricksProfilerSource(ProfilerSource):
def set_is_disconnect(self):
"""Set the is_disconnect method for the Databricks dialect"""
# pylint: disable=import-outside-toplevel
from databricks.sqlalchemy import DatabricksDialect # noqa: PLC0415
from databricks.sqlalchemy.base import DatabricksDialect # noqa: PLC0415
DatabricksDialect.is_disconnect = is_disconnect

View file

@ -1,16 +1,16 @@
import unittest
from unittest.mock import MagicMock, patch
from pyhive.sqlalchemy_hive import HiveCompiler
from sqlalchemy.sql.compiler import SQLCompiler
from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import (
DatabricksProfilerInterface,
)
class FakeHiveCompiler(
class FakeCompiler(
DatabricksProfilerInterface,
HiveCompiler,
SQLCompiler,
):
def __init__(self, service_connection_config):
self.service_connection_config = service_connection_config
@ -25,14 +25,14 @@ class TestDatabricksProfilerInterface(unittest.TestCase):
"metadata.profiler.interface.sqlalchemy.databricks.profiler_interface.DatabricksProfilerInterface.__init__",
return_value=None,
)
@patch("pyhive.sqlalchemy_hive.HiveCompiler.visit_column")
@patch("sqlalchemy.sql.compiler.SQLCompiler.visit_column")
def setUp(
self,
mock_visit_column,
mock_init,
mock_set_catalog,
) -> None:
self.profiler = FakeHiveCompiler(service_connection_config={})
self.profiler = FakeCompiler(service_connection_config={})
@patch("sqlalchemy.sql.compiler.SQLCompiler.visit_column")
def test_visit_column_no_nesting(self, mock_visit_column_super):

View file

@ -136,9 +136,9 @@ class SourceConnectionTest(TestCase):
get_connection_url,
)
expected_result = "databricks+connector://1.1.1.1:443"
expected_result = "databricks://1.1.1.1:443"
databricks_conn_obj = DatabricksConnection(
scheme=DatabricksScheme.databricks_connector,
scheme=DatabricksScheme.databricks,
hostPort="1.1.1.1:443",
authType=PersonalAccessToken(token="KlivDTACWXKmZVfN1qIM"),
httpPath="/sql/1.0/warehouses/abcdedfg",
@ -150,9 +150,9 @@ class SourceConnectionTest(TestCase):
get_connection_url,
)
expected_result = "databricks+connector://1.1.1.1:443"
expected_result = "databricks://1.1.1.1:443?catalog=main"
databricks_conn_obj = DatabricksConnection(
scheme=DatabricksScheme.databricks_connector,
scheme=DatabricksScheme.databricks,
hostPort="1.1.1.1:443",
authType=DatabricksOauth(
clientId="d40e2905-88ef-42ab-8898-fbefff2d071d",
@ -163,6 +163,99 @@ class SourceConnectionTest(TestCase):
)
assert expected_result == get_connection_url(databricks_conn_obj)
def test_databricks_pipeline_url(self):
from metadata.generated.schema.entity.services.connections.pipeline.databricksPipelineConnection import (
DatabricksPipelineConnection,
)
from metadata.ingestion.source.pipeline.databrickspipeline.connection import (
get_connection_url,
)
conn_obj = DatabricksPipelineConnection(
hostPort="my-workspace.cloud.databricks.com:443",
token="dapi1234567890",
)
url = get_connection_url(conn_obj)
assert url == "databricks://token:dapi1234567890@my-workspace.cloud.databricks.com:443"
assert "databricks+connector" not in url
def test_databricks_url_with_special_chars_in_catalog(self):
from metadata.ingestion.source.database.databricks.connection import (
get_connection_url,
)
databricks_conn_obj = DatabricksConnection(
scheme=DatabricksScheme.databricks,
hostPort="1.1.1.1:443",
authType=PersonalAccessToken(token="KlivDTACWXKmZVfN1qIM"),
httpPath="/sql/1.0/warehouses/abcdedfg",
catalog="my catalog&name=val",
)
url = get_connection_url(databricks_conn_obj)
assert url == "databricks://1.1.1.1:443?catalog=my+catalog%26name%3Dval"
def test_unity_catalog_url_without_catalog(self):
from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import (
DatabricksScheme as UCDatabricksScheme,
)
from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import (
UnityCatalogConnection,
)
from metadata.ingestion.source.database.unitycatalog.connection import (
get_connection_url,
)
conn_obj = UnityCatalogConnection(
scheme=UCDatabricksScheme.databricks,
hostPort="my-workspace.cloud.databricks.com:443",
authType=PersonalAccessToken(token="dapi1234567890"),
httpPath="/sql/1.0/warehouses/abc",
)
url = get_connection_url(conn_obj)
assert url == "databricks://my-workspace.cloud.databricks.com:443"
def test_unity_catalog_url_with_catalog(self):
from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import (
DatabricksScheme as UCDatabricksScheme,
)
from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import (
UnityCatalogConnection,
)
from metadata.ingestion.source.database.unitycatalog.connection import (
get_connection_url,
)
conn_obj = UnityCatalogConnection(
scheme=UCDatabricksScheme.databricks,
hostPort="my-workspace.cloud.databricks.com:443",
authType=PersonalAccessToken(token="dapi1234567890"),
httpPath="/sql/1.0/warehouses/abc",
catalog="production",
)
url = get_connection_url(conn_obj)
assert url == "databricks://my-workspace.cloud.databricks.com:443?catalog=production"
def test_unity_catalog_url_with_special_chars_in_catalog(self):
from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import (
DatabricksScheme as UCDatabricksScheme,
)
from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import (
UnityCatalogConnection,
)
from metadata.ingestion.source.database.unitycatalog.connection import (
get_connection_url,
)
conn_obj = UnityCatalogConnection(
scheme=UCDatabricksScheme.databricks,
hostPort="my-workspace.cloud.databricks.com:443",
authType=PersonalAccessToken(token="dapi1234567890"),
httpPath="/sql/1.0/warehouses/abc",
catalog="my catalog&name=val",
)
url = get_connection_url(conn_obj)
assert url == "databricks://my-workspace.cloud.databricks.com:443?catalog=my+catalog%26name%3Dval"
def test_hive_url(self):
from metadata.ingestion.source.database.hive.connection import (
get_connection_url,

View file

@ -450,21 +450,21 @@ class DatabricksConnectionTest(TestCase):
def test_get_connection_url(self):
"""Test get_connection_url function"""
connection = self.DatabricksConnection(
scheme=self.DatabricksScheme.databricks_connector,
scheme=self.DatabricksScheme.databricks,
hostPort="test-host:443",
authType=PersonalAccessToken(token="test-token"),
httpPath="/sql/1.0/warehouses/test",
)
url = self.get_connection_url(connection)
expected_url = "databricks+connector://test-host:443"
expected_url = "databricks://test-host:443"
self.assertEqual(url, expected_url)
@patch("metadata.ingestion.source.database.databricks.connection.create_generic_db_connection")
def test_get_connection(self, mock_create_connection):
"""Test get_connection function"""
connection = self.DatabricksConnection(
scheme=self.DatabricksScheme.databricks_connector,
scheme=self.DatabricksScheme.databricks,
hostPort="test-host:443",
authType=PersonalAccessToken(token="test-token"),
httpPath="/sql/1.0/warehouses/test",
@ -788,7 +788,7 @@ class DatabricksConnectionTest(TestCase):
# Create test connection
service_connection = DatabricksConnection(
scheme=DatabricksScheme.databricks_connector,
scheme=DatabricksScheme.databricks,
hostPort="test-host:443",
authType=PersonalAccessToken(token="test-token"),
httpPath="/sql/1.0/warehouses/test",

View file

@ -0,0 +1,87 @@
# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for the databricks.sql.session log filter helper.
"""
import logging
import pytest
from metadata.ingestion.source.database.databricks import log_filters
from metadata.ingestion.source.database.databricks.log_filters import (
suppress_user_agent_entry_deprecation_log,
)
DATABRICKS_SESSION_LOGGER = "databricks.sql.session"


@pytest.fixture
def clean_logger():
    """
    Yield the `databricks.sql.session` logger with no filters and without the
    installed-flag attribute, then put back its previous filters, level and
    flag state once the test is done.
    """
    flag_name = log_filters._FILTER_INSTALLED_FLAG
    session_logger = logging.getLogger(DATABRICKS_SESSION_LOGGER)

    # Snapshot everything the tests may touch.
    saved_filters = list(session_logger.filters)
    saved_level = session_logger.level
    flag_was_set = hasattr(session_logger, flag_name)
    saved_flag = getattr(session_logger, flag_name, None)

    # Start every test from a pristine logger.
    session_logger.filters = []
    if flag_was_set:
        delattr(session_logger, flag_name)

    yield session_logger

    # Restore the snapshot.
    session_logger.filters = saved_filters
    session_logger.setLevel(saved_level)
    if flag_was_set:
        setattr(session_logger, flag_name, saved_flag)
    elif hasattr(session_logger, flag_name):
        # The flag did not exist before the test; remove the one it created.
        delattr(session_logger, flag_name)
def _emit(logger: logging.Logger, message: str) -> logging.LogRecord:
return logger.makeRecord(logger.name, logging.WARNING, __file__, 0, message, None, None)
def test_filters_user_agent_entry_message(clean_logger):
    """Every filter on the logger must reject the `_user_agent_entry` deprecation record."""
    suppress_user_agent_entry_deprecation_log()
    deprecation_text = "Parameter '_user_agent_entry' is deprecated, use 'user_agent_entry' instead"
    record = _emit(clean_logger, deprecation_text)

    installed_filters = clean_logger.filters
    assert installed_filters, "Expected the suppression filter to be installed"

    verdicts = [log_filter.filter(record) for log_filter in installed_filters]
    assert all(verdict is False for verdict in verdicts)
def test_unrelated_warning_passes_through(clean_logger):
    """A record that does not mention the deprecated parameter is kept by every filter."""
    suppress_user_agent_entry_deprecation_log()
    unrelated_record = _emit(clean_logger, "Connection retry: attempt 2 of 3")

    verdicts = [log_filter.filter(unrelated_record) for log_filter in clean_logger.filters]
    assert all(verdict is True for verdict in verdicts)
def test_logger_level_is_not_modified(clean_logger):
    """Installing the filter leaves the logger's configured level as it was."""
    level_before = logging.DEBUG
    clean_logger.setLevel(level_before)

    suppress_user_agent_entry_deprecation_log()

    level_after = clean_logger.level
    assert level_after == logging.DEBUG
def test_helper_is_idempotent(clean_logger):
    """Calling the helper three times still leaves exactly one filter on the logger."""
    for _ in range(3):
        suppress_user_agent_entry_deprecation_log()

    filter_count = len(clean_logger.filters)
    assert filter_count == 1

View file

@ -0,0 +1,112 @@
# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for databricks-sqlalchemy migration changes
"""
from metadata.generated.schema.entity.services.connections.database.databricksConnection import (
DatabricksScheme,
)
from metadata.ingestion.source.database.databricks.metadata import (
ARRAY,
MAP,
STRUCT,
_type_map,
)
class TestDatabricksScheme:
    """Checks the value carried by the `DatabricksScheme.databricks` enum member."""

    def test_scheme_value(self):
        """The `databricks` member must serialize to the plain `databricks` scheme string."""
        scheme_value = DatabricksScheme.databricks.value
        assert scheme_value == "databricks"
# Type names that TestTypeMap.test_all_expected_types_present requires to be
# present as keys of ``_type_map``.
EXPECTED_TYPES = [
    "boolean",
    "tinyint",
    "smallint",
    "int",
    "bigint",
    "float",
    "double",
    "string",
    "varchar",
    "char",
    "date",
    "timestamp",
    "decimal",
    "binary",
    # The next three are also checked against the STRUCT / ARRAY / MAP classes
    # in test_complex_types_are_custom.
    "struct",
    "array",
    "map",
    "void",
    "interval",
    "uniontype",
]
class TestTypeMap:
    """Checks the contents of the ``_type_map`` imported from the Databricks metadata module."""

    def test_all_expected_types_present(self):
        """Every name listed in ``EXPECTED_TYPES`` must be a key of ``_type_map``."""
        for type_name in EXPECTED_TYPES:
            is_mapped = type_name in _type_map
            assert is_mapped, f"Missing type '{type_name}' in _type_map"

    def test_complex_types_are_custom(self):
        """``struct``, ``array`` and ``map`` must map to the imported STRUCT/ARRAY/MAP objects."""
        expected_bindings = (
            ("struct", STRUCT),
            ("array", ARRAY),
            ("map", MAP),
        )
        for key, expected_cls in expected_bindings:
            assert _type_map[key] is expected_cls

    def test_all_values_are_types(self):
        """Each value in ``_type_map`` must be a class or, failing that, a callable."""
        for type_name, type_cls in _type_map.items():
            is_class = isinstance(type_cls, type)
            assert is_class or callable(type_cls), (
                f"_type_map['{type_name}'] is not a type or callable: {type_cls}"
            )
class TestDatabricksBaseDefaultScheme:
    """Checks what ``DatabricksBaseTableParameter`` builds from a minimal config stub."""

    def test_default_scheme(self):
        """The built config must exist and must not contain the old ``databricks+connector`` scheme."""
        from metadata.ingestion.source.database.common.data_diff.databricks_base import (
            DatabricksBaseTableParameter,
        )

        class FakeConfig:
            # Minimal stand-in exposing only the two attributes set here.
            hostPort = "host:443"  # noqa: N815
            token = "secret"

        stub_config = FakeConfig()
        connection_config = DatabricksBaseTableParameter._get_service_connection_config(
            stub_config
        )

        assert connection_config is not None
        assert "databricks+connector" not in connection_config
class TestDatabricksPipelineConnectionUrl:
    """Checks the URL produced by the Databricks pipeline ``get_connection_url`` helper."""

    def test_url_scheme(self):
        """The URL must start with ``databricks://``, omit the old scheme and contain the token."""
        from metadata.generated.schema.entity.services.connections.pipeline.databricksPipelineConnection import (
            DatabricksPipelineConnection,
        )
        from metadata.ingestion.source.pipeline.databrickspipeline.connection import (
            get_connection_url,
        )

        pipeline_connection = DatabricksPipelineConnection(
            hostPort="workspace.cloud.databricks.com:443",
            token="dapi123",
        )
        connection_url = get_connection_url(pipeline_connection)

        assert connection_url.startswith("databricks://")
        assert "databricks+connector" not in connection_url
        assert "dapi123" in connection_url

View file

@ -17,9 +17,9 @@
"description": "SQLAlchemy driver scheme options.",
"type": "string",
"enum": [
"databricks+connector"
"databricks"
],
"default": "databricks+connector"
"default": "databricks"
}
},
"properties": {
@ -33,7 +33,7 @@
"title": "Connection Scheme",
"description": "SQLAlchemy driver scheme options.",
"$ref": "#/definitions/databricksScheme",
"default": "databricks+connector"
"default": "databricks"
},
"hostPort": {
"title": "Host and Port",

View file

@ -18,9 +18,9 @@
"description": "SQLAlchemy driver scheme options.",
"type": "string",
"enum": [
"databricks+connector"
"databricks"
],
"default": "databricks+connector"
"default": "databricks"
}
},
"properties": {
@ -34,7 +34,7 @@
"title": "Connection Scheme",
"description": "SQLAlchemy driver scheme options.",
"$ref": "#/definitions/databricksScheme",
"default": "databricks+connector"
"default": "databricks"
},
"hostPort": {
"title": "Host and Port",

View file

@ -18,9 +18,9 @@
"description": "SQLAlchemy driver scheme options.",
"type": "string",
"enum": [
"databricks+connector"
"databricks"
],
"default": "databricks+connector"
"default": "databricks"
}
},
"properties": {
@ -34,7 +34,7 @@
"title": "Connection Scheme",
"description": "SQLAlchemy driver scheme options.",
"$ref": "#/definitions/databricksScheme",
"default": "databricks+connector"
"default": "databricks"
},
"hostPort": {
"title": "Host and Port",

View file

@ -4688,7 +4688,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -2205,7 +2205,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -7276,7 +7276,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -391,7 +391,7 @@ export interface GCPImpersonateServiceAccountValues {
*/
export enum Scheme {
Bigquery = "bigquery",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Snowflake = "snowflake",
Trino = "trino",
}

View file

@ -101,7 +101,7 @@ export interface AuthenticationType {
* SQLAlchemy driver scheme options.
*/
export enum DatabricksScheme {
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
}
/**

View file

@ -4570,7 +4570,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -5069,7 +5069,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -229,7 +229,7 @@ export interface AwsCredentials {
* SQLAlchemy driver scheme options.
*/
export enum DatabricksScheme {
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
}
/**

View file

@ -225,7 +225,7 @@ export interface AwsCredentials {
* SQLAlchemy driver scheme options.
*/
export enum DatabricksScheme {
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
}
/**

View file

@ -4610,7 +4610,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -2336,7 +2336,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -7789,7 +7789,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -4654,7 +4654,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",

View file

@ -4727,7 +4727,7 @@ export enum ConfigScheme {
ClickhouseNative = "clickhouse+native",
CockroachdbPsycopg2 = "cockroachdb+psycopg2",
Couchbase = "couchbase",
DatabricksConnector = "databricks+connector",
Databricks = "databricks",
Db2IBMDB = "db2+ibm_db",
Doris = "doris",
Druid = "druid",