Microsoft Fabric
Azure Fabric source for DataHub metadata ingestion
CLI based Ingestion
Config Details
- Options
- Schema
Note that a `.` is used to denote nested fields in the YAML recipe.
Field | Description |
---|---|
batch_size integer | Default: 100 |
check_timeout_sec integer | Default: 120 |
enable_dataflow_extraction boolean | Default: True |
enable_dataset_discovery boolean | Default: True |
enable_job_extraction boolean | Default: True |
enable_lineage_discovery boolean | Default: True |
enable_profiling boolean | Default: False |
enable_usage_stats boolean | Default: True |
incremental_lineage boolean | Default: True |
platform_instance string | The instance of the platform that all assets produced by this recipe belong to. This should be unique within the platform. See https://datahubproject.io/docs/platform-instances/ for more details. |
profiling_sample_size integer | Default: 1000 |
retry_count integer | Default: 3 |
workspace_name string | Workspace name. Default: "" (empty string) |
workspace_url string | Workspace URL. Default: "" (empty string) |
env string | The environment that all assets produced by this connector belong to. Default: PROD |
azure_config AzureConnectionConfig | Azure configuration |
azure_config.account_key string | Azure storage account access key. |
azure_config.account_name string | Name of the Azure storage account. See the official Microsoft documentation on how to create a storage account (https://docs.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account). |
azure_config.base_path string | Base folder in hierarchical namespaces to start from. Default: / |
azure_config.client_id string | Azure client (Application) ID for service principal auth. |
azure_config.client_secret string | Azure client secret for service principal auth. |
azure_config.container_name string | Azure storage account container name. |
azure_config.sas_token string | Azure storage account SAS token. |
azure_config.tenant_id string | Azure tenant ID required for service principal auth. |
azure_config.use_cli_auth boolean | Whether to authenticate using the Azure CLI. Default: False |
azure_config.use_managed_identity boolean | Whether to use Azure Managed Identity authentication. Default: False |
stateful_ingestion StatefulIngestionConfig | Stateful Ingestion Config |
stateful_ingestion.enabled boolean | Whether or not to enable stateful ingest. Effective default: True if a pipeline_name is set and either a datahub-rest sink or datahub_api is specified, otherwise False. Schema default: False |
The JSONSchema for this configuration is inlined below.
{
"title": "AzureFabricSourceConfig",
"description": "Base configuration class for stateful ingestion for source configs to inherit from.",
"type": "object",
"properties": {
"platform_instance": {
"title": "Platform Instance",
"description": "The instance of the platform that all assets produced by this recipe belong to. This should be unique within the platform. See https://datahubproject.io/docs/platform-instances/ for more details.",
"type": "string"
},
"env": {
"title": "Env",
"description": "The environment that all assets produced by this connector belong to",
"default": "PROD",
"type": "string"
},
"stateful_ingestion": {
"title": "Stateful Ingestion",
"description": "Stateful Ingestion Config",
"allOf": [
{
"$ref": "#/definitions/StatefulIngestionConfig"
}
]
},
"azure_config": {
"title": "Azure Config",
"description": "Azure configuration",
"allOf": [
{
"$ref": "#/definitions/AzureConnectionConfig"
}
]
},
"workspace_url": {
"title": "Workspace Url",
"description": "Workspace URL",
"default": "",
"type": "string"
},
"workspace_name": {
"title": "Workspace Name",
"description": "Workspace name",
"default": "",
"type": "string"
},
"enable_dataset_discovery": {
"title": "Enable Dataset Discovery",
"default": true,
"type": "boolean"
},
"enable_lineage_discovery": {
"title": "Enable Lineage Discovery",
"default": true,
"type": "boolean"
},
"enable_usage_stats": {
"title": "Enable Usage Stats",
"default": true,
"type": "boolean"
},
"enable_profiling": {
"title": "Enable Profiling",
"default": false,
"type": "boolean"
},
"enable_job_extraction": {
"title": "Enable Job Extraction",
"default": true,
"type": "boolean"
},
"enable_dataflow_extraction": {
"title": "Enable Dataflow Extraction",
"default": true,
"type": "boolean"
},
"profiling_sample_size": {
"title": "Profiling Sample Size",
"default": 1000,
"type": "integer"
},
"incremental_lineage": {
"title": "Incremental Lineage",
"default": true,
"type": "boolean"
},
"check_timeout_sec": {
"title": "Check Timeout Sec",
"default": 120,
"type": "integer"
},
"retry_count": {
"title": "Retry Count",
"default": 3,
"type": "integer"
},
"batch_size": {
"title": "Batch Size",
"default": 100,
"type": "integer"
}
},
"additionalProperties": false,
"definitions": {
"DynamicTypedStateProviderConfig": {
"title": "DynamicTypedStateProviderConfig",
"type": "object",
"properties": {
"type": {
"title": "Type",
"description": "The type of the state provider to use. For DataHub use `datahub`",
"type": "string"
},
"config": {
"title": "Config",
"description": "The configuration required for initializing the state provider. Default: The datahub_api config if set at pipeline level. Otherwise, the default DatahubClientConfig. See the defaults (https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19).",
"default": {},
"type": "object"
}
},
"required": [
"type"
],
"additionalProperties": false
},
"StatefulIngestionConfig": {
"title": "StatefulIngestionConfig",
"description": "Basic Stateful Ingestion Specific Configuration for any source.",
"type": "object",
"properties": {
"enabled": {
"title": "Enabled",
"description": "Whether or not to enable stateful ingest. Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",
"default": false,
"type": "boolean"
}
},
"additionalProperties": false
},
"AzureConnectionConfig": {
"title": "AzureConnectionConfig",
"description": "Common Azure credentials config.\n\nhttps://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python",
"type": "object",
"properties": {
"base_path": {
"title": "Base Path",
"description": "Base folder in hierarchical namespaces to start from.",
"default": "/",
"type": "string"
},
"container_name": {
"title": "Container Name",
"description": "Azure storage account container name.",
"type": "string"
},
"account_name": {
"title": "Account Name",
"description": "Name of the Azure storage account. See [Microsoft official documentation on how to create a storage account.](https://docs.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account)",
"type": "string"
},
"use_managed_identity": {
"title": "Use Managed Identity",
"description": "Whether to use Azure Managed Identity authentication.",
"default": false,
"type": "boolean"
},
"use_cli_auth": {
"title": "Use Cli Auth",
"description": "Whether to authenticate using the Azure CLI.",
"default": false,
"type": "boolean"
},
"account_key": {
"title": "Account Key",
"description": "Azure storage account access key.",
"type": "string"
},
"sas_token": {
"title": "Sas Token",
"description": "Azure storage account SAS token.",
"type": "string"
},
"client_id": {
"title": "Client Id",
"description": "Azure client (Application) ID for service principal auth.",
"type": "string"
},
"client_secret": {
"title": "Client Secret",
"description": "Azure client secret for service principal auth.",
"type": "string"
},
"tenant_id": {
"title": "Tenant Id",
"description": "Azure tenant ID required for service principal auth.",
"type": "string"
}
},
"additionalProperties": false
}
}
}
Code Coordinates
- Class Name: `datahub.ingestion.source.ms_fabric.source.AzureFabricSource`
- Browse on GitHub
Questions
If you've got any questions on configuring ingestion for Microsoft Fabric, feel free to ping us on our Slack.
Is this page helpful?