How to add a new Data Quality Metric
Here is an example of how to calculate the accuracy of your data. You can use it as a guide when implementing your own custom data quality metrics.
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using CluedIn.Core;
using CluedIn.Core.Data;
using CluedIn.Core.Data.Parts;
using CluedIn.Core.Metrics;
using CluedIn.Processing.Processors;
namespace Custom.Metrics.Implementations
{
public class AccuracyMetric : PercentageMetric
{
// Resolves the provider that belongs to a provider definition id; used when filtering versions
// and when grouping data entries per provider.
private readonly IMetricProviderResolver providerResolver;
/// <summary>
/// Creates the metric with the resolver it uses to look up providers.
/// </summary>
/// <param name="providerResolver">Stored as-is; no null check is performed here.</param>
public AccuracyMetric(IMetricProviderResolver providerResolver)
{
this.providerResolver = providerResolver;
}
/// <summary>Size reported for a metric value: <c>sizeof(ushort)</c>, i.e. 2.</summary>
public override short ValueSize => sizeof(ushort);
/// <summary>
/// Fixed identifier of this metric. <c>GetDimensions</c> uses it to select the dimensions that belong to this metric.
/// </summary>
// NOTE(review): a custom metric copied from this example presumably needs its own GUID - confirm.
public override Guid Id { get; } = new Guid("{987D2A08-E9A0-4661-BC8A-468E7BB34873}");
/// <summary>Categories this metric is listed under; here only <c>MetricCategories.DataQuality</c>.</summary>
public override string[] Categories { get; } = { MetricCategories.DataQuality };
/// <summary>
/// Calculates the accuracy value of this metric for one dimension.
/// </summary>
/// <remarks>
/// Entity property dimensions are calculated from the unique values found in the entity's version
/// history. Every other dimension is the average of values that are already present in
/// <paramref name="existingMetricValues"/>.
/// </remarks>
/// <param name="context">Processing context; passed on to the provider resolver and the explanation helpers.</param>
/// <param name="dimension">The dimension to calculate a value for.</param>
/// <param name="existingMetricValues">Previously calculated values that aggregated dimensions are averaged from.</param>
/// <param name="entity">The entity being processed; required for entity dimensions, not read for global dimensions.</param>
/// <returns>The value for <paramref name="dimension"/>, with an explanation attached through <c>WithExplanation</c>.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="context"/>, <paramref name="dimension"/> or <paramref name="existingMetricValues"/> is null,
/// or <paramref name="entity"/> is null for an entity dimension.
/// </exception>
/// <exception cref="NotSupportedException">The dimension type is not one of the types handled below.</exception>
protected override PercentageMetricValue CalculatePct(
    MetricsProcessingContext context,
    IMetricDimension dimension,
    IMetricValues<short> existingMetricValues,
    Entity entity)
{
    if (context == null)
    {
        throw new ArgumentNullException(nameof(context));
    }

    if (dimension == null)
    {
        throw new ArgumentNullException(nameof(dimension));
    }

    if (existingMetricValues == null)
    {
        throw new ArgumentNullException(nameof(existingMetricValues));
    }

    /*
     * Dimension Table:
     *
     * | Dimension                           | DimensionType         | DetailType | ProviderDefinitionId | ProviderId | Detail        | Persistence                       |
     * |-------------------------------------|-----------------------|------------|----------------------|------------|---------------|-----------------------------------|
     * | Global                              | Global                |            |                      |            |               |                                   |
     * | Global Property                     | Global                | Property   |                      |            | Property Name |                                   |
     * | Global Provider                     | GlobalIntegrationType |            |                      | Id         |               |                                   |
     * | Global Provider Definition          | GlobalIntegration     |            | Id                   | Id         |               |                                   |
     * |-------------------------------------|-----------------------|------------|----------------------|------------|---------------|-----------------------------------|
     * | Entity                              | Entity                |            |                      |            |               | Blob, Graph, Search, EntityMetric |
     * | Entity Property                     | Entity                | Property   |                      |            | Property Name |                                   |
     * | Entity Provider                     | EntityIntegrationType |            |                      | Id         |               | EntityMetric                      |
     * | Entity Provider Property            | EntityIntegrationType | Property   |                      | Id         | Property Name |                                   |
     * | Entity Provider Definition          | EntityIntegration     |            | Id                   | Id         |               | EntityMetric                      |
     * | Entity Provider Definition Property | EntityIntegration     | Property   | Id                   | Id         | Property Name |                                   |
     */

    if (dimension.DimensionType.HasFlag(MetricDimensionType.Entity))
    {
        if (entity == null)
        {
            throw new ArgumentNullException(nameof(entity));
        }

        var isProperty = dimension.DimensionDetailType == MetricDimensionDetailType.Property;
        var hasProviderDefinition = dimension.ProviderDefinitionId.HasValue;
        var hasProvider = dimension.ProviderId.HasValue;

        // EntityIntegration: both the provider definition and the provider are set.
        if (hasProviderDefinition && hasProvider)
        {
            if (isProperty)
            {
                // Only versions that originate from this provider definition are considered.
                var uniqueValue = this.GetUniqueValuesMap(entity, dimension.DimensionDetail, v => v.DataPart.OriginProviderDefinitionId == dimension.ProviderDefinitionId.Value);
                var average = CalculateMetricValue(uniqueValue);
                return new PercentageMetricValue(dimension, entity.Id, average).WithExplanation(this.GetExplanation(context, uniqueValue));
            }
            else
            {
                var average = (short)existingMetricValues.Average(this, v => v.Dimension.DimensionType == MetricDimensionType.EntityIntegration && v.Dimension.DimensionDetailType == MetricDimensionDetailType.Property);
                return new PercentageMetricValue(dimension, entity.Id, average).WithExplanation(this.GetAggregatedValueExplanation(context, MetricDimensionType.EntityIntegration, MetricDimensionDetailType.Property));
            }
        }

        // EntityIntegrationType: only the provider is set.
        if (hasProvider)
        {
            if (isProperty)
            {
                // Only versions whose provider definition resolves to this provider are considered.
                var uniqueValue = this.GetUniqueValuesMap(entity, dimension.DimensionDetail, v => this.providerResolver.ResolveProvider(context, v.DataPart.OriginProviderDefinitionId)?.Id == dimension.ProviderId.Value);
                var average = CalculateMetricValue(uniqueValue);
                return new PercentageMetricValue(dimension, entity.Id, average).WithExplanation(this.GetExplanation(context, uniqueValue));
            }
            else
            {
                var average = (short)existingMetricValues.Average(this, v => v.Dimension.DimensionType == MetricDimensionType.EntityIntegrationType && v.Dimension.DimensionDetailType == MetricDimensionDetailType.Property);
                return new PercentageMetricValue(dimension, entity.Id, average).WithExplanation(this.GetAggregatedValueExplanation(context, MetricDimensionType.EntityIntegrationType, MetricDimensionDetailType.Property));
            }
        }

        // Entity: no provider information, all versions are considered.
        if (isProperty)
        {
            var uniqueValue = this.GetUniqueValuesMap(entity, dimension.DimensionDetail);
            var average = CalculateMetricValue(uniqueValue);
            return new PercentageMetricValue(dimension, entity.Id, average).WithExplanation(this.GetExplanation(context, uniqueValue));
        }
        else
        {
            var average = (short)existingMetricValues.Average(this, v => v.Dimension.DimensionType == MetricDimensionType.Entity && v.Dimension.DimensionDetailType == MetricDimensionDetailType.Property);
            return new PercentageMetricValue(dimension, entity.Id, average).WithExplanation(this.GetAggregatedValueExplanation(context, MetricDimensionType.Entity, MetricDimensionDetailType.Property));
        }
    }
    else if (dimension.DimensionType.HasFlag(MetricDimensionType.Global))
    {
        // Global values are stored against a date dimension instead of an entity id.
        var dateDimension = MetricDateDimension.Today;
        short average;
        string explanation;

        // Each global dimension type is the average of the corresponding entity dimension type.
        switch (dimension.DimensionType)
        {
            case MetricDimensionType.GlobalIntegration:
                average = (short)existingMetricValues.Average(this, v => v.Dimension.DimensionType == MetricDimensionType.EntityIntegration);
                explanation = this.GetAggregatedValueExplanation(context, MetricDimensionType.EntityIntegration, null);
                break;

            case MetricDimensionType.GlobalIntegrationType:
                average = (short)existingMetricValues.Average(this, v => v.Dimension.DimensionType == MetricDimensionType.EntityIntegrationType);
                explanation = this.GetAggregatedValueExplanation(context, MetricDimensionType.EntityIntegrationType, null);
                break;

            case MetricDimensionType.Global:
                average = (short)existingMetricValues.Average(this, v => v.Dimension.DimensionType == MetricDimensionType.Entity);
                explanation = this.GetAggregatedValueExplanation(context, MetricDimensionType.Entity, null);
                break;

            default:
                throw new NotSupportedException($"Global dimension type '{dimension.DimensionType}' is not supported by {nameof(AccuracyMetric)}.");
        }

        return new PercentageMetricValue(dimension, dateDimension, average).WithExplanation(explanation);
    }

    throw new NotSupportedException($"Dimension type '{dimension.DimensionType}' is not supported by {nameof(AccuracyMetric)}.");
}
/// <summary>
/// Builds the human-readable explanation for a value calculated from <paramref name="uniqueValue"/>:
/// a table of the unique values, the counters of the map and the calculation that was applied.
/// </summary>
/// <param name="context">Processing context whose execution options decide whether an explanation is produced.</param>
/// <param name="uniqueValue">The unique value statistics the metric value was calculated from.</param>
/// <returns>The explanation text, or <c>null</c> when the <c>Explanation</c> execution option is not set.</returns>
private string GetExplanation(MetricsProcessingContext context, UniqueValuesMap uniqueValue)
{
    var explanationRequested = context.MetricsExecutionOptions.HasFlag(MetricsExecutionOption.Explanation);
    if (!explanationRequested)
    {
        return null;
    }

    // Populated values come first, then the highest counts, then the highest counts at head.
    var headers = new[] { "Value", "Count", "CountAtHead" };
    var rows = uniqueValue.Values
        .OrderByDescending(entry => entry.Value != null)
        .ThenByDescending(entry => entry.Count)
        .ThenByDescending(entry => entry.CountAtHead);

    var table = AsciiTableGenerator.GenerateTable(
        headers,
        rows,
        entry => entry.Value ?? "[Missing]",
        entry => entry.Count.ToString(),
        entry => entry.CountAtHead.ToString());

    var metricValue = CalculateMetricValue(uniqueValue).ToString(CultureInfo.InvariantCulture);

    var text = new StringBuilder();
    text.AppendLine(table)
        .AppendLine()
        .AppendLine($"MaxUniqueValueCount: {uniqueValue.MaxUniqueValueCount}")
        .AppendLine($"ValuesCount: {uniqueValue.ValuesCount}")
        .AppendLine($"MissingValuesCount: {uniqueValue.MissingValuesCount}")
        .AppendLine($"BranchCount: {uniqueValue.BranchCount}")
        .AppendLine()
        .AppendLine("Calculation:")
        .AppendLine($"{uniqueValue.MaxUniqueValueCount} / ({uniqueValue.ValuesCount} + ({uniqueValue.MissingValuesCount} / {uniqueValue.BranchCount})) = {metricValue}");

    return text.ToString();
}
/// <summary>
/// Calculates the accuracy ratio for a set of unique value statistics.
/// </summary>
/// <remarks>
/// Formula:
///   [Max Unique Value Count] / ([Sum of value counts] + ([Missing values] / [Branch Count]))
///
///   Max Unique Value Count   v.MaxUniqueValueCount
///   Sum of value counts      v.ValuesCount (count of populated records)
///   Missing values           v.MissingValuesCount (number of records with a missing value)
///   Branch Count             v.BranchCount
///
/// Alternative (not used):
///   Min(1, ([Max Unique Value Count] + ([Count at head] / [Branch Count])) / ([Sum of value counts] + ([Missing values] / [Branch Count])))
/// </remarks>
/// <param name="v">The unique value statistics to calculate the ratio from.</param>
/// <returns>
/// The ratio given by the formula, or <c>0</c> when there are no branches or the denominator is not positive.
/// Without that guard the division would produce NaN or infinity.
/// </returns>
private static double CalculateMetricValue(UniqueValuesMap v)
{
    // No branches means there is nothing to measure; 0 / 0 would otherwise yield NaN.
    if (v.BranchCount <= 0)
    {
        return 0d;
    }

    var missingPerBranch = (double)v.MissingValuesCount / (double)v.BranchCount;
    var denominator = (double)v.ValuesCount + missingPerBranch;
    if (denominator <= 0d)
    {
        return 0d;
    }

    var average = (double)v.MaxUniqueValueCount / denominator;
    // Alternative
    //var average = Math.Min(1d, ((double)v.MaxUniqueValueCount + ((double)v.HeadValuesCount / (double)v.BranchCount)) / ((double)v.ValuesCount + ((double)v.MissingValuesCount / (double)v.BranchCount)));
    return average;
}
/// <summary>
/// Not implemented in this example: every call throws <see cref="NotImplementedException"/>.
/// </summary>
/// <param name="dimension">The dimension to decide persistence for; not read.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
// NOTE(review): the dimensions created by this class carry their own persistence flags, so presumably
// the framework does not call this method for this metric - confirm before reusing the example.
public override bool ShouldPersist(IMetricDimension dimension)
{
throw new NotImplementedException();
}
/// <summary>
/// Yields the global dimensions that correspond to the entity-level dimensions of this metric
/// that already exist in <paramref name="model"/>.
/// </summary>
/// <remarks>
/// The sequence is lazy: the model is only read while the result is enumerated.
/// Each dimension is obtained from <c>GetDefaultGlobalDimension</c>
/// (presumably the matching Global* row of the table below - verify against the base class).
/// </remarks>
/// <param name="context">Processing context passed to <c>GetDefaultGlobalDimension</c>.</param>
/// <param name="model">Metrics model whose <c>MetricDimensions</c> are inspected.</param>
/// <returns>
/// One dimension per existing entity provider definition dimension, one per existing entity provider
/// dimension, and one more when any plain entity dimension exists.
/// </returns>
public override IEnumerable<IMetricDimension> GetDimensions(MetricsProcessingContext context, IMetricsModel model)
{
    // Only dimensions that belong to this metric are relevant.
    var existingDimensions = model.MetricDimensions.Where(d => d.MetricId == this.Id);

    /*
     * Dimension Table:
     *
     * | Dimension                           | DimensionType         | DetailType | ProviderDefinitionId | ProviderId | Detail        | Persistence                       |
     * |-------------------------------------|-----------------------|------------|----------------------|------------|---------------|-----------------------------------|
     * | Global                              | Global                |            |                      |            |               |                                   |
     * | Global Property                     | Global                | Property   |                      |            | Property Name |                                   |
     * | Global Provider                     | GlobalIntegrationType |            |                      | Id         |               |                                   |
     * | Global Provider Definition          | GlobalIntegration     |            | Id                   | Id         |               |                                   |
     * |-------------------------------------|-----------------------|------------|----------------------|------------|---------------|-----------------------------------|
     * | Entity                              | Entity                |            |                      |            |               | Blob, Graph, Search, EntityMetric |
     * | Entity Property                     | Entity                | Property   |                      |            | Property Name |                                   |
     * | Entity Provider                     | EntityIntegrationType |            |                      | Id         |               | EntityMetric                      |
     * | Entity Provider Property            | EntityIntegrationType | Property   |                      | Id         | Property Name |                                   |
     * | Entity Provider Definition          | EntityIntegration     |            | Id                   | Id         |               | EntityMetric                      |
     * | Entity Provider Definition Property | EntityIntegration     | Property   | Id                   | Id         | Property Name |                                   |
     */

    // Entity Provider Definition -> provider definition level global dimension.
    // No Any() guard is needed: foreach over an empty sequence simply yields nothing.
    var entityIntegrationDimensions = existingDimensions.Where(d => d.DimensionType == MetricDimensionType.EntityIntegration && d.DimensionDetailType == MetricDimensionDetailType.None && d.ProviderDefinitionId.HasValue && d.ProviderId.HasValue);
    foreach (var entityDimension in entityIntegrationDimensions)
    {
        yield return this.GetDefaultGlobalDimension(context, entityDimension.ProviderDefinitionId, entityDimension.ProviderId);
    }

    // Entity Provider -> provider level global dimension.
    var entityIntegrationTypeDimensions = existingDimensions.Where(d => d.DimensionType == MetricDimensionType.EntityIntegrationType && d.DimensionDetailType == MetricDimensionDetailType.None && d.ProviderId.HasValue);
    foreach (var entityDimension in entityIntegrationTypeDimensions)
    {
        yield return this.GetDefaultGlobalDimension(context, entityDimension.ProviderId);
    }

    // Entity -> the single global dimension.
    if (existingDimensions.Any(d => d.DimensionType == MetricDimensionType.Entity))
    {
        yield return this.GetDefaultGlobalDimension(context);
    }
}
/// <summary>
/// Yields every entity-level dimension that has to be calculated for <paramref name="entity"/>:
/// per provider definition, per provider and for the entity as a whole.
/// </summary>
/// <remarks>
/// Within each group the property dimensions are yielded first, followed by the aggregating
/// entity-level dimension, which is only yielded when at least one property exists.
/// Data entries without a provider definition id, provider definitions that cannot be found and
/// providers that cannot be resolved are skipped without affecting the remaining groups.
/// </remarks>
/// <param name="context">Processing context used to look up provider definitions and to create dimensions.</param>
/// <param name="entity">The entity whose data entries and properties determine the dimensions.</param>
/// <returns>A lazily evaluated sequence of dimensions.</returns>
public override IEnumerable<IMetricDimension> GetDimensionsToCalculate(MetricsProcessingContext context, Entity entity)
{
    /*
     * Dimension Table:
     *
     * | Dimension                           | DimensionType         | DetailType | ProviderDefinitionId | ProviderId | Detail        | Persistence                       |
     * |-------------------------------------|-----------------------|------------|----------------------|------------|---------------|-----------------------------------|
     * | Global                              | Global                |            |                      |            |               |                                   |
     * | Global Property                     | Global                | Property   |                      |            | Property Name |                                   |
     * | Global Provider                     | GlobalIntegrationType |            |                      | Id         |               |                                   |
     * | Global Provider Definition          | GlobalIntegration     |            | Id                   | Id         |               |                                   |
     * |-------------------------------------|-----------------------|------------|----------------------|------------|---------------|-----------------------------------|
     * | Entity                              | Entity                |            |                      |            |               | Blob, Graph, Search, EntityMetric |
     * | Entity Property                     | Entity                | Property   |                      |            | Property Name |                                   |
     * | Entity Provider                     | EntityIntegrationType |            |                      | Id         |               | EntityMetric                      |
     * | Entity Provider Property            | EntityIntegrationType | Property   |                      | Id         | Property Name |                                   |
     * | Entity Provider Definition          | EntityIntegration     |            | Id                   | Id         |               | EntityMetric                      |
     * | Entity Provider Definition Property | EntityIntegration     | Property   | Id                   | Id         | Property Name |                                   |
     */

    // Provider Definition
    foreach (var group in entity.Details.DataEntries.GroupBy(d => d.OriginProviderDefinitionId))
    {
        if (group.Key == null)
        {
            continue;
        }

        var providerDefinition = context.Organization.Providers.GetProviderDefinition(context, group.Key.Value);
        if (providerDefinition == null)
        {
            // Skip only this group; a single unknown provider definition must not
            // suppress the dimensions of the provider definitions that follow it.
            continue;
        }

        var keys = group.SelectMany(d => d.ProcessedEntityData.Properties.Keys).Distinct().ToList();
        var globalDimension = this.GetDefaultGlobalDimension(context, group.Key, providerDefinition.ProviderId);
        var entityLevelDimension = new MetricDimension(context, this, MetricDimensionType.EntityIntegration, MetricDimensionDetailType.None, providerDefinitionId: group.Key, providerId: providerDefinition.ProviderId, persistence: MetricDimensionPersistence.EntityMetric) { ParentDimension = globalDimension };

        foreach (var key in keys)
        {
            yield return new MetricDimension(context, this, entityLevelDimension, MetricDimensionType.EntityIntegration, MetricDimensionDetailType.Property, providerDefinitionId: group.Key, providerId: providerDefinition.ProviderId, dimensionDetail: key);
        }

        if (keys.Count > 0)
        {
            yield return entityLevelDimension;
        }
    }

    // Provider
    foreach (var group in entity.Details.DataEntries.Where(d => d.OriginProviderDefinitionId.HasValue)
        .GroupBy(d => this.providerResolver.ResolveProvider(context, d.OriginProviderDefinitionId.Value)))
    {
        if (group.Key == null)
        {
            continue;
        }

        var keys = group.SelectMany(d => d.ProcessedEntityData.Properties.Keys).Distinct().ToList();
        var globalDimension = this.GetDefaultGlobalDimension(context, group.Key.Id);
        var entityLevelDimension = new MetricDimension(context, this, MetricDimensionType.EntityIntegrationType, MetricDimensionDetailType.None, providerId: group.Key.Id, persistence: MetricDimensionPersistence.EntityMetric) { ParentDimension = globalDimension };

        foreach (var key in keys)
        {
            yield return new MetricDimension(context, this, entityLevelDimension, MetricDimensionType.EntityIntegrationType, MetricDimensionDetailType.Property, providerId: group.Key.Id, dimensionDetail: key);
        }

        if (keys.Count > 0)
        {
            yield return entityLevelDimension;
        }
    }

    // Entity Property
    {
        var globalDimension = this.GetDefaultGlobalDimension(context);
        var entityLevelDimension = new MetricDimension(context, this, MetricDimensionType.Entity, MetricDimensionDetailType.None, persistence: MetricDimensionPersistence.Blob | MetricDimensionPersistence.Graph | MetricDimensionPersistence.Search | MetricDimensionPersistence.EntityMetric) { ParentDimension = globalDimension };

        foreach (var key in entity.Properties.Keys)
        {
            yield return new MetricDimension(context, this, entityLevelDimension, MetricDimensionType.Entity, MetricDimensionDetailType.Property, dimensionDetail: key, persistence: MetricDimensionPersistence.None);
        }

        // Entity
        if (entity.Properties.Any())
        {
            yield return entityLevelDimension;
        }
    }
}
/// <summary>
/// Collects the unique values of one property across the branches of the entity's version history.
/// </summary>
/// <param name="entity">The entity whose version history is inspected.</param>
/// <param name="propertyName">Name of the property whose values are collected.</param>
/// <param name="versionFilter">
/// Optional predicate selecting the versions of a branch to include. Branches in which no version
/// matches are left out entirely. When <c>null</c>, all versions are included.
/// </param>
/// <returns>The unique value statistics built by <see cref="GetUniqueValues"/>.</returns>
private UniqueValuesMap GetUniqueValuesMap(Entity entity, string propertyName, Func<IVersionPart, bool> versionFilter = null)
{
    // The change history is (re)created when the number of versions differs from the number of
    // data entries, or when any version has no data part.
    var needsChangeHistory =
        entity.Details.VersionHistory.Versions.Count != entity.Details.DataEntries.Count
        || entity.Details.VersionHistory.Versions.Any(version => version.DataPart == null);
    if (needsChangeHistory)
    {
        VersionHistoryProcessing.CreateChangeHistory(entity);
    }

    var merged = new List<MergedBranch>();
    foreach (var branchId in entity.Details.VersionHistory.Branches)
    {
        var unfiltered = entity.Details.VersionHistory.GetBranch(branchId);
        var selected = versionFilter == null ? unfiltered : unfiltered.Where(versionFilter);

        if (selected.Any())
        {
            // The merge receives both the selected versions and the complete branch.
            merged.Add(this.MergeDataParts(selected, unfiltered));
        }
    }

    return this.GetUniqueValues(merged, data => data.Properties.GetValue(propertyName));
}
/// <summary>
/// Groups the value that <paramref name="func"/> extracts from each merged branch, once for the
/// merged data and once for the head data, and wraps the result in a <see cref="UniqueValuesMap"/>.
/// </summary>
/// <param name="mergedBranches">The branches to read values from; enumerated more than once.</param>
/// <param name="func">Extracts the value of interest from a branch's data.</param>
/// <returns>The unique value statistics; the branch count is the number of items in <paramref name="mergedBranches"/>.</returns>
public UniqueValuesMap GetUniqueValues(IEnumerable<MergedBranch> mergedBranches, Func<IProcessedEntityMetadata, string> func)
{
    var mergedValueGroups =
        from branch in mergedBranches
        select func(branch.MergedData) into mergedValue
        group mergedValue by mergedValue;

    var headValueGroups =
        from branch in mergedBranches
        select func(branch.HeadData) into headValue
        group headValue by headValue;

    return new UniqueValuesMap(mergedBranches.Count(), mergedValueGroups, headValueGroups);
}
/**********************************************************************************************************
* INNER TYPES
**********************************************************************************************************/
/// <summary>
/// Statistics about the distinct values of one property across branches.
/// </summary>
public struct UniqueValuesMap
{
    /// <summary>
    /// Builds the statistics from grouped values.
    /// </summary>
    /// <param name="branchCount">Number of branches the values were taken from.</param>
    /// <param name="groups">Values grouped by value; a <c>null</c> key represents a missing value.</param>
    /// <param name="headGroups">Head values grouped by value; used to fill <c>CountAtHead</c> of each entry.</param>
    public UniqueValuesMap(
        int branchCount,
        IEnumerable<IGrouping<string, string>> groups,
        IEnumerable<IGrouping<string, string>> headGroups)
    {
        var headLookup = headGroups.ToLookup(head => head.Key);

        // One entry per distinct value, paired with how often that value occurs at head.
        var entries = new List<UniqueValueEntry>();
        foreach (var valueGroup in groups)
        {
            int countAtHead;
            if (headLookup.Contains(valueGroup.Key))
            {
                countAtHead = headLookup[valueGroup.Key].Sum(head => head.Count());
            }
            else
            {
                countAtHead = 0;
            }

            entries.Add(new UniqueValueEntry(valueGroup.Key, valueGroup.Count(), countAtHead));
        }

        IEnumerable<UniqueValueEntry> populated = entries.Where(entry => entry.Value != null);
        IEnumerable<UniqueValueEntry> missing = entries.Where(entry => entry.Value == null);

        this.Values = entries;
        this.ValuesCount = populated.Sum(entry => entry.Count);
        this.MissingValuesCount = missing.Sum(entry => entry.Count);
        this.MaxUniqueValueCount = populated.Max(entry => entry.Count, 0);
        this.HeadValuesCount = populated.Max(entry => entry.CountAtHead, 0);
        this.BranchCount = branchCount;
    }

    /// <summary>One entry per distinct value, including the entry for missing (<c>null</c>) values.</summary>
    public ICollection<UniqueValueEntry> Values { get; }

    /// <summary>Highest <c>Count</c> among the non-null values.</summary>
    public int MaxUniqueValueCount { get; }

    /// <summary>Sum of <c>Count</c> over the non-null values.</summary>
    public int ValuesCount { get; }

    /// <summary>Sum of <c>Count</c> over the missing (<c>null</c>) values.</summary>
    public int MissingValuesCount { get; }

    /// <summary>Highest <c>CountAtHead</c> among the non-null values.</summary>
    public int HeadValuesCount { get; }

    /// <summary>The branch count passed to the constructor.</summary>
    public int BranchCount { get; }
}
/// <summary>
/// A distinct value together with how often it occurs overall and at head.
/// </summary>
public struct UniqueValueEntry
{
    /// <summary>Creates an entry from its three components.</summary>
    /// <param name="value">The value; may be <c>null</c>.</param>
    /// <param name="count">Number of occurrences of the value.</param>
    /// <param name="countAtHead">Number of occurrences of the value at head.</param>
    public UniqueValueEntry(string value, int count, int countAtHead)
    {
        Value = value;
        Count = count;
        CountAtHead = countAtHead;
    }

    /// <summary>The value this entry describes.</summary>
    public string Value { get; }

    /// <summary>Number of occurrences of <see cref="Value"/>.</summary>
    public int Count { get; }

    /// <summary>Number of occurrences of <see cref="Value"/> at head.</summary>
    public int CountAtHead { get; }
}
}
}