Commit fcbce418 by Venkatesh Seetharam

Added more DSL tests, Docs and resolved RAT issues

parent c1c6510b
......@@ -19,11 +19,11 @@
######### Graph Database Configs #########
# Graph Storage
metadata.graph.storage.backend=berkeleyje
metadata.graph.storage.directory=./data/berkeley
metadata.graph.storage.directory=./target/data/berkeley
# Graph Search Index
metadata.graph.index.search.backend=elasticsearch
metadata.graph.index.search.directory=./data/es
metadata.graph.index.search.directory=./target/data/es
metadata.graph.index.search.elasticsearch.client-only=false
metadata.graph.index.search.elasticsearch.local-mode=true
......
---+ Quick Start Guide
---++ Introduction
This quick start guide describes a simple client that adds a few sample type definitions, modeled
after the example shown below. It also adds example entities along with traits, as shown in the
instance graph below.
---+++ Example Type Definitions
<img src="guide-class-diagram.png"/>
---+++ Example Instance Graph
<img src="guide-instance-graph.png"/>
---++ Running the example
This will add sample types and instances along with traits as shown in the instance graph above.
* bin/quick-start.sh
---++ Dashboard
A simple dashboard with search is available.
* http://localhost:21000/dashboard
---+ Data Governance and Metadata platform for Hadoop
---+ Data Governance and Metadata framework for Hadoop
---++ Overview
DGI is a scalable and extensible set of core foundational governance services – enabling
enterprises to effectively and efficiently meet their compliance requirements within Hadoop and
allowing integration with the whole enterprise data ecosystem.
---++ Use Cases
---++ Features
* Enables modeling
---+++ Data Classification
   * Import or define a taxonomy of business-oriented annotations for data
* Define, annotate, and automate capture of relationships between data sets and underlying
elements including source, target, and derivation processes
* Export metadata to third-party systems
* Captures Lineage information for data sets and processes
---+++ Centralized Auditing
* Capture security access information for every application, process, and interaction with data
* Capture the operational information for execution, steps, and activities
---+++ Search & Lineage (Browse)
* Pre-defined navigation paths to explore the data classification and audit information
   * Text-based search feature locates relevant data and audit events across the Data Lake quickly
   and accurately
* Browse visualization of data set lineage allowing users to drill-down into operational,
security, and provenance related information
---+++ Security & Policy Engine
* Rationalize compliance policy at runtime based on data classification schemes, attributes
and roles.
* Advanced definition of policies for preventing data derivation based on classification
(i.e. re-identification) – Prohibitions
   * Column and Row level masking based on cell values and attributes.
---++ Getting Started
* [[QuickStart][Quick Start Guide]]
---++ Documentation
* [[Architecture][High Level Architecture]]
* [[TypeSystem][Type System]]
* [[Repository][Metadata Repository]]
* [[Discovery][Metadata Discovery]]
* [[Discovery][Search]]
---++ API Documentation
......
......@@ -82,7 +82,7 @@ public class GraphBackedDiscoveryService implements DiscoveryService {
return queryResult.toJson();
}
} catch (Exception e) { // unable to catch ExpressionException
throw new DiscoveryException("Invalid expression : " + dslQuery);
throw new DiscoveryException("Invalid expression : " + dslQuery, e);
}
throw new DiscoveryException("Invalid expression : " + dslQuery);
......
......@@ -36,7 +36,6 @@ import org.apache.hadoop.metadata.typesystem.types.AttributeInfo;
import org.apache.hadoop.metadata.typesystem.types.ClassType;
import org.apache.hadoop.metadata.typesystem.types.DataTypes;
import org.apache.hadoop.metadata.typesystem.types.IDataType;
import org.apache.hadoop.metadata.typesystem.types.Multiplicity;
import org.apache.hadoop.metadata.typesystem.types.StructType;
import org.apache.hadoop.metadata.typesystem.types.TraitType;
import org.slf4j.Logger;
......@@ -162,20 +161,17 @@ public class GraphBackedSearchIndexer implements SearchIndexer {
final String propertyName = typeName + "." + field.name;
switch (field.dataType().getTypeCategory()) {
case PRIMITIVE:
createVertexMixedIndex(propertyName,
getPrimitiveClass(field.dataType()), getCardinality(field.multiplicity));
createVertexMixedIndex(propertyName, getPrimitiveClass(field.dataType()));
break;
case ENUM:
createVertexMixedIndex(
propertyName, Integer.class, getCardinality(field.multiplicity));
createVertexMixedIndex(propertyName, Integer.class);
break;
case ARRAY:
case MAP:
// index the property holder for element names
createVertexMixedIndex(
propertyName, String.class, getCardinality(field.multiplicity));
// todo - how do we overcome this limitation?
// IGNORE: Can only index single-valued property keys on vertices in Mixed Index
break;
case STRUCT:
......@@ -223,6 +219,7 @@ public class GraphBackedSearchIndexer implements SearchIndexer {
throw new IllegalArgumentException("unknown data type " + dataType);
}
/*
private Cardinality getCardinality(Multiplicity multiplicity) {
if (multiplicity == Multiplicity.OPTIONAL || multiplicity == Multiplicity.REQUIRED) {
return Cardinality.SINGLE;
......@@ -235,12 +232,13 @@ public class GraphBackedSearchIndexer implements SearchIndexer {
// todo - default to LIST as this is the most forgiving
return Cardinality.LIST;
}
*/
private void createCompositeAndMixedIndex(String indexName,
String propertyName, Class propertyClass,
boolean isUnique, Cardinality cardinality) {
createCompositeIndex(indexName, propertyName, propertyClass, isUnique, cardinality);
createVertexMixedIndex(propertyName, propertyClass, cardinality);
createVertexMixedIndex(propertyName, propertyClass);
}
private PropertyKey createCompositeIndex(String indexName,
......@@ -272,15 +270,14 @@ public class GraphBackedSearchIndexer implements SearchIndexer {
return propertyKey;
}
private PropertyKey createVertexMixedIndex(String propertyName, Class propertyClass,
Cardinality cardinality) {
private PropertyKey createVertexMixedIndex(String propertyName, Class propertyClass) {
TitanManagement management = titanGraph.getManagementSystem();
PropertyKey propertyKey = management.getPropertyKey(propertyName);
if (propertyKey == null) {
// ignored cardinality as Can only index single-valued property keys on vertices
propertyKey = management
.makePropertyKey(propertyName)
.dataType(propertyClass)
.cardinality(cardinality)
.make();
TitanGraphIndex vertexIndex = management.getGraphIndex(Constants.VERTEX_INDEX);
......
......@@ -216,6 +216,7 @@ public class GraphBackedDiscoveryServiceTest {
{"Table loop (LoadProcess outputTable)"},
{"Table as _loop0 loop (LoadProcess outputTable) withPath"},
{"Table as src loop (LoadProcess outputTable) as dest select src.name as srcTable, dest.name as destTable withPath"},
{"Table as t, sd, Column as c where t.name=\"sales_fact\" select c.name as colName, c.dataType as colType"},
};
}
......
......@@ -22,7 +22,7 @@ metadata.graph.storage.backend=inmemory
# Graph Search Index
metadata.graph.index.search.backend=elasticsearch
metadata.graph.index.search.directory=./data/es
metadata.graph.index.search.directory=./target/data/es
metadata.graph.index.search.elasticsearch.client-only=false
metadata.graph.index.search.elasticsearch.local-mode=true
......
......@@ -21,6 +21,7 @@ package org.apache.hadoop.metadata.examples;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import org.apache.hadoop.metadata.MetadataServiceClient;
import org.apache.hadoop.metadata.typesystem.IStruct;
import org.apache.hadoop.metadata.typesystem.Referenceable;
import org.apache.hadoop.metadata.typesystem.TypesDef;
import org.apache.hadoop.metadata.typesystem.json.InstanceSerialization;
......@@ -40,7 +41,9 @@ import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONObject;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* A driver that sets up sample types and data for testing purposes.
......@@ -116,18 +119,17 @@ public class QuickStart {
attrDef("inputFormat", DataTypes.STRING_TYPE),
attrDef("outputFormat", DataTypes.STRING_TYPE),
attrDef("compressed", DataTypes.STRING_TYPE,
Multiplicity.REQUIRED, false, null)
Multiplicity.REQUIRED, false, null),
new AttributeDefinition("columns",
DataTypes.arrayTypeName(COLUMN_TYPE),
Multiplicity.COLLECTION, true, null)
);
HierarchicalTypeDefinition<ClassType> columnClsDef =
TypesUtil.createClassTypeDef(COLUMN_TYPE, null,
attrDef("name", DataTypes.STRING_TYPE),
attrDef("dataType", DataTypes.STRING_TYPE),
attrDef("comment", DataTypes.STRING_TYPE),
new AttributeDefinition("sd", STORAGE_DESC_TYPE,
Multiplicity.REQUIRED, false, null)
// new AttributeDefinition("table", DataTypes.STRING_TYPE.getName(),
// Multiplicity.REQUIRED, false, null)
attrDef("comment", DataTypes.STRING_TYPE)
);
HierarchicalTypeDefinition<ClassType> tblClsDef =
......@@ -145,13 +147,7 @@ public class QuickStart {
attrDef("viewOriginalText", DataTypes.STRING_TYPE),
attrDef("viewExpandedText", DataTypes.STRING_TYPE),
attrDef("tableType", DataTypes.STRING_TYPE),
attrDef("temporary", DataTypes.BOOLEAN_TYPE),
// todo - fix this post serialization support for collections
new AttributeDefinition("columns",
DataTypes.arrayTypeName(DataTypes.STRING_TYPE.getName()),
Multiplicity.COLLECTION, false, null)
// new AttributeDefinition("columns", DataTypes.arrayTypeName(COLUMN_TYPE),
// Multiplicity.COLLECTION, true, null)
attrDef("temporary", DataTypes.BOOLEAN_TYPE)
);
HierarchicalTypeDefinition<ClassType> loadProcessClsDef =
......@@ -160,15 +156,11 @@ public class QuickStart {
attrDef("userName", DataTypes.STRING_TYPE),
attrDef("startTime", DataTypes.INT_TYPE),
attrDef("endTime", DataTypes.INT_TYPE),
// todo - fix this post serialization support for collections
// new AttributeDefinition("inputTables", DataTypes.arrayTypeName(TABLE_TYPE),
// Multiplicity.COLLECTION, false, null),
// new AttributeDefinition("outputTable", TABLE_TYPE,
// Multiplicity.REQUIRED, false, null),
new AttributeDefinition("inputTables", DataTypes.STRING_TYPE.getName(),
new AttributeDefinition("inputTables",
DataTypes.arrayTypeName(TABLE_TYPE),
Multiplicity.COLLECTION, false, null),
new AttributeDefinition("outputTable", DataTypes.STRING_TYPE.getName(),
Multiplicity.REQUIRED, false, null),
new AttributeDefinition("outputTable", TABLE_TYPE,
Multiplicity.OPTIONAL, false, null),
attrDef("queryText", DataTypes.STRING_TYPE, Multiplicity.REQUIRED),
attrDef("queryPlan", DataTypes.STRING_TYPE, Multiplicity.REQUIRED),
attrDef("queryId", DataTypes.STRING_TYPE, Multiplicity.REQUIRED),
......@@ -180,10 +172,8 @@ public class QuickStart {
attrDef("name", DataTypes.STRING_TYPE),
new AttributeDefinition("db", DATABASE_TYPE,
Multiplicity.REQUIRED, false, null),
// todo - fix this post serialization support for collections
// new AttributeDefinition("inputTables", TABLE_TYPE, Multiplicity.COLLECTION,
// false, null)
new AttributeDefinition("inputTables", DataTypes.STRING_TYPE.getName(),
new AttributeDefinition("inputTables",
DataTypes.arrayTypeName(TABLE_TYPE),
Multiplicity.COLLECTION, false, null)
);
......@@ -234,62 +224,49 @@ public class QuickStart {
Referenceable salesDB = database(
"Sales", "Sales Database", "John ETL", "hdfs://host:8000/apps/warehouse/sales");
Referenceable sd = storageDescriptor("hdfs://host:8000/apps/warehouse/sales",
"TextInputFormat", "TextOutputFormat", true);
Referenceable sd = rawStorageDescriptor("hdfs://host:8000/apps/warehouse/sales",
"TextInputFormat", "TextOutputFormat", true);
ArrayList<Referenceable> salesFactColumns = new ArrayList<>();
Referenceable column = column("time_id", "int", "time id", sd);
salesFactColumns.add(column);
column = column("product_id", "int", "product id", sd);
salesFactColumns.add(column);
column = column("customer_id", "int", "customer id", sd, "PII");
salesFactColumns.add(column);
column = column("sales", "double", "product id", sd, "Metric");
salesFactColumns.add(column);
Referenceable salesFact = table("sales_fact", "sales fact table",
salesFactColumns.add(rawColumn("time_id", "int", "time id"));
salesFactColumns.add(rawColumn("product_id", "int", "product id"));
salesFactColumns.add(rawColumn("customer_id", "int", "customer id", "PII"));
salesFactColumns.add(rawColumn("sales", "double", "product id", "Metric"));
Referenceable salesFact = tableDefinition("sales_fact", "sales fact table",
salesDB, sd, "Joe", "Managed", salesFactColumns, "Fact");
ArrayList<Referenceable> productDimColumns = new ArrayList<>();
column = column("product_id", "int", "product id", sd);
productDimColumns.add(column);
column = column("product_name", "string", "product name", sd);
productDimColumns.add(column);
column = column("brand_name", "int", "brand name", sd);
productDimColumns.add(column);
Referenceable productDim = table("product_dim", "product dimension table",
productDimColumns.add(rawColumn("product_id", "int", "product id"));
productDimColumns.add(rawColumn("product_name", "string", "product name"));
productDimColumns.add(rawColumn("brand_name", "int", "brand name"));
Referenceable productDim = tableDefinition("product_dim", "product dimension table",
salesDB, sd, "John Doe", "Managed", productDimColumns, "Dimension");
ArrayList<Referenceable> timeDimColumns = new ArrayList<>();
column = column("time_id", "int", "time id", sd);
timeDimColumns.add(column);
column = column("dayOfYear", "int", "day Of Year", sd);
timeDimColumns.add(column);
column = column("weekDay", "int", "week Day", sd);
timeDimColumns.add(column);
Referenceable timeDim = table("time_dim", "time dimension table",
timeDimColumns.add(rawColumn("time_id", "int", "time id"));
timeDimColumns.add(rawColumn("dayOfYear", "int", "day Of Year"));
timeDimColumns.add(rawColumn("weekDay", "int", "week Day"));
Referenceable timeDim = tableDefinition("time_dim", "time dimension table",
salesDB, sd, "John Doe", "External", timeDimColumns, "Dimension");
ArrayList<Referenceable> customerDimColumns = new ArrayList<>();
column = column("customer_id", "int", "customer id", sd, "PII");
customerDimColumns.add(column);
column = column("name", "string", "customer name", sd, "PII");
customerDimColumns.add(column);
column = column("address", "string", "customer address", sd, "PII");
customerDimColumns.add(column);
Referenceable customerDim = table("customer_dim", "customer dimension table",
customerDimColumns.add(rawColumn("customer_id", "int", "customer id", "PII"));
customerDimColumns.add(rawColumn("name", "string", "customer name", "PII"));
customerDimColumns.add(rawColumn("address", "string", "customer address", "PII"));
Referenceable customerDim = tableDefinition("customer_dim", "customer dimension table",
salesDB, sd, "fetl", "External", customerDimColumns, "Dimension");
Referenceable reportingDB = database("Reporting", "reporting database", "Jane BI",
"hdfs://host:8000/apps/warehouse/reporting");
Referenceable salesFactDaily = table("sales_fact_daily_mv",
Referenceable salesFactDaily = tableDefinition("sales_fact_daily_mv",
"sales fact daily materialized view", reportingDB, sd,
"Joe BI", "Managed", salesFactColumns, "Metric");
......@@ -307,7 +284,7 @@ public class QuickStart {
ImmutableList.of(customerDim), "Dimension", "JdbcAccess");
System.out.println("added customerDimView = " + customerDimView);
Referenceable salesFactMonthly = table("sales_fact_monthly_mv",
Referenceable salesFactMonthly = tableDefinition("sales_fact_monthly_mv",
"sales fact monthly materialized view",
reportingDB, sd, "Jane BI", "Managed", salesFactColumns, "Metric");
......@@ -328,7 +305,19 @@ public class QuickStart {
System.out.println("created instance for type " + typeName + ", guid: " + guid);
// return the reference to created instance with guid
return new Referenceable(guid, referenceable.getTypeName(), referenceable.getValuesMap());
final ImmutableList<String> traitNames = referenceable.getTraits();
if (traitNames.isEmpty()) {
return new Referenceable(guid, referenceable.getTypeName(),
referenceable.getValuesMap());
} else {
Map<String, IStruct> traits = new HashMap<>();
for (String traitName : traitNames) {
traits.put(traitName, referenceable.getTrait(traitName));
}
return new Referenceable(guid, referenceable.getTypeName(),
referenceable.getValuesMap(), traitNames, traits);
}
}
Referenceable database(String name, String description,
......@@ -344,34 +333,49 @@ public class QuickStart {
return createInstance(referenceable);
}
Referenceable storageDescriptor(String location, String inputFormat,
String outputFormat,
boolean compressed) throws Exception {
Referenceable rawStorageDescriptor(String location, String inputFormat,
String outputFormat,
boolean compressed) throws Exception {
Referenceable referenceable = new Referenceable(STORAGE_DESC_TYPE);
referenceable.set("location", location);
referenceable.set("inputFormat", inputFormat);
referenceable.set("outputFormat", outputFormat);
referenceable.set("compressed", compressed);
return createInstance(referenceable);
return referenceable;
}
Referenceable column(String name, String dataType,
String comment, Referenceable sd,
String... traitNames) throws Exception {
Referenceable rawColumn(String name, String dataType, String comment,
String... traitNames) throws Exception {
Referenceable referenceable = new Referenceable(COLUMN_TYPE, traitNames);
referenceable.set("name", name);
referenceable.set("dataType", dataType);
referenceable.set("comment", comment);
referenceable.set("sd", sd);
return createInstance(referenceable);
return referenceable;
}
Referenceable tableDefinition(String name, String description,
Referenceable db, Referenceable sdReferenceable,
String owner, String tableType,
List<Referenceable> columnsList,
String... traitNames) throws Exception {
List<Referenceable> columns = new ArrayList<>();
for (Referenceable columnReferenceable : columnsList) {
columns.add(createInstance(columnReferenceable));
}
sdReferenceable.set("columns", columns);
Referenceable sd = createInstance(sdReferenceable);
return table(name, description, db, sd, owner, tableType, traitNames);
}
Referenceable table(String name, String description,
Referenceable db, Referenceable sd,
String owner, String tableType,
List<Referenceable> columns,
// List<Referenceable> columns,
String... traitNames) throws Exception {
Referenceable referenceable = new Referenceable(TABLE_TYPE, traitNames);
referenceable.set("name", name);
......@@ -384,14 +388,6 @@ public class QuickStart {
referenceable.set("db", db);
referenceable.set("sd", sd);
// todo - fix this post serialization support for collections
// referenceable.set("columns", columns);
ArrayList<String> columnNames = new ArrayList<>(columns.size());
for (Referenceable column : columns) {
columnNames.add(String.valueOf(column.get("name")));
}
referenceable.set("columns", columnNames);
return createInstance(referenceable);
}
......@@ -407,17 +403,8 @@ public class QuickStart {
referenceable.set("startTime", System.currentTimeMillis());
referenceable.set("endTime", System.currentTimeMillis() + 10000);
// todo - fix this post serialization support for collections
/*
referenceable.set("inputTables", inputTables);
referenceable.set("outputTable", outputTable);
*/
ArrayList<String> inputTableNames = new ArrayList<>(inputTables.size());
for (Referenceable inputTable : inputTables) {
inputTableNames.add(String.valueOf(inputTable.get("name")));
}
referenceable.set("inputTables", inputTableNames);
referenceable.set("outputTable", outputTable.get("name"));
referenceable.set("queryText", queryText);
referenceable.set("queryPlan", queryPlan);
......@@ -434,13 +421,7 @@ public class QuickStart {
referenceable.set("name", name);
referenceable.set("db", db);
// todo - fix this post serialization support for collections
// referenceable.set("inputTables", inputTables);
ArrayList<String> inputTableNames = new ArrayList<>(inputTables.size());
for (Referenceable inputTable : inputTables) {
inputTableNames.add(String.valueOf(inputTable.get("name")));
}
referenceable.set("inputTables", inputTableNames);
referenceable.set("inputTables", inputTables);
return createInstance(referenceable);
}
......@@ -511,6 +492,7 @@ public class QuickStart {
"Table as _loop0 loop (LoadProcess outputTable) withPath",
"Table as src loop (LoadProcess outputTable) as dest select src.name as srcTable, dest.name as destTable withPath",
*/
"Table as t, sd, columns where t.name=\"sales_fact\"",
};
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment