Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,22 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/ma
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
</execution>
</executions>
<configuration>
<finalName>uber-${artifactId}-${version}</finalName>
</configuration>
</plugin>

</plugins>
</build>

Expand Down
7 changes: 3 additions & 4 deletions resource/dava.properties
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@ mapreduce.reduce.maxattempts=2

#ValidationChecker
vac.filter.invalid.records=false
vac.invalid.data.file.path=/user/pranab/output/dava/invalid.txt
vac.schema.file.path=/user/pranab/meta/dava/electr_prod.json
vac.invalid.data.file.path=/usr/avi/chombo/invalid.txt
vac.validation.schema.file.path=/usr/avi/chombo/meta/dava/electr_prod.json
#vac.cleanser.schema.file.path=
vac.validator.0=notMissing
vac.validator.1=membership,notMissing
vac.validator.2=membership,notMissing
vac.validator.3=exactLength,notMissing
vac.validator.4=min,max,notMissing
vac.validator.5=min,max,notMissing


8 changes: 4 additions & 4 deletions resource/dava.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ JAR_NAME=/home/pranab/Projects/chombo/target/chombo-1.0.jar
CLASS_NAME=org.chombo.mr.ValidationChecker

echo "running mr"
IN_PATH=/user/pranab/dava/input
OUT_PATH=/user/pranab/dava/output
IN_PATH=/usr/avi/chombo/input/dava
OUT_PATH=/usr/avi/chombo/output/dava
echo "input $IN_PATH output $OUT_PATH"
hadoop fs -rmr $OUT_PATH
echo "removed output dir"
hadoop fs -rm /user/pranab/output/dava/*
hadoop fs -rm /usr/avi/chombo/output/dava/*
echo "removed invalid data file"

hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/pranab/Projects/bin/chombo/dava.properties $IN_PATH $OUT_PATH
hadoop jar $JAR_NAME $CLASS_NAME -Dconf.path=/home/ec2-user/pranab/chombo/resource/dava.properties $IN_PATH $OUT_PATH
26 changes: 26 additions & 0 deletions resource/dava.spark.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#system.master=local[*]
system.master=spark://172.31.8.69:7077
filter.invalid.records=true
output.invalid.records=true
invalid.records.output.file=hdfs://localhost:9000/usr/avi/chombo/invalid.txt
field.delim.in=,
field.delim.out=,
val.tag.separator=,
schema.file.path=/usr/avi/chombo/meta/dava/electr_prod.json
config.file.path=/usr/avi/chombo/meta/dava/electr_prod.conf

field.delim.regex=,
debug.on=true
num.reducer=1
mapreduce.map.maxattempts=2
mapreduce.reduce.maxattempts=2

#ValidationChecker
#vac.cleanser.schema.file.path=
validator.0=notMissing
validator.1=membership,notMissing
validator.2=membership,notMissing
validator.3=exactLength,notMissing
validator.4=min,max,notMissing
validator.5=min,max,notMissing

1 change: 0 additions & 1 deletion resource/electr_prod.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
{
"name" : "electronicProduct",
"attributes" :
[
{
Expand Down
76 changes: 44 additions & 32 deletions spark/src/main/scala/org/chombo/spark/etl/DataValidator.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import org.apache.spark.SparkContext
import org.chombo.util.Utility
import org.chombo.validator.ValidatorFactory
import com.typesafe.config.Config
import com.typesafe.config.ConfigValue
import org.chombo.validator.Validator
import org.chombo.util.ProcessorAttributeSchema
import org.chombo.util.NumericalAttrStatsManager
Expand All @@ -47,45 +48,54 @@ object DataValidator extends JobConfiguration {
* @return
*/
def main(args: Array[String]) {
val Array(master: String, inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3)
val Array( inputPath: String, outputPath: String, configFile: String) = getCommandLineArgs(args, 3)
val config = createConfig(configFile)
val localConfig = config.atPath("app")
val sparkConf = createSparkConf("app.data validation", config, false)
val sparkCntxt = new SparkContext(sparkConf)

if (config.hasPath("app.invalid.records.output.file"))
config.getString("app.invalid.records.output.file")
else
""

val fieldDelimIn = config.getString("app.field.delim.in")
val fieldDelimOut = config.getString("app.field.delim.out")
val valTagSeparator = config.getString("app.val.tag.separator")
val filterInvalidRecords = config.getBoolean("app.filter.invalid.records")
val outputInvalidRecords = config.getBoolean("app.output.invalid.records")
if (localConfig.hasPath("app.invalid.records.output.file"))
localConfig.getString("app.invalid.records.output.file")

val fieldDelimIn = localConfig.getString("app.field.delim.in")
val fieldDelimOut = localConfig.getString("app.field.delim.out")
val valTagSeparator = localConfig.getString("app.val.tag.separator")
val filterInvalidRecords = localConfig.getBoolean("app.filter.invalid.records")
val outputInvalidRecords = localConfig.getBoolean("app.output.invalid.records")
val invalidRecordsOutputFile =
if (config.hasPath("app.invalid.records.output.file"))
config.getString("app.invalid.records.output.file")
if (localConfig.hasPath("app.invalid.records.output.file"))
localConfig.getString("app.invalid.records.output.file")
else
""
val validationSchema = Utility.getProcessingSchema( config.getString("app.schema.file.path"))
""
val validationSchema = Utility.getProcessingSchema( localConfig.getString("app.schema.file.path"))

val validatorConfig = config.atPath("app")
ValidatorFactory.initialize( config.getString( "app,custom.valid.factory.class"), validatorConfig )

val configClass =
if (localConfig.hasPath("app.custom.valid.factory.class"))
localConfig.getString("app.custom.valid.factory.class")
else
null
ValidatorFactory.initialize(configClass, validatorConfig )
val ordinals = validationSchema.getAttributeOrdinals()
val tagSep = config.getString( "app,vaidator.tag.separator")
val tagSep = localConfig.getString( "app.val.tag.separator")

//initialize stats manager
getAttributeStats(config.getString("app.stats.file.path"))
getAttributeMeds(config.getString("app.med.stats.file.path"), config.getString("app.mad.stats.file.path"),
Utility.intArrayFromString(config.getString("app.id.ordinals"), ",") )
if(localConfig.hasPath("app.stats.file.path"))
getAttributeStats(localConfig.getString("app.stats.file.path"))
if(localConfig.hasPath("app.med.stats.file.path"))
getAttributeMeds(localConfig.getString("app.med.stats.file.path"), localConfig.getString("app.mad.stats.file.path"),
Utility.intArrayFromString(localConfig.getString("app.id.ordinals"), ",") )


//simple validators
var foundSimpleValidators = false

ordinals.foreach(ord => {
val key = "app.validator." + ord
if (config.hasPath(key)) {
val validatorTag = config.getString(key)
val valTags = validatorTag.split(tagSep);
if (localConfig.hasPath(key)) {
val validatorTag : String = localConfig.getString(key)
val valTags :Array[String] = validatorTag.split(tagSep);
createValidators(config, valTags, ord, validationSchema, mutValidators)
foundSimpleValidators = true
}
Expand All @@ -110,12 +120,13 @@ object DataValidator extends JobConfiguration {

//apply all validators for the field
val taggedItems = itemsZipped.map(z => {
val valList = validators.get(z._2).get
println("The value of z is " + z)
val valList : Array[Validator] = validators.get(z._2).get
val valStatuses = valList.map(validator => {
val status = validator.isValid(z._1)
(validator.getTag(), status)
})

//only failed validators
val failedValidators = valStatuses.filter(s => {
!s._2
Expand All @@ -124,17 +135,18 @@ object DataValidator extends JobConfiguration {
val field = if (failedValidators.isEmpty)
z._1
else
z._1 + valTagSeparator + failedValidators.mkString(fieldDelimOut)
z._1 + valTagSeparator + failedValidators.mkString(fieldDelimOut)

field
})

taggedItems.mkString(fieldDelimOut)
})
taggedData.cache

//filter valid data
val validData = taggedData.filter(line => !line.contains(valTagSeparator))
val validData = taggedData.filter(line => !line.contains(":"))
val list = validData.collect()
list.foreach(println)
validData.saveAsTextFile(outputPath)

//filter invalid data
Expand All @@ -154,9 +166,9 @@ object DataValidator extends JobConfiguration {
private def createValidators( config : Config , valTags : Array[String], ord : Int,
validationSchema : ProcessorAttributeSchema, mutValidators : scala.collection.mutable.HashMap[Int, Array[Validator]]) {
val validatorList = List[Validator]()
val prAttr = validationSchema.findAttributeByOrdinal(ord)
val validatorConfig = config.atPath("app")
val validators = valTags.map(tag => {
val prAttr = validationSchema.findAttributeByOrdinal(ord)
val validatorConfig = config
val validators = valTags.filter(_.length > 0 ).map(tag => {
val validator = tag match {
case "zscoreBasedRange" => {
getAttributeStats(config.getString("app.stats.file.path"))
Expand Down Expand Up @@ -206,4 +218,4 @@ private def createValidators( config : Config , valTags : Array[String], ord
validationContext.clear()
validationContext.put("stats", medStatManager.get)
}
}
}
8 changes: 7 additions & 1 deletion src/main/java/org/chombo/validator/ValidatorFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ public static Validator create(String validatorType, ProcessorAttribute prAttr,
validator = new NumericalValidator.StatsBasedRangeValidator(validatorType, prAttr, validatorContext);
} else if (validatorType.equals( ROBUST_ZCORE_BASED_RANGE_VALIDATOR)) {
validator = new NumericalValidator.RobustZscoreBasedRangeValidator(validatorType, prAttr, validatorContext);
} else {
} else if (null != valConfig){
//custom validator with configured validator class names
validator = createCustomValidator(validatorType, prAttr, valConfig);

Expand Down Expand Up @@ -221,6 +221,12 @@ private static Validator createCustomValidator(String validatorType, ProcessorA
* @return
*/
public static Config getValidatorConfig(Config transformerConfig ,String validatorTag, ProcessorAttribute prAttr) {
if(null == transformerConfig)
return null;

if(!transformerConfig.hasPath("validators." + validatorTag))
return null;

Config valConfig = transformerConfig.getConfig("validators." + validatorTag);
Config config = null;
try {
Expand Down