From 7695498c65033ebf6daa3fdc192c7a4c65c20d4d Mon Sep 17 00:00:00 2001 From: James Free Date: Tue, 10 Mar 2015 23:23:09 -0600 Subject: [PATCH 1/3] filled in TODOs as requested --- .../com/sovrn/interview/mr/AverageScore.java | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java b/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java index 9cf90fc..b67a404 100644 --- a/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java +++ b/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java @@ -1,6 +1,8 @@ package com.sovrn.interview.mr; import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -19,7 +21,7 @@ * 2.) The score is the 2nd column of the data file * 3.) The normalized URL is the 4th column of the data file. * - * The tab seperated data file can be found under src/main/data/data.tsv + * The tab separated data file can be found under src/main/data/data.tsv * * Example data: * @@ -31,13 +33,15 @@ * navigation 0.615594 http://411mania.com/games/dragon-fantasy-book-one-psn-review/ http://411mania.com/games/dragon-fantasy-book-one-psn-review/ Sovrn */ public class AverageScore { + public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "Interview Averaging"); job.setJarByClass(AverageScore.class); job.setMapperClass(AverageScoreMapper.class); job.setReducerClass(AverageScoreReducer.class); - // TODO: Finish the key and output setup + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(FloatWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); @@ -47,22 +51,49 @@ public static void main(String[] args) throws Exception { public static class AverageScoreMapper extends Mapper { + private final FloatWritable number = new FloatWritable(); + private final Text domainText = new Text(); + @Override protected void map(Object key, Text value, Mapper.Context context) throws IOException, InterruptedException { - // TODO : Implement the mapping phase - + String[] values = value.toString().split("\t"); + + try { + String domain = new URI(values[3]).getHost(); + number.set(Float.parseFloat(values[1])); + + if(domain != null) + domainText.set(domain); + context.write(domainText, number); + + } catch (URISyntaxException | NumberFormatException e) { + // Invalid data - either not a clean URL or the value + // wasn't a number (as in the column header line). No + // error reporting requirement, so do nothing for now. + } } } public static class AverageScoreReducer extends Reducer { + private final FloatWritable average = new FloatWritable(); + @Override protected void reduce(Text key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { - // TODO: Implemnet the reducer phase + float sum = 0f; + int count = 0; + + for (FloatWritable value : values) { + sum += value.get(); + count++; + } + + average.set(count > 0 ? sum / count : 0f); + context.write(key, average); } } From f3c2039ce2f780abc2e84aef7afb9c2436c8853a Mon Sep 17 00:00:00 2001 From: James Free Date: Wed, 11 Mar 2015 08:03:37 -0600 Subject: [PATCH 2/3] add missing error-checking --- .../src/main/java/com/sovrn/interview/mr/AverageScore.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java b/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java index b67a404..f36314b 100644 --- a/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java +++ b/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java @@ -58,6 +58,9 @@ public static class AverageScoreMapper extends Mapper.Context context) throws IOException, InterruptedException { String[] values = value.toString().split("\t"); + + if(values.length < 4) + return; try { String domain = new URI(values[3]).getHost(); From 84948e82e1f46c646be64594b7bad75b761b7e54 Mon Sep 17 00:00:00 2001 From: James Free Date: Wed, 11 Mar 2015 12:27:12 -0600 Subject: [PATCH 3/3] add tests of mapreduce averaging --- interview-mr/pom.xml | 7 +- interview-mr/src/main/data/junit-data.tsv | 7 ++ .../com/sovrn/interview/mr/AverageScore.java | 24 +++-- .../sovrn/interview/mr/AverageScoreTest.java | 93 +++++++++++++++++++ 4 files changed, 117 insertions(+), 14 deletions(-) create mode 100644 interview-mr/src/main/data/junit-data.tsv create mode 100644 interview-mr/src/test/java/com/sovrn/interview/mr/AverageScoreTest.java diff --git a/interview-mr/pom.xml b/interview-mr/pom.xml index e04d5f2..928749b 100644 --- a/interview-mr/pom.xml +++ b/interview-mr/pom.xml @@ -39,7 +39,7 @@ junit junit - 3.8.1 + 4.11 test @@ -48,10 +48,5 @@ 2.6.0 provided - - com.google.guava - guava - 16.0 - diff --git a/interview-mr/src/main/data/junit-data.tsv b/interview-mr/src/main/data/junit-data.tsv new file mode 100644 index 0000000..bba5ba7 --- /dev/null +++ b/interview-mr/src/main/data/junit-data.tsv @@ -0,0 +1,7 @@ +Dummy Line +jewelry 0.504193 http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ Sovrn +literature-language 0.556966 http://411mania.com/wrestling/411-fact-or-fiction-09-06-12-punkheyman-d-bry-kane-hug-it-out-more/ http://411mania.com/wrestling/411-fact-or-fiction-09-06-12-punkheyman-d-bry-kane-hug-it-out-more/ Sovrn +education 0.931712 http://footballscoop.com/news/brady-hoke-let-go-michigan/ brady-hoke-let-go-michigan/ Sovrn +jewelry jewelry http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ Sovrn +internet 0.897649 http://starcasm.net/archives/147255 http://starcasm.net/archives/147255 Sovrn +sports 0.506808 http://totalfratmove.com/senior-uga-wide-receiver-chris-conley-trolls-freshman-running-back-nick-chubb-on-twitter/ http://totalfratmove.com/senior-uga-wide-receiver-chris-conley-trolls-freshman-running-back-nick-chubb-on-twitter/ Sovrn diff --git a/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java b/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java index f36314b..6f9af83 100644 --- a/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java +++ b/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java @@ -35,6 +35,12 @@ public class AverageScore { public static void main(String[] args) throws Exception { + if(args.length < 2) + System.exit(2); + System.exit(mapReduce(args[0], args[1])); + } + + public static int mapReduce(String inputPath, String outputPath) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "Interview Averaging"); job.setJarByClass(AverageScore.class); @@ -43,10 +49,10 @@ public static void main(String[] args) throws Exception { job.setOutputKeyClass(Text.class); job.setOutputValueClass(FloatWritable.class); - FileInputFormat.addInputPath(job, new Path(args[0])); - FileOutputFormat.setOutputPath(job, new Path(args[1])); + FileInputFormat.addInputPath(job, new Path(inputPath)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); - System.exit(job.waitForCompletion(true) ? 0 : 1); + return job.waitForCompletion(true) ? 0 : 1; } public static class AverageScoreMapper extends Mapper { @@ -66,14 +72,16 @@ protected void map(Object key, Text value, Mapper getOutput() { + Scanner scanner = null; + Map results = new HashMap(); + + try { + scanner = new Scanner(new File(outputPath+"/part-r-00000")); + while(scanner.hasNext()) { + String values[] = scanner.nextLine().split("\t"); + if(values.length < 2) + fail("Output contained invalid line"); + results.put(values[0], Float.parseFloat(values[1])); + } + } catch (FileNotFoundException e) { + fail("Output file not found: "+e); + } catch (NumberFormatException e) { + fail("Output file contained data without a numerical value: "+e); + } finally { + if(scanner != null) + scanner.close(); + } + return results; + } + + @Test + public void mapReduceBasicTest() { + try { + int retVal = AverageScore.mapReduce("src/main/data/data.tsv", outputPath); + assertEquals(0,retVal); + + // Verify basic output format and number of entries + assertEquals(189, getOutput().size()); + } catch (Exception e) { + fail("Exceptions should not be thrown, but threw "+e); + } + } + + @Test + public void mapReduceSpecificTest() { + Map expectedOutput = new HashMap(); + expectedOutput.put("411mania.com", 0.5305795f); // Manually verified average + expectedOutput.put("starcasm.net", 0.897649f); + expectedOutput.put("totalfratmove.com", 0.506808f); + + try { + // Note that data contains an invalid line, an invalid domain, and a line without a number. + int retVal = AverageScore.mapReduce("src/main/data/junit-data.tsv", outputPath); + assertEquals(0,retVal); + + // Verify output exactly + Map output = getOutput(); + assertEquals(3, output.size()); + for(Map.Entry entry : output.entrySet()) { + assertTrue(expectedOutput.keySet().contains(entry.getKey())); + assertEquals(expectedOutput.get(entry.getKey()), entry.getValue()); + } + } catch (Exception e) { + fail("Exceptions should not be thrown, but threw "+e); + } + } + +}