diff --git a/interview-mr/pom.xml b/interview-mr/pom.xml index e04d5f2..928749b 100644 --- a/interview-mr/pom.xml +++ b/interview-mr/pom.xml @@ -39,7 +39,7 @@ junit junit - 3.8.1 + 4.11 test @@ -48,10 +48,5 @@ 2.6.0 provided - - com.google.guava - guava - 16.0 - diff --git a/interview-mr/src/main/data/junit-data.tsv b/interview-mr/src/main/data/junit-data.tsv new file mode 100644 index 0000000..bba5ba7 --- /dev/null +++ b/interview-mr/src/main/data/junit-data.tsv @@ -0,0 +1,7 @@ +Dummy Line +jewelry 0.504193 http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ Sovrn +literature-language 0.556966 http://411mania.com/wrestling/411-fact-or-fiction-09-06-12-punkheyman-d-bry-kane-hug-it-out-more/ http://411mania.com/wrestling/411-fact-or-fiction-09-06-12-punkheyman-d-bry-kane-hug-it-out-more/ Sovrn +education 0.931712 http://footballscoop.com/news/brady-hoke-let-go-michigan/ brady-hoke-let-go-michigan/ Sovrn +jewelry jewelry http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ Sovrn +internet 0.897649 http://starcasm.net/archives/147255 http://starcasm.net/archives/147255 Sovrn +sports 0.506808 http://totalfratmove.com/senior-uga-wide-receiver-chris-conley-trolls-freshman-running-back-nick-chubb-on-twitter/ http://totalfratmove.com/senior-uga-wide-receiver-chris-conley-trolls-freshman-running-back-nick-chubb-on-twitter/ Sovrn diff --git a/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java b/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java index 9cf90fc..6f9af83 100644 --- a/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java +++ b/interview-mr/src/main/java/com/sovrn/interview/mr/AverageScore.java @@ -1,6 +1,8 @@ package com.sovrn.interview.mr; import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -19,7 +21,7 @@ * 2.) The score is the 2nd column of the data file * 3.) The normalized URL is the 4th column of the data file. * - * The tab seperated data file can be found under src/main/data/data.tsv + * The tab separated data file can be found under src/main/data/data.tsv * * Example data: * @@ -31,38 +33,78 @@ * navigation 0.615594 http://411mania.com/games/dragon-fantasy-book-one-psn-review/ http://411mania.com/games/dragon-fantasy-book-one-psn-review/ Sovrn */ public class AverageScore { + public static void main(String[] args) throws Exception { + if(args.length < 2) + System.exit(2); + System.exit(mapReduce(args[0], args[1])); + } + + public static int mapReduce(String inputPath, String outputPath) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "Interview Averaging"); job.setJarByClass(AverageScore.class); job.setMapperClass(AverageScoreMapper.class); job.setReducerClass(AverageScoreReducer.class); - // TODO: Finish the key and output setup + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(FloatWritable.class); - FileInputFormat.addInputPath(job, new Path(args[0])); - FileOutputFormat.setOutputPath(job, new Path(args[1])); + FileInputFormat.addInputPath(job, new Path(inputPath)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); - System.exit(job.waitForCompletion(true) ? 0 : 1); + return job.waitForCompletion(true) ? 0 : 1; } public static class AverageScoreMapper extends Mapper { + private final FloatWritable number = new FloatWritable(); + private final Text domainText = new Text(); + @Override protected void map(Object key, Text value, Mapper.Context context) throws IOException, InterruptedException { - // TODO : Implement the mapping phase + String[] values = value.toString().split("\t"); + if(values.length < 4) + return; + + try { + String domain = new URI(values[3]).getHost(); + number.set(Float.parseFloat(values[1])); + + if(domain != null) { + domainText.set(domain); + context.write(domainText, number); + } + + } catch (URISyntaxException e) { + // Invalid data - not a clean URL. No error reporting requirement, so ignore for now. + } catch (NumberFormatException e) { + // Invalid data - value wasn't a number (as in the column header line). + // No error reporting requirement, so ignore for now. + } } } public static class AverageScoreReducer extends Reducer { + private final FloatWritable average = new FloatWritable(); + @Override protected void reduce(Text key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { - // TODO: Implemnet the reducer phase + float sum = 0f; + int count = 0; + + for (FloatWritable value : values) { + sum += value.get(); + count++; + } + + average.set(count > 0 ? sum / count : 0f); + context.write(key, average); } } diff --git a/interview-mr/src/test/java/com/sovrn/interview/mr/AverageScoreTest.java b/interview-mr/src/test/java/com/sovrn/interview/mr/AverageScoreTest.java new file mode 100644 index 0000000..16796ea --- /dev/null +++ b/interview-mr/src/test/java/com/sovrn/interview/mr/AverageScoreTest.java @@ -0,0 +1,93 @@ +package com.sovrn.interview.mr; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Scanner; + +import com.sovrn.interview.mr.AverageScore; + +import org.apache.commons.io.FileUtils; +import org.junit.Before; +import org.junit.Test; + +public class AverageScoreTest { + private final String outputPath = "junit-output"; + AverageScore score; + + @Before + public void clean() { + try { + FileUtils.deleteDirectory(new File(outputPath)); + } catch (IOException e) { + // ignore + } + } + + private Map getOutput() { + Scanner scanner = null; + Map results = new HashMap(); + + try { + scanner = new Scanner(new File(outputPath+"/part-r-00000")); + while(scanner.hasNext()) { + String values[] = scanner.nextLine().split("\t"); + if(values.length < 2) + fail("Output contained invalid line"); + results.put(values[0], Float.parseFloat(values[1])); + } + } catch (FileNotFoundException e) { + fail("Output file not found: "+e); + } catch (NumberFormatException e) { + fail("Output file contained data without a numerical value: "+e); + } finally { + if(scanner != null) + scanner.close(); + } + return results; + } + + @Test + public void mapReduceBasicTest() { + try { + int retVal = AverageScore.mapReduce("src/main/data/data.tsv", outputPath); + assertEquals(0,retVal); + + // Verify basic output format and number of entries + assertEquals(189, getOutput().size()); + } catch (Exception e) { + fail("Exceptions should not be thrown, but threw "+e); + } + } + + @Test + public void mapReduceSpecificTest() { + Map expectedOutput = new HashMap(); + expectedOutput.put("411mania.com", 0.5305795f); // Manually verified average + expectedOutput.put("starcasm.net", 0.897649f); + expectedOutput.put("totalfratmove.com", 0.506808f); + + try { + // Note that data contains an invalid line, an invalid domain, and a line without a number. + int retVal = AverageScore.mapReduce("src/main/data/junit-data.tsv", outputPath); + assertEquals(0,retVal); + + // Verify output exactly + Map output = getOutput(); + assertEquals(3, output.size()); + for(Map.Entry entry : output.entrySet()) { + assertTrue(expectedOutput.keySet().contains(entry.getKey())); + assertEquals(expectedOutput.get(entry.getKey()), entry.getValue()); + } + } catch (Exception e) { + fail("Exceptions should not be thrown, but threw "+e); + } + } + +}