-
Notifications
You must be signed in to change notification settings - Fork 1
Completed MapReduce Functionality for Average Scores by Domain #3
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| Dummy Line | ||
| jewelry 0.504193 http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ Sovrn | ||
| literature-language 0.556966 http://411mania.com/wrestling/411-fact-or-fiction-09-06-12-punkheyman-d-bry-kane-hug-it-out-more/ http://411mania.com/wrestling/411-fact-or-fiction-09-06-12-punkheyman-d-bry-kane-hug-it-out-more/ Sovrn | ||
| education 0.931712 http://footballscoop.com/news/brady-hoke-let-go-michigan/ brady-hoke-let-go-michigan/ Sovrn | ||
| jewelry jewelry http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ Sovrn | ||
| internet 0.897649 http://starcasm.net/archives/147255 http://starcasm.net/archives/147255 Sovrn | ||
| sports 0.506808 http://totalfratmove.com/senior-uga-wide-receiver-chris-conley-trolls-freshman-running-back-nick-chubb-on-twitter/ http://totalfratmove.com/senior-uga-wide-receiver-chris-conley-trolls-freshman-running-back-nick-chubb-on-twitter/ Sovrn |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,8 @@ | ||
| package com.sovrn.interview.mr; | ||
|
|
||
| import java.io.IOException; | ||
| import java.net.URI; | ||
| import java.net.URISyntaxException; | ||
|
|
||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.hadoop.fs.Path; | ||
|
|
@@ -19,7 +21,7 @@ | |
| * 2.) The score is the 2nd column of the data file | ||
| * 3.) The normalized URL is the 4th column of the data file. | ||
| * | ||
| * The tab seperated data file can be found under src/main/data/data.tsv | ||
| * The tab separated data file can be found under src/main/data/data.tsv | ||
| * | ||
| * Example data: | ||
| * | ||
|
|
@@ -31,38 +33,78 @@ | |
| * navigation 0.615594 http://411mania.com/games/dragon-fantasy-book-one-psn-review/ http://411mania.com/games/dragon-fantasy-book-one-psn-review/ Sovrn | ||
| */ | ||
| public class AverageScore { | ||
|
|
||
| public static void main(String[] args) throws Exception { | ||
| if(args.length < 2) | ||
| System.exit(2); | ||
| System.exit(mapReduce(args[0], args[1])); | ||
| } | ||
|
|
||
| public static int mapReduce(String inputPath, String outputPath) throws Exception { | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Separated this out for modularity. |
||
| Configuration conf = new Configuration(); | ||
| Job job = Job.getInstance(conf, "Interview Averaging"); | ||
| job.setJarByClass(AverageScore.class); | ||
| job.setMapperClass(AverageScoreMapper.class); | ||
| job.setReducerClass(AverageScoreReducer.class); | ||
| // TODO: Finish the key and output setup | ||
| job.setOutputKeyClass(Text.class); | ||
| job.setOutputValueClass(FloatWritable.class); | ||
|
|
||
| FileInputFormat.addInputPath(job, new Path(args[0])); | ||
| FileOutputFormat.setOutputPath(job, new Path(args[1])); | ||
| FileInputFormat.addInputPath(job, new Path(inputPath)); | ||
| FileOutputFormat.setOutputPath(job, new Path(outputPath)); | ||
|
|
||
| System.exit(job.waitForCompletion(true) ? 0 : 1); | ||
| return job.waitForCompletion(true) ? 0 : 1; | ||
| } | ||
|
|
||
| public static class AverageScoreMapper extends Mapper<Object, Text, Text, FloatWritable> { | ||
|
|
||
| private final FloatWritable number = new FloatWritable(); | ||
| private final Text domainText = new Text(); | ||
|
|
||
| @Override | ||
| protected void map(Object key, Text value, Mapper<Object, Text, Text, FloatWritable>.Context context) | ||
| throws IOException, InterruptedException { | ||
| // TODO : Implement the mapping phase | ||
| String[] values = value.toString().split("\t"); | ||
|
|
||
| if(values.length < 4) | ||
| return; | ||
|
|
||
| try { | ||
| String domain = new URI(values[3]).getHost(); | ||
| number.set(Float.parseFloat(values[1])); | ||
|
|
||
| if(domain != null) { | ||
| domainText.set(domain); | ||
| context.write(domainText, number); | ||
| } | ||
|
|
||
| } catch (URISyntaxException e) { | ||
| // Invalid data - not a clean URL. No error reporting requirement, so ignore for now. | ||
| } catch (NumberFormatException e) { | ||
| // Invalid data - value wasn't a number (as in the column header line). | ||
| // No error reporting requirement, so ignore for now. | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Didn't think designing an error-reporting component was a reasonably-sized addition to a test that wasn't supposed to be taking very long. |
||
| } | ||
| } | ||
|
|
||
| } | ||
|
|
||
| public static class AverageScoreReducer extends Reducer<Text, FloatWritable, Text, FloatWritable> { | ||
|
|
||
| private final FloatWritable average = new FloatWritable(); | ||
|
|
||
| @Override | ||
| protected void reduce(Text key, Iterable<FloatWritable> values, | ||
| Reducer<Text, FloatWritable, Text, FloatWritable>.Context context) throws IOException, | ||
| InterruptedException { | ||
| // TODO: Implement the reducer phase | ||
| float sum = 0f; | ||
| int count = 0; | ||
|
|
||
| for (FloatWritable value : values) { | ||
| sum += value.get(); | ||
| count++; | ||
| } | ||
|
|
||
| average.set(count > 0 ? sum / count : 0f); | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think |
||
| context.write(key, average); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,93 @@ | ||
| package com.sovrn.interview.mr; | ||
|
|
||
| import static org.junit.Assert.assertTrue; | ||
| import static org.junit.Assert.fail; | ||
| import static org.junit.Assert.assertEquals; | ||
|
|
||
| import java.io.File; | ||
| import java.io.FileNotFoundException; | ||
| import java.io.IOException; | ||
| import java.util.HashMap; | ||
| import java.util.Map; | ||
| import java.util.Scanner; | ||
|
|
||
| import com.sovrn.interview.mr.AverageScore; | ||
|
|
||
| import org.apache.commons.io.FileUtils; | ||
| import org.junit.Before; | ||
| import org.junit.Test; | ||
|
|
||
| public class AverageScoreTest { | ||
| private final String outputPath = "junit-output"; | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor quibble, but could have made a separate output path for each test so all output is preserved after test failures, but it's pretty easy to run tests individually if they fail, too. I'm running out of lunch break :) |
||
| AverageScore score; | ||
|
|
||
| @Before | ||
| public void clean() { | ||
| try { | ||
| FileUtils.deleteDirectory(new File(outputPath)); | ||
| } catch (IOException e) { | ||
| // ignore | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we haven't run these tests before, there's nothing to delete. |
||
| } | ||
| } | ||
|
|
||
| private Map<String, Float> getOutput() { | ||
| Scanner scanner = null; | ||
| Map<String, Float> results = new HashMap<String, Float>(); | ||
|
|
||
| try { | ||
| scanner = new Scanner(new File(outputPath+"/part-r-00000")); | ||
| while(scanner.hasNext()) { | ||
| String values[] = scanner.nextLine().split("\t"); | ||
| if(values.length < 2) | ||
| fail("Output contained invalid line"); | ||
| results.put(values[0], Float.parseFloat(values[1])); | ||
| } | ||
| } catch (FileNotFoundException e) { | ||
| fail("Output file not found: "+e); | ||
| } catch (NumberFormatException e) { | ||
| fail("Output file contained data without a numerical value: "+e); | ||
| } finally { | ||
| if(scanner != null) | ||
| scanner.close(); | ||
| } | ||
| return results; | ||
| } | ||
|
|
||
| @Test | ||
| public void mapReduceBasicTest() { | ||
| try { | ||
| int retVal = AverageScore.mapReduce("src/main/data/data.tsv", outputPath); | ||
| assertEquals(0,retVal); | ||
|
|
||
| // Verify basic output format and number of entries | ||
| assertEquals(189, getOutput().size()); | ||
| } catch (Exception e) { | ||
| fail("Exceptions should not be thrown, but threw "+e); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void mapReduceSpecificTest() { | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Kind of an 80/20 effort here. Tests all the basic edge cases, but doesn't get into many sequences/combinations of possible errors. Further testing would see diminishing returns. A reasonable next addition would be a very large scale average computation, but that's a big piece of work to bite off. |
||
| Map<String, Float> expectedOutput = new HashMap<String, Float>(); | ||
| expectedOutput.put("411mania.com", 0.5305795f); // Manually verified average | ||
| expectedOutput.put("starcasm.net", 0.897649f); | ||
| expectedOutput.put("totalfratmove.com", 0.506808f); | ||
|
|
||
| try { | ||
| // Note that data contains an invalid line, an invalid domain, and a line without a number. | ||
| int retVal = AverageScore.mapReduce("src/main/data/junit-data.tsv", outputPath); | ||
| assertEquals(0,retVal); | ||
|
|
||
| // Verify output exactly | ||
| Map<String, Float> output = getOutput(); | ||
| assertEquals(3, output.size()); | ||
| for(Map.Entry<String, Float> entry : output.entrySet()) { | ||
| assertTrue(expectedOutput.keySet().contains(entry.getKey())); | ||
| assertEquals(expectedOutput.get(entry.getKey()), entry.getValue()); | ||
| } | ||
| } catch (Exception e) { | ||
| fail("Exceptions should not be thrown, but threw "+e); | ||
| } | ||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't use guava, so no need to explicitly include it.