Skip to content
This repository was archived by the owner on Mar 24, 2020. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions interview-mr/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
Expand All @@ -48,10 +48,5 @@
<version>2.6.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>16.0</version>
</dependency>
</dependencies>
</project>
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't use guava, so no need to explicitly include it.

7 changes: 7 additions & 0 deletions interview-mr/src/main/data/junit-data.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Dummy Line
jewelry 0.504193 http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ Sovrn
literature-language 0.556966 http://411mania.com/wrestling/411-fact-or-fiction-09-06-12-punkheyman-d-bry-kane-hug-it-out-more/ http://411mania.com/wrestling/411-fact-or-fiction-09-06-12-punkheyman-d-bry-kane-hug-it-out-more/ Sovrn
education 0.931712 http://footballscoop.com/news/brady-hoke-let-go-michigan/ brady-hoke-let-go-michigan/ Sovrn
jewelry jewelry http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ http://411mania.com/movies/comics-411-03-26-14-favorite-marvel-superhero-team-edition/ Sovrn
internet 0.897649 http://starcasm.net/archives/147255 http://starcasm.net/archives/147255 Sovrn
sports 0.506808 http://totalfratmove.com/senior-uga-wide-receiver-chris-conley-trolls-freshman-running-back-nick-chubb-on-twitter/ http://totalfratmove.com/senior-uga-wide-receiver-chris-conley-trolls-freshman-running-back-nick-chubb-on-twitter/ Sovrn
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package com.sovrn.interview.mr;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
Expand All @@ -19,7 +21,7 @@
* 2.) The score is the 2nd column of the data file
* 3.) The normalized URL is the 4th column of the data file.
*
* The tab seperated data file can be found under src/main/data/data.tsv
* The tab separated data file can be found under src/main/data/data.tsv
*
* Example data:
*
Expand All @@ -31,38 +33,78 @@
* navigation 0.615594 http://411mania.com/games/dragon-fantasy-book-one-psn-review/ http://411mania.com/games/dragon-fantasy-book-one-psn-review/ Sovrn
*/
public class AverageScore {

/**
 * Command-line entry point: runs the averaging job and exits with its status code.
 *
 * @param args {@code args[0]} is the input path and {@code args[1]} is the output path
 * @throws Exception if {@code mapReduce} throws
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        // Explain the failure instead of exiting silently; the exit status (2) is unchanged.
        System.err.println("Usage: AverageScore <input path> <output path>");
        System.exit(2);
    }
    System.exit(mapReduce(args[0], args[1]));
}

/**
 * Configures and runs the "Interview Averaging" MapReduce job.
 *
 * @param inputPath path added as the job's input
 * @param outputPath path set as the job's output
 * @return 0 when {@code job.waitForCompletion(true)} returns true, otherwise 1
 * @throws Exception propagated from job setup or execution
 */
public static int mapReduce(String inputPath, String outputPath) throws Exception {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separated this out for modularity.

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Interview Averaging");
job.setJarByClass(AverageScore.class);
// Use the mapper and reducer classes nested in AverageScore.
job.setMapperClass(AverageScoreMapper.class);
job.setReducerClass(AverageScoreReducer.class);
// TODO: Finish the key and output setup
// Job output records are (Text, FloatWritable) pairs.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FloatWritable.class);

// NOTE(review): the two args[...] lines below reference a name that is not a parameter of this
// method; they look like removed lines from the diff view - confirm against the repository.
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
FileInputFormat.addInputPath(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));

// NOTE(review): the System.exit line below also looks like a removed diff line - confirm.
System.exit(job.waitForCompletion(true) ? 0 : 1);
// Translate the job outcome into an exit-style status code for the caller.
return job.waitForCompletion(true) ? 0 : 1;
}

/**
 * Mapper that emits one (host, score) pair per usable input line.
 *
 * Each line is split on tabs; the second field is parsed as a float score and the host of the
 * URI in the fourth field becomes the key. Lines with fewer than four fields, an unparsable URI,
 * a non-numeric score, or a URI with no host produce no output.
 */
public static class AverageScoreMapper extends Mapper<Object, Text, Text, FloatWritable> {

// Writable instances held as fields and overwritten on each map() call.
private final FloatWritable number = new FloatWritable();
private final Text domainText = new Text();

@Override
protected void map(Object key, Text value, Mapper<Object, Text, Text, FloatWritable>.Context context)
throws IOException, InterruptedException {
// TODO : Implement the mapping phase
String[] values = value.toString().split("\t");

// Index 3 (the normalized URL) must exist; shorter lines are dropped.
if(values.length < 4)
return;

try {
// Key is the host part of the normalized URL; value is the score column.
String domain = new URI(values[3]).getHost();
number.set(Float.parseFloat(values[1]));

// Only emit when a host was actually extracted.
if(domain != null) {
domainText.set(domain);
context.write(domainText, number);
}

} catch (URISyntaxException e) {
// Invalid data - not a clean URL. No error reporting requirement, so ignore for now.
} catch (NumberFormatException e) {
// Invalid data - value wasn't a number (as in the column header line).
// No error reporting requirement, so ignore for now.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't think designing an error-reporting component was a reasonably-sized addition to a test that wasn't supposed to be taking very long.

}
}

}

/**
 * Reducer that writes, for each key, the arithmetic mean of the float values received for it.
 */
public static class AverageScoreReducer extends Reducer<Text, FloatWritable, Text, FloatWritable> {

// Writable instance held as a field and overwritten on each reduce() call.
private final FloatWritable average = new FloatWritable();

@Override
protected void reduce(Text key, Iterable<FloatWritable> values,
Reducer<Text, FloatWritable, Text, FloatWritable>.Context context) throws IOException,
InterruptedException {
// TODO: Implement the reducer phase
// Running total (single precision) and number of values seen for this key.
float sum = 0f;
int count = 0;

for (FloatWritable value : values) {
sum += value.get();
count++;
}

// A zero count yields 0 instead of dividing by zero.
average.set(count > 0 ? sum / count : 0f);
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think reduce would be called if there were zero values, but divide-by-zero is a nasty condition, so I check for it anyway.

context.write(key, average);
}

}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package com.sovrn.interview.mr;

import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;

import com.sovrn.interview.mr.AverageScore;

import org.apache.commons.io.FileUtils;
import org.junit.Before;
import org.junit.Test;

public class AverageScoreTest {
private final String outputPath = "junit-output";
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor quibble: I could have given each test its own output path so that all output is preserved after a test failure, but it's also pretty easy to rerun a failing test individually. I'm running out of lunch break :)

AverageScore score;

/**
 * Runs before each test and deletes the directory at {@code outputPath}.
 * An {@link IOException} raised by the deletion is deliberately ignored.
 */
@Before
public void clean() {
// Best-effort removal of the shared output directory.
try {
FileUtils.deleteDirectory(new File(outputPath));
} catch (IOException e) {
// ignore
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we haven't run these tests before, there's nothing to delete.

}
}

/**
 * Parses the job's {@code part-r-00000} file under {@code outputPath} into a map.
 *
 * Each line is split on tabs: the first field is the map key and the second is parsed as a
 * float value. A missing file, a line with fewer than two fields, or a non-numeric value fails
 * the calling test.
 *
 * @return the parsed key-to-value entries
 */
private Map<String, Float> getOutput() {
    Map<String, Float> parsed = new HashMap<String, Float>();
    Scanner reader = null;

    try {
        File partFile = new File(outputPath+"/part-r-00000");
        reader = new Scanner(partFile);
        while (reader.hasNext()) {
            String line = reader.nextLine();
            String[] columns = line.split("\t");
            if (columns.length < 2) {
                fail("Output contained invalid line");
            }
            parsed.put(columns[0], Float.parseFloat(columns[1]));
        }
    } catch (FileNotFoundException e) {
        fail("Output file not found: "+e);
    } catch (NumberFormatException e) {
        fail("Output file contained data without a numerical value: "+e);
    } finally {
        // Always release the file handle, even when an assertion failure escapes.
        if (reader != null) {
            reader.close();
        }
    }
    return parsed;
}

/**
 * Runs the job over the full data file and checks that it succeeds and that the parsed
 * output contains 189 entries.
 */
@Test
public void mapReduceBasicTest() {
    try {
        int exitCode = AverageScore.mapReduce("src/main/data/data.tsv", outputPath);
        assertEquals(0,exitCode);

        // Parsing validates the basic line format; the size check validates the entry count.
        Map<String, Float> parsedOutput = getOutput();
        assertEquals(189, parsedOutput.size());
    } catch (Exception e) {
        fail("Exceptions should not be thrown, but threw "+e);
    }
}

/**
 * Runs the job over {@code src/main/data/junit-data.tsv} and checks that the output holds
 * exactly three entries, each of which matches an expected key and its expected average.
 */
@Test
public void mapReduceSpecificTest() {
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kind of an 80/20 effort here. Tests all the basic edge cases, but doesn't get into many sequences/combinations of possible errors. Further testing would see diminishing returns.

A reasonable next addition would be a very large scale average computation, but that's a big piece of work to bite off.

Map<String, Float> expectedOutput = new HashMap<String, Float>();
expectedOutput.put("411mania.com", 0.5305795f); // Manually verified average
expectedOutput.put("starcasm.net", 0.897649f);
expectedOutput.put("totalfratmove.com", 0.506808f);

try {
// Note that data contains an invalid line, an invalid domain, and a line without a number.
int retVal = AverageScore.mapReduce("src/main/data/junit-data.tsv", outputPath);
assertEquals(0,retVal);

// Verify output exactly
Map<String, Float> output = getOutput();
assertEquals(3, output.size());
for(Map.Entry<String, Float> entry : output.entrySet()) {
assertTrue(expectedOutput.keySet().contains(entry.getKey()));
assertEquals(expectedOutput.get(entry.getKey()), entry.getValue());
}
} catch (Exception e) {
fail("Exceptions should not be thrown, but threw "+e);
}
}

}