Thursday, February 14, 2008

Reading Hadoop SequenceFile from Pig

A trick for reading a SequenceFile generated by Hadoop into Pig:


/**
 * Pig {@code LoadFunc} that reads records from a Hadoop SequenceFile.
 *
 * <p>Each record is read as a {@code MyKey}/{@code MyValue} pair; only the
 * value's {@code getData()} result is exposed to Pig, as a single-field tuple.
 * The key is read but discarded.
 */
public class SequenceFileStorage implements LoadFunc {

    /** Reader over the current split; assigned by {@link #bindTo}. */
    protected SequenceFileRecordReader reader;

    public SequenceFileStorage() {}

    /**
     * Opens a SequenceFile reader over the byte range {@code [offset, end)} of
     * the named file.
     *
     * @param fileName path of the SequenceFile to read
     * @param in       stream supplied by the caller; not used here, because the
     *                 reader is opened from {@code fileName} directly
     * @param offset   start position of the split within the file
     * @param end      end position of the split within the file
     * @throws IOException if the reader cannot be created
     */
    public void bindTo(String fileName, BufferedPositionedInputStream in, long offset, long end) throws IOException {
        Path file = new Path(fileName);
        // FileSplit takes a length, not an end position.
        FileSplit split = new FileSplit(file, offset, end - offset, new JobConf());
        reader = new SequenceFileRecordReader(new Configuration(), split);
    }

    /**
     * Reads the next record and wraps its value data in a tuple.
     *
     * @return a tuple with one field holding {@code value.getData()}, or
     *         {@code null} once the reader reports no more records (the reader
     *         is closed at that point)
     * @throws IOException if reading or closing the reader fails
     */
    public Tuple getNext() throws IOException {
        MyKey key = new MyKey();
        MyValue value = new MyValue();

        // Fix: read into `value`. The original passed an undeclared
        // identifier here, so the class did not compile.
        if (!reader.next(key, value)) {
            reader.close();
            return null;
        }

        Tuple tuple = new Tuple();
        tuple.appendField(value.getData());
        return tuple;
    }
}