Excel ETL with PL/SQL and Apache POI
I
created a ETL application to populate fact and dimension tables from
an excel file. The first approach was to create a standalone java
application, using Apache POI and JDBC to complete the tables.
The application was running fine… but slowly. Whith an excel file with around than 25k rows the time elapsed was around 440 milliseconds.
(App and DB were running in the same server)
The steps were:
- Load Apache POI java classes into the DB
- Create a small java class to read from excel files
- Create a poi package as interface to java code
- Create a PL/SQL ETL routine
Load Apache POI java classes into the DB
Before to load java classess, the user test was created…loadjava -user test/test -resolve stax/stax-api/1.0.1/stax-api-1.0.1.jar
loadjava -user test/test -resolve org/apache/ant/ant-launcher/1.8.2/ant-launcher-1.8.2.jar
loadjava -user test/test -resolve org/apache/ant/ant/1.8.2/ant-1.8.2.jar
loadjava -user test/test -resolve org/apache/xmlbeans/xmlbeans/2.4.0/xmlbeans-2.4.0.jar
loadjava -user test/test -resolve org/apache/poi/ooxml-schemas/1.0/ooxml-schemas-1.0.jar
loadjava -user test/test -resolve commons-codec/commons-codec/1.9/commons-codec-1.9.jar
loadjava -user test/test -resolve -genmissing org/apache/poi/poi-ooxml-schemas/3.11-beta1/poi-ooxml-schemas-3.11-beta1.jar
loadjava -user test/test -resolve -genmissing org/apache/poi/poi-ooxml/3.11-beta1/poi-ooxml-3.11-beta1.jar
loadjava -user test/test -resolve -genmissing org/apache/poi/poi/3.11-beta1/poi-3.11-beta1.jar
Create a small java class to read from excel files
public class poi {// Internal table to maintain workbook<->handle mappings
private static Hashtable<Integer,Workbook> table= new Hashtable<Integer,Workbook>();
// Sequence for next handle
private static int sequence_handle_id= 1;
/**
*
* @param bl Blob element
* @return an handle to the workbook if successfully loaded or 0 in other case
*/
public static int load(BLOB bl) {
try {
//Workbook workbook= WorkbookFactory.create(bl.getBinaryStream());
InputStream is= bl.getBinaryStream();
Workbook workbook= new HSSFWorkbook(is);
table.put(sequence_handle_id, workbook);
return sequence_handle_id++;
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return 0;
}
}
/**
*
* @param handle
* @return close the workbook and return 0 if error or handle if successfully closed
*/
public static int close(int handle) {
// close the workbook and remove from the internal table
try{
Workbook workbook= table.get(handle);
workbook.close();
table.remove(handle);
return handle;
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return 0;
}
}
/**
*
* @param handle
* @return the workbook as BLOB or null if error
* @throws Exception
*/
public static BLOB getBytes(int handle) throws Exception {
try{
Workbook workbook= table.get(handle);
// from http://stackoverflow.com/a/20072705
oracle.jdbc.OracleConnection conn =
(oracle.jdbc.OracleConnection)new OracleDriver().defaultConnection();
oracle.sql.BLOB retBlob =
oracle.sql.BLOB.createTemporary(conn,
true,
oracle.sql.BLOB.DURATION_SESSION);
java.io.OutputStream outStr = retBlob.setBinaryStream(0);
workbook.write(outStr);
outStr.flush();
return retBlob;
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return null;
}
}
// CELL_TYPE_NUMERIC= 0
// CELL_TYPE_STRING= 1
// CELL_TYPE_FORMULA= 2
// CELL_TYPE_BLANK= 3
// CELL_TYPE_BOOLEAN= 4
// CELL_TYPE_ERROR= 5
public static int getCellType(int handle, String sheet, int row, int column) {
try{
Workbook workbook= table.get(handle);
return workbook.getSheet(sheet).getRow(row).getCell(column).getCellType();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return -1;
}
}
public static DATE getDateValue(int handle, String sheet, int row, int column) {
try{
Workbook workbook= table.get(handle);
java.sql.Date d= new java.sql.Date(workbook.getSheet(sheet).getRow(row).getCell(column).getDateCellValue().getTime());
return new DATE(d);
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return null;
}
}
public static Double getNumericValue(int handle, String sheet, int row, int column) {
try{
Workbook workbook= table.get(handle);
return workbook.getSheet(sheet).getRow(row).getCell(column).getNumericCellValue();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return null;
}
}
public static String getStringValue(int handle, String sheet, int row, int column) {
try{
Workbook workbook= table.get(handle);
return workbook.getSheet(sheet).getRow(row).getCell(column).getStringCellValue();
}catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return null;
}
}
}
and then loaded the class:
loadjava -user test/test -resolve poi_orcl.jar
Create a poi package as interface to java code
create or replace PACKAGE POI AS
FUNCTION load(bl BLOB) RETURN NUMBER AS
LANGUAGE JAVA NAME ‘poi.load(oracle.sql.BLOB) return int’;
FUNCTION close(handle NUMBER) RETURN NUMBER AS
LANGUAGE JAVA NAME ‘poi.close(int) return int’;
FUNCTION getDateValue(handle NUMBER, sheet VARCHAR2, coln NUMBER, rown NUMBER) RETURN DATE AS
LANGUAGE JAVA NAME ‘poi.getDateValue(int, java.lang.String, int, int) return oracle.sql.DATE’;
FUNCTION getStringValue(handle NUMBER, sheet VARCHAR2, coln NUMBER, rown NUMBER) RETURN VARCHAR2 AS
LANGUAGE JAVA NAME ‘poi.getStringValue(int, java.lang.String, int, int) return java.lang.String’;
FUNCTION getNumericValue(handle NUMBER, sheet VARCHAR2, coln NUMBER, rown NUMBER) RETURN NUMBER AS
LANGUAGE JAVA NAME ‘poi.getNumericValue(int, java.lang.String, int, int) return java.lang.Double’;
FUNCTION getCellType(handle NUMBER, sheet VARCHAR2, coln NUMBER, rown NUMBER) RETURN NUMBER AS
LANGUAGE JAVA NAME ‘poi.getCellType(int, java.lang.String, int, int) return int’;
END POI;