viernes, 24 de octubre de 2014

Excel ETL with PL/SQL and Apache POI

Excel ETL with PL/SQL and Apache POI

I created a ETL application to populate fact and dimension tables from an excel file. The first approach was to create a standalone java application, using Apache POI and JDBC  to complete the tables.
The application was running fine… but slowly. Whith an excel file with around than 25k rows the time elapsed was around 440 milliseconds.


image
(App and DB were running in the same server)

Then I decided to test the performance doing the same ETL process but with PL/SQL. The same process was completed in around 80 millisecons.

image

The steps were:
  • Load Apache POI java classes into the DB
  • Create a small java class to read from excel files
  • Create a poi package as interface to java code
  • Create a PL/SQL ETL routine

 

Load Apache POI java classes into the DB

Before to load java classess, the user test was created…
loadjava -user test/test -resolve stax/stax-api/1.0.1/stax-api-1.0.1.jar
loadjava -user test/test -resolve org/apache/ant/ant-launcher/1.8.2/ant-launcher-1.8.2.jar
loadjava -user test/test -resolve org/apache/ant/ant/1.8.2/ant-1.8.2.jar
loadjava -user test/test -resolve org/apache/xmlbeans/xmlbeans/2.4.0/xmlbeans-2.4.0.jar
loadjava -user test/test -resolve org/apache/poi/ooxml-schemas/1.0/ooxml-schemas-1.0.jar
loadjava -user test/test -resolve commons-codec/commons-codec/1.9/commons-codec-1.9.jar

loadjava -user test/test -resolve -genmissing org/apache/poi/poi-ooxml-schemas/3.11-beta1/poi-ooxml-schemas-3.11-beta1.jar
loadjava -user test/test -resolve -genmissing org/apache/poi/poi-ooxml/3.11-beta1/poi-ooxml-3.11-beta1.jar
loadjava -user test/test -resolve -genmissing org/apache/poi/poi/3.11-beta1/poi-3.11-beta1.jar


Create a small java class to read from excel files

public class poi {

    // Internal table to maintain workbook<->handle mappings
    private static Hashtable<Integer,Workbook> table= new Hashtable<Integer,Workbook>();
  
    // Sequence for next handle
    private static int sequence_handle_id= 1;
  

    /**
     *
     * @param bl Blob element
     * @return an handle to the workbook if successfully loaded or 0 in other case
     */
    public static int load(BLOB bl) {
        try {
            //Workbook workbook= WorkbookFactory.create(bl.getBinaryStream());
            InputStream is= bl.getBinaryStream();
            Workbook workbook= new HSSFWorkbook(is);
            table.put(sequence_handle_id, workbook);
            return sequence_handle_id++;
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return 0;
        }
    }
  
    /**
     *
     * @param handle
     * @return close the workbook and return 0 if error or handle if successfully closed
     */
    public static int close(int handle) {
        // close the workbook and remove from the internal table
        try{
            Workbook workbook= table.get(handle);
            workbook.close();
            table.remove(handle);
            return handle;
        }catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return 0;
        }
    }
  
    /**
     *
     * @param handle
     * @return the workbook as BLOB or null if error
     * @throws Exception
     */
    public static BLOB getBytes(int handle) throws Exception {
        try{
            Workbook workbook= table.get(handle);
          
            // from http://stackoverflow.com/a/20072705
            oracle.jdbc.OracleConnection conn =
                      (oracle.jdbc.OracleConnection)new OracleDriver().defaultConnection();

            oracle.sql.BLOB retBlob =
                      oracle.sql.BLOB.createTemporary(conn,
                                                      true,
                                                      oracle.sql.BLOB.DURATION_SESSION);

            java.io.OutputStream outStr = retBlob.setBinaryStream(0);
            workbook.write(outStr);
            outStr.flush();
          
            return retBlob;
        }catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }
  
    // CELL_TYPE_NUMERIC= 0
    // CELL_TYPE_STRING=  1
    // CELL_TYPE_FORMULA= 2
    // CELL_TYPE_BLANK=   3
    // CELL_TYPE_BOOLEAN= 4
    // CELL_TYPE_ERROR=   5
    public static int getCellType(int handle, String sheet, int row, int column) {
        try{
            Workbook workbook= table.get(handle);
            return workbook.getSheet(sheet).getRow(row).getCell(column).getCellType();
        }catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return -1;
        }
    }
  
    public static DATE getDateValue(int handle, String sheet, int row, int column) {
        try{
            Workbook workbook= table.get(handle);
            java.sql.Date d= new java.sql.Date(workbook.getSheet(sheet).getRow(row).getCell(column).getDateCellValue().getTime());
            return new DATE(d);
        }catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }
  
    public static Double getNumericValue(int handle, String sheet, int row, int column) {
        try{
            Workbook workbook= table.get(handle);
            return workbook.getSheet(sheet).getRow(row).getCell(column).getNumericCellValue();
        }catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }
  
    public static String getStringValue(int handle, String sheet, int row, int column) {
        try{
            Workbook workbook= table.get(handle);
            return workbook.getSheet(sheet).getRow(row).getCell(column).getStringCellValue();
        }catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }
  
}


and then loaded the class:
loadjava -user test/test -resolve poi_orcl.jar

Create a poi package as interface to java code


create or replace PACKAGE POI AS    
  FUNCTION load(bl BLOB) RETURN NUMBER AS
     LANGUAGE JAVA NAME ‘poi.load(oracle.sql.BLOB) return int’;
   
  FUNCTION close(handle NUMBER) RETURN NUMBER AS
     LANGUAGE JAVA NAME ‘poi.close(int) return int’;

  FUNCTION getDateValue(handle NUMBER, sheet VARCHAR2, coln NUMBER, rown NUMBER) RETURN DATE AS
     LANGUAGE JAVA NAME ‘poi.getDateValue(int, java.lang.String, int, int) return oracle.sql.DATE’;
   
  FUNCTION getStringValue(handle NUMBER, sheet VARCHAR2, coln NUMBER, rown NUMBER) RETURN VARCHAR2 AS
     LANGUAGE JAVA NAME ‘poi.getStringValue(int, java.lang.String, int, int) return java.lang.String’;
 
  FUNCTION getNumericValue(handle NUMBER, sheet VARCHAR2, coln NUMBER, rown NUMBER) RETURN NUMBER AS
     LANGUAGE JAVA NAME ‘poi.getNumericValue(int, java.lang.String, int, int) return java.lang.Double’;
   
  FUNCTION getCellType(handle NUMBER, sheet VARCHAR2, coln NUMBER, rown NUMBER) RETURN NUMBER AS
     LANGUAGE JAVA NAME ‘poi.getCellType(int, java.lang.String, int, int) return int’;
   
END POI;


Create a PL/SQL ETL routine

Here I put the PL/SQL code for ETL, that had similar structure that ETL java code.