Excel data extractor

From Trephine

Jump to: navigation, search

Excel data extractor

This demo shows how to use trephine with Apache POI to extract data from a Microsoft Excel file.

The data is displayed in a series of HTML tables, one for each sheet, and also as raw JSON data.

You must have JavaScript enabled to run this demo.

// Status and element functions. 
var e = function(tag,index){ return demotab.getElementsByTagName(tag)[index||0]; };
var status = function(msg) {
  if (!msg) return e('p').style.display = 'none';
  e('p').style.display = '';
  e('p').innerHTML = msg;
};
 
// Setup demo 
var setup = function() {
 
  // Test for POI (may already be loaded)
  try {
    var result = trephine.js( function(){
      java.lang.Class.forName('org.apache.poi.ss.usermodel.WorkbookFactory');
    } );
  } catch(err) {
    var result = { success: false, error: err.message };
  }
  if (!result.error) {
    var status = 'ready';
  } else {
 
    // Test whether dependencies are ready for loading
    var result = trephine.js( function(){
 
      // Get handle on lib directory or create it if necessary
      var homedir = new java.io.File( java.lang.System.getProperty('user.home') );
      var libdir = new java.io.File( homedir, '.trephine/lib' );
      if (!libdir.exists()) {
        if (!libdir.mkdirs()) throw "Error: Could not create " + libdir.toString();
      }
 
      // Check for Groovy dependencies
      var jars = [
        'poi-3.5-beta5-20090219.jar', 'poi-ooxml-3.5-beta5-20090219.jar',
        'xmlbeans-2.3.0.jar', 'ooxml-schemas-1.0.jar', 'log4j-1.2.13.jar', 'dom4j-1.6.1.jar'
      ];
      for (var i=0, l=jars.length; i<l; i++) {
        var jarfile = new java.io.File( libdir, jars[i] );
        if (!jarfile.exists()) return 'missing';
      };
      return 'loadable';
 
    } );
 
    if (result.error) return demotab.innerHTML = '<div class="error">' + result.error + '</div>';
    var status = result.result.toString() + '';
 
  }
 
  if (status=='missing') {
    e('div',1).style.display = '';
  } else if (status=='ready') {
    e('form').style.display = '';
  } else if (status=='loadable') {
    var result = trephine.js( function(){
      var jars = [
        'poi-3.5-beta5-20090219.jar', 'poi-ooxml-3.5-beta5-20090219.jar',
        'xmlbeans-2.3.0.jar', 'ooxml-schemas-1.0.jar', 'log4j-1.2.13.jar', 'dom4j-1.6.1.jar'
      ];
      var homedir = new java.io.File( java.lang.System.getProperty('user.home') );
      var libdir = new java.io.File( homedir, '.trephine/lib' );
      var executor = java.lang.Thread.currentThread();
      for (var i=0, l=jars.length; i<l; i++) {
        var jarfile = new java.io.File(libdir, jars[i]);
        executor.addSystemJar(jarfile);
      }
      return true;
    } );
    if (result.error) return demotab.innerHTML = '<div class="error">' + result.error.toString() + '</div>';
    e('form').style.display = '';
  } else {
    return demotab.innerHTML = '<div class="error">Unknown status [' + status + '] returned.</div>';
  }
 
  // Setup download link
  e('a').onclick = function(evt) {
    if (!evt) evt = window.event;
    evt.cancelBubble = true;
    if (evt.stopPropagation) evt.stopPropagation();
    var result = trephine.js( function() {
 
      // Shorthand mechanism for creating and starting background threads.
      Function.prototype.run = function() {
        var thread = new java.lang.Thread( new java.lang.Runnable( { run: this } ) );
        thread.start();
        return thread;
      };
 
      // Helper function for downloading a file asynchronously
      function download( fromURL, toFile, options ) {
        options = options || {};
        var onprogress = typeof options.onprogress == "undefined" ? null : options.onprogress;
        var interval = typeof options.interval == "undefined" ? 500 : options.interval;
        var async = typeof options.async == "undefined" ? true : options.async;
        var f = function(){
          var conn = fromURL.openConnection();
          var len = conn.contentLength;
          var bis = new java.io.BufferedInputStream(conn.inputStream);
          var out = new java.io.BufferedOutputStream(new java.io.FileOutputStream(toFile));
          var b, count = 0, n = 0;
          var buf = java.lang.reflect.Array.newInstance(java.lang.Byte.TYPE, 4096);
          if (!onprogress) {
            while ((n = bis.read(buf, 0, buf.length)) != -1) out.write(buf, 0, n);
            out.flush();
            bis.close();
            out.close();
            return;
          }
          var t = (function(){
            while (count < len) {
              onprogress(count, len);
              java.lang.Thread.currentThread().sleep(interval);
            }
          }).run();
          while ((n = bis.read(buf, 0, buf.length)) != -1) {
            count += n;
            out.write(buf, 0, n);
          }
          t.stop();
          out.flush();
          bis.close();
          out.close();
          onprogress(count, len);
        };
        return async ? f.run() : f();
      };
 
      // Download POI dependencies, add them to the system classpath and register the engine factory
      var executor = java.lang.Thread.currentThread();
      (function(){
        var jars = [
          'poi-3.5-beta5-20090219.jar', 'poi-ooxml-3.5-beta5-20090219.jar',
          'xmlbeans-2.3.0.jar', 'ooxml-schemas-1.0.jar', 'log4j-1.2.13.jar', 'dom4j-1.6.1.jar'
        ];
        var jarids = ['poi-beta', 'poi-ooxml', 'xmlbeans', 'ooxml-schemas', 'log4j', 'dom4j'];
        var homedir = new java.io.File( java.lang.System.getProperty('user.home') );
        var libdir = new java.io.File( homedir, '.trephine/lib' );
        var base = applet['c'+'lass'].protectionDomain.codeSource.location.toString().replaceFirst('^(.*'+'/).*$', "$1");
        for (var i=0, l=jarids.length; i<l; i++) (function(id, jar){
          var jarfile = new java.io.File( libdir, jar );
          var msg = "<span class=\"spinner\"></span> " + jar + " <span>0</span>% complete."
          window.eval("window.top.document.getElementById('"+id+"').innerHTML = '"+msg+"';");
          download(new java.net.URL(base + jar), jarfile, {
            async: false,
            onprogress: function(count, len) {
              if (count >= len) {
                var msg = "<span class=\"check\"></span> " + jar + " Done!";
                return window.eval([
                  "window.top.document.getElementById('", id, "').innerHTML = '", msg, "';"
                ].join(''));
              }
              window.eval([
                "window.top.document.getElementById('", id, "').getElementsByTagName('span')[1].innerHTML = ",
                "'" + Math.round(100 * count / len) + "';"
              ].join(''));
            }
          });
          executor.addSystemJar(jarfile);
        })(jarids[i], jars[i]);
 
        // Switch back from download to main demo form
        var code = '(' + function(){
          var e = function(tag,index){ return window.top.document.getElementById('demotab').getElementsByTagName(tag)[index||0]; };
          e('div',1).style.display = 'none';
          e('p').innerHTML = 'Dependencies successfully loaded!';
          e('p').style.display = '';
          e('form').style.display = 'block';
          setTimeout(function(){ e('p').style.display='none'; }, 2000);
        } + ')()';
        window.eval( code.replace(new RegExp('\\r?\\n','g'), '') );
      }).run();
 
    } );
    e('p',2).innerHTML = 'Download in progress...';
    return false;
  };
 
  // Setup form
  e('form').onsubmit = function(event) {
    try {
      if (!trephine || !trephine.loaded) throw "Error: trephine not loaded.";
      if (!trephine.hasPermission()) throw "Error: elevated permissions have not been granted.";
      e('div').innerHTML = '<p class="notice" style="float:none"><span class="spinner"></span> Extracting data ...</p>';
      setTimeout( function(){
 
        // Here's where the real work gets done
        var result = trephine.js( function(filename) {
 
          // First, confirm that the file is actually a file that we can read
          filename = new java.lang.String(filename);
          var file = new java.io.File(filename);
          if (!file.exists()) throw "Error: File does not exist " + file.toString();
          if (!file.isFile()) throw "Error: Selected path is not a file " + file.toString();
          if (!file.canRead()) throw "Error: The selected file cannot be read " + file.toString();
 
          // Now we try to create a WorkBook
          try {
            var inp = new java.io.FileInputStream(file);
            var wb = Packages.org.apache.poi.ss.usermodel.WorkbookFactory.create(inp);
            if (!wb) throw "WorkBook is null";
          } catch(err) {
            throw "Error creating workbook from file - " + err.message;
          }
 
          // Cell value extractor
          var cellValue = function(cell) {
            if (!cell) return '';
            var t = cell.cellType, c = Packages.org.apache.poi.ss.usermodel.Cell;
            var du = Packages.org.apache.poi.ss.usermodel.DateUtil;
            if (t == c.CELL_TYPE_NUMERIC) return du.isCellDateFormatted(cell) ? cell.dateCellValue + '' : cell.numericCellValue + 0.0;
            if (t == c.CELL_TYPE_STRING) return cell.stringCellValue + '';
            if (t == c.CELL_TYPE_FORMULA) {
              try { return du.isCellDateFormatted(cell) ? cell.dateCellValue + '' : cell.numericCellValue + 0.0; } catch(err) { }
              try { return cell.stringCellValue + ''; } catch(err) { }
              try { return cell.numericCellValue + 0.0; } catch(err) { }
              try { return !!cell.booleanCellValue; } catch(err) { }
              try { return cell.errorCellValue + ''; } catch(err) { }
              return '';
            }
            if (t == c.CELL_TYPE_BLANK) return '';
            if (t == c.CELL_TYPE_BOOLEAN) return !!cell.booleanCellValue;
            if (t == c.CELL_TYPE_ERROR) return cell.errorCellValue + '';
            return '';
          };
 
          // Build a JSON data model of the workbook's sheets and return it
          var sheets = [];
          var p = function(s) { java.lang.System.out.println(s); };
          for (var i=0, l=wb.numberOfSheets; i<l; i++) {
            var sheet = { data: [], cols: 0, name: wb.getSheetName(i) + '' };
            var rit = wb.getSheetAt(i).rowIterator();
            while (rit.hasNext()) {
              var row = [], cols = 0, cit = rit.next().cellIterator();
              while (cit.hasNext()) row[cols++] = cellValue(cit.next());
              sheet.data[sheet.data.length] = row;
              if (cols > sheet.cols) sheet.cols = cols;
            }
            if (sheet.data.length) sheets[sheets.length] = sheet;
          }
 
          return JSON.stringify(sheets);
 
        }, e('input').value );
 
        // Show an error message if anything went wrong
        if (result.success.toString() != 'true') {
          var msg = result.error;
          try { msg = msg.toString() + ''; } catch(err) { }
          return e('div').innerHTML = [
            '<p class="error">', msg, '</p>',
            '<p><strong>Tip</strong>: Enable the Java Console to see full traces.<br />'
          ].join('');
        }
        var json = result.result;
        try { json = json.toString() + ''; } catch(err) {}
        try {
          var sheets = eval('(' + json + ')');
        } catch(err) {
          return e('div').innerHTML = '<p class="error">Error extracting data: ' + err.message + '</p>';
        }
 
        // Extraction successful, build HTML table
        var output = ['<h4>HTML Data:</h4>'];
        for (var i=0, l=sheets.length; i<l; i++) {
          var sheet = sheets[i];
          output[output.length] = "<h5>" + sheet.name + "</h5><div class=\"datatable\"><table class=\"wikitable\"><tbody>";
          for (var j=0, m=sheet.data.length; j<m; j++) {
            var row = sheet.data[j];
            output[output.length] = '<tr>';
            for (var k=0, n=sheet.cols; k<n; k++) {
              output[output.length] = '<td>';
              if (k<row.length) output[output.length] = row[k];
              output[output.length] = '</td>';
            }
            output[output.length] = '</tr>';
          }
          output[output.length] = '</tbody></table></div>';
        }
        output[output.length] = '<h4>JSON Data:</h4><textarea id="jsondata"></textarea>';
 
        // Set the output
        e('div').innerHTML = output.join('');
 
        // Shrink any datatable div's containing small tables
        var divs = e('div').getElementsByTagName('div');
        for (var i=0, l=divs.length; i<l; i++) {
          var div = divs[i], table = div.getElementsByTagName('table')[0];
          if (!table) continue;
          if (table.clientHeight < div.clientHeight) div.style.height = table.clientHeight + 'px';
        }
 
        // Inject the JSON data as a new text node
        document.getElementById('jsondata').value = json;
 
      }, 10 );
    } catch(err) {
      e('div').innerHTML = '<p class="errorbox" style="float:none">' + err + '</p>';
    }
    return false;
  };
 
  // Setup the browse button
  e('input', 1).onclick = function(evt) {
    window._browseCallback = function(path) {
      e('input').value = path.toString() + '';
      delete window._browseCallback;
    };
    var result = trephine.js( function() {
      (new java.lang.Thread(new java.lang.Runnable({
        run: function() {
          var dialog = new java.awt.FileDialog(
            new java.awt.Frame(),
            "Choose a directory",
            java.awt.FileDialog.LOAD
          ); 
          dialog.show();
          var path = (dialog.directory + dialog.file + '').replace(new RegExp('\\\\','g'), '\\\\');
          path = JSON.stringify(path);
          window.eval("setTimeout('window.top._browseCallback(" + path + ")', 10)");
        }
      }))).start();
      return true;
    } );
    if (result.error) throw "Error: " + result.error;
  };
};
 
// Load trephine 
status('<span class="spinner"></span> Loading trephine...');
trephine.load({
  debug: true, // Enables debug logging to the Java Console
  onload: function() { // Immediately prompt for permissions
    status('<span class="spinner"></span> Asking for elevated permissions...');
    trephine.askPermission( function(response) {
      if (!response) { // Short-circuit if permission request was denied
        return demotab.innerHTML = '<div class="error">Sorry, you must grant trephine privileges to enjoy this demo.</div>';
      }
      setup();
      status('Permission request granted.');
      setTimeout( function(){ status(); }, 2000 );
    } );
  } 
});
<p class="notice"></p>
<form style="display:none">
  <h4>Excel File:</h4>
  <p>
    <input class="filepath" type="text" value="">
    <input class="browse" type="button" value="browse"/>
    <input class="extract" type="submit" value="extract"/>
  </p>
  <div></div>
</form>
<div style="display:none">
  <p class="notice">
    This demo requires the following POI support jars.
    <strong><a href="" title="begin download">Click here to begin download</a></strong>
  </p>
  <ul>
    <li id="poi-beta">poi-3.5-beta5-20090219.jar (1.5 MB)</li>
    <li id="poi-ooxml">poi-ooxml-3.5-beta5-20090219.jar (333 KB)</li>
    <li id="xmlbeans">xmlbeans-2.3.0.jar (2.6 MB)</li>
    <li id="ooxml-schemas">ooxml-schemas-1.0.jar (13.6 MB)</li>
    <li id="log4j">log4j-1.2.13.jar (350 KB)</li>
    <li id="dom4j">dom4j-1.6.1.jar (307 KB)</li>
  </ul>
</div>
.demotab h4 {
  font-weight: bold;
}
.demotab .filepath {
  width: 400px;
}
.demotab td {
  border-bottom: 1px solid #ccc;
  border-left: 1px solid #ccc;
  white-space: nowrap;
  min-width: 6em;
}
.demotab table {
  border-right: 1px solid #ccc;
}
.demotab .spinner {
  padding: 10px 10px;
  background: url("skins/common/images/spinner.gif");
  background-position: center center;
  background-repeat: no-repeat;
}
.demotab .check {
  padding: 8px 10px;
  background: url("skins/trephine/accept.gif");
  background-position: center center;
  background-repeat: no-repeat;
}
.demotab form div p {
  overflow: auto;
}
.demotab .datatable {
  height: 358px;
  margin-bottom: 18px;
  overflow: auto;
  border: 1px solid #ccc;
  border-top: none;
}
.demotab h5 {
  background: #C3D9FF;
  margin-bottom: 0;
  border: 1px solid #ccc;
  height: 16px;
  padding: 0 1em;
}
.demotab textarea {
  height: 358px;
  overflow: auto;
  margin-bottom: 18px;
  border: 1px solid #ccc;
  background: #fffff0;
  width: 602px;
}

Tip: All code on this page is available under the MIT license as explained on the about page.

Activation

  1. Activate the demo by clicking the specified link.
  2. You may see a Java Security dialog, if so, click "Run" or "Trust" to indicate your acceptance
  3. In the "Trephine permission request" dialog, answer the math challenge and click "OK" to grant elevated privileges.
  4. If this is your first time running the demo, you will be prompted to download the POI dependencies, click the link to do so.
  5. Once all the dependency jars have been downloaded, the demo will be ready to use.

Usage

  1. Click the browse button and select an Excel file to open, or type in a path directly.
  2. Click the execute button to initiate data extraction, data will appear below.
  3. Repeat as many times as you like.

Caveats and Gotchas

Support for Microsoft Office 2007 Excel files is still in beta. Once POI releases a stable version with xlsx support, this shouldn't be a problem.

The Excel input files I've used so far have been very simple, so it's very likely that using a complex spreadsheet may break the demo. If that happens, and you're able to share the Excel file that did it, please leave a comment below, or email me directly. Thanks in advance!

--Jim R. Wilson (jimbojw) 07:29, 28 March 2009 (UTC)

Personal tools