Fast Scraping
Large Amounts of Data
While this window is showing instructions, the user interface of OutWit Hub remains operational.
You can still interact normally with the application and you can move this tutorial window around on the screen to better see the parts of the interface that you want.
// Wizard start-up: record whether the eye-catcher may be used, verify that the
// host browser and the OutWit kernel are usable, and only then prepare the Hub
// for the tutorial.
//
// The eye-catcher flag is false only when wizardKit reports the "mac" platform
// and the user agent matches both /firefox/i and /rv:1[2-7]/i.
userSpace.eyeCatcherOK = !(wizardKit.platform=="mac" && /firefox/i.test(navigator.userAgent) && /rv:1[2-7]/i.test(navigator.userAgent));
if (/Firefox\/[23]\./.test(navigator.userAgent)){
    alert("OutWit wizards cannot run on your version of Firefox. Please update to the current version and try again.");
    wizard.close();
} else if (!("witscript" in window) || !witscript.version || !witscript.version("3")){
    // NOTE(review): the message mentions 2.0.1 while the check asks for "3" -- confirm which is intended.
    alert("This wizard is not compatible with your version of the OutWit Kernel. Please download the latest version (2.0.1 or higher)");
    wizard.close();
} else {
    // Everything below touches witscript, so it must only run once the checks
    // above have passed. Previously it ran unconditionally and dereferenced
    // witscript even after the wizard had been closed as incompatible.
    // witscript.version("3") is known to be truthy on this branch.
    $(".owui-wizard-homelink").html("Hub Tutorials");
    wizardKit.hideCatch();
    wizardKit.hideLog();
    // Load the tutorial copy of the film list unless the URL bar already points at it.
    if(!(/work\/wikipedia\/List_of_highest-grossing_films_by_year/.test(witscript.toolbar.urlBar.getValue()))){
        witscript.views.page.load("http://www.outwit.com/support/help/hub/tutorials/work/wikipedia/List_of_highest-grossing_films_by_year.html");
    }
    userSpace.setWizardPrefs();
    witscript.logPanel.setAttribute("height",0);
}
// Step snippet: hand this step's container (the parent node of whatever `this`
// is bound to when the snippet runs) to wizardKit.say -- presumably to show the
// step's instructions; confirm against wizardKit.
wizardKit.say(this.parentNode);
// Ask the Hub to display its page view.
witscript.views.page.display();
// Remove the "disabled" attribute from the wizard's Next navigation control.
$(".owui-wizard-nav-next").removeAttr("disabled");
// Step snippet: make sure the tutorial copy of the film list is the current
// page, display it, and scroll it back to the top.
var filmListPathPattern = /work\/wikipedia\/List_of_highest-grossing_films_by_year/;
var filmListUrl = "http://www.outwit.com/support/help/hub/tutorials/work/wikipedia/List_of_highest-grossing_films_by_year.html";

wizardKit.say(this.parentNode);
wizardKit.hideCatch();
wizardKit.hideLog();

// Only reload when the URL bar does not already show the film list.
var alreadyOnFilmList = filmListPathPattern.test(witscript.toolbar.urlBar.getValue());
if (!alreadyOnFilmList) {
    witscript.views.page.load(filmListUrl);
}

witscript.views.page.display();
witscript.menutree.focus();
witscript.views.page.browser.scrollToPercent(0);
Film Details
We used this Wikipedia page earlier, in the first tutorial on table extraction.
This time, we want the available information about the films themselves and we will have to go and get it from 139 different pages.
// Step snippet: make sure the film list is shown, scroll the page down in small
// animated increments, then briefly put the table heading into the find bar.
var filmListPathPattern = /work\/wikipedia\/List_of_highest-grossing_films_by_year/;
var filmListUrl = "http://www.outwit.com/support/help/hub/tutorials/work/wikipedia/List_of_highest-grossing_films_by_year.html";

wizardKit.say(this.parentNode);
wizardKit.hideCatch();
wizardKit.hideLog();

// Only reload when the URL bar does not already show the film list.
var alreadyOnFilmList = filmListPathPattern.test(witscript.toolbar.urlBar.getValue());
if (!alreadyOnFilmList) {
    witscript.views.page.load(filmListUrl);
}

witscript.views.page.display();
witscript.menutree.focus();

// Step the scroll position from 0 upward in 0.1 increments while it stays
// below 2.4, pausing 50 between steps (presumably milliseconds -- confirm).
var scrollPosition = 0.0;
while (scrollPosition < 2.4) {
    witscript.views.page.browser.scrollToPercent(scrollPosition);
    witscript.wait(50);
    scrollPosition += 0.1;
}

// Put the table heading in the find bar, wait, then clear it again.
// NOTE(review): `page` is referenced as a bare global here, unlike
// `witscript.views.page` above -- confirm both name the same object.
page.findBar.textbox.setValue("High-grossing films by year of release");
witscript.menutree.focus();
witscript.wait(300);
page.findBar.textbox.setValue("");
We need to use the links that are in the table and explore all the pages to extract the details on each movie.
// Step snippet: re-enable Next, announce the step, then configure the export
// preview of the tables view and display that view.
$(".owui-wizard-nav-next").removeAttr("disabled");
wizardKit.say(this.parentNode);

var tablesView = witscript.views.tables;
tablesView.exportPreview.setAttribute("width", 100);
tablesView.previewSplitter.collapseAfter();
tablesView.exportPreview.exportType.setValue("excel");
tablesView.display();

witscript.menutree.focus();
The links to all film detail pages
are in the tables view, first column.
// Step snippet: build the scraper from the markup embedded in the wizard page,
// select the first 15 table rows, and fast-scrape the URLs they contain.
$(".owui-wizard-nav-next").removeAttr("disabled");

// Keep only the markup captured between "<!--" and "-->" in the #scraper element.
userSpace.theScraper = $("#scraper").html().replace(/^[\s\S]+?<!--[\s\S]*?(<[\s\S]+?>)[^<>]*?-->[^<>]*$/,"$1");
// When witscript.version("2.1.4.27") is falsy the scraper is wrapped in an
// E4X XML object -- presumably older kernels expect that form; confirm.
if (!witscript.version("2.1.4.27")) {
    userSpace.theScraper = new XML(userSpace.theScraper);
}

wizardKit.say(this.parentNode);
witscript.views.tables.exportPreview.setAttribute("width",100);
witscript.views.tables.previewSplitter.collapseAfter();
witscript.views.tables.exportPreview.exportType.setValue("excel");
witscript.views.tables.datasheet.selectAll();
witscript.views.tables.display();
witscript.menutree.focus();
witscript.wait(1500);
witscript.views.scraped.display();

// Select rows 0-14 and collect the selected cells as the URLs to scrape.
witscript.views.tables.datasheet.selectRows([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]);
witscript.views.tables.datasheet.setCurrentCell(0, 3);
// NOTE(review): `views` is used as a bare global here and below, next to
// `witscript.views` -- confirm both name the same object.
var URLsToExplore = views.tables.datasheet.getSelectedCells();
witscript.views.scraped.bottomPanel.emptyButton.click("left", true); // button is not visible but still works

// Scrape only when the scraped datasheet holds fewer than 2 rows.
if(views.scraped.datasheet.getRowCount() < 2) {
    $(".owui-wizard-nav-next").attr("disabled", "disabled");
    try {
        witscript.views.scraped.bottomPanel.pageLoad.empty.setValue(false);
        witscript.setPreference("applyScraper.tempo.min", "600");
        witscript.setPreference("applyScraper.tempo.max", "600");
        witscript.scrapeURLs(URLsToExplore, userSpace.theScraper);
        witscript.views.scraped.display();
        witscript.wait(5000);
    } finally {
        // Always give the Next button back, even if scraping threw; otherwise
        // the user would be stuck on this step with navigation disabled.
        $(".owui-wizard-nav-next").removeAttr("disabled");
    }
}
Selecting all these links lets us ask OutWit Hub to apply a scraper to all of them, by right-clicking on one of them and choosing
'Auto-Explore > Fast Scrape'.
// Step snippet: select table rows 15-49 and fast-scrape the URLs they contain,
// using a shorter tempo preference than the previous batch.
$(".owui-wizard-nav-next").removeAttr("disabled");
wizardKit.say(this.parentNode);

var rows = [];
var startRow = 15;
for (var i = startRow; i < 50; i++) {
    rows.push(i);
}
witscript.views.tables.datasheet.selectRows(rows);
witscript.views.tables.datasheet.setCurrentCell(startRow, 3);
// NOTE(review): `views` is used as a bare global here and below, next to
// `witscript.views` -- confirm both name the same object.
var URLsToExplore = views.tables.datasheet.getSelectedCells();
witscript.views.scraped.display();

// Scrape only when the scraped datasheet holds fewer than 20 rows.
if(views.scraped.datasheet.getRowCount() < 20) {
    $(".owui-wizard-nav-next").attr("disabled", "disabled");
    try {
        witscript.views.scraped.bottomPanel.pageLoad.empty.setValue(false);
        witscript.setPreference("applyScraper.tempo.min", "300");
        witscript.setPreference("applyScraper.tempo.max", "300");
        witscript.scrapeURLs(URLsToExplore, userSpace.theScraper);
        witscript.wait(7500);
    } finally {
        // Always give the Next button back, even if scraping threw; otherwise
        // the user would be stuck on this step with navigation disabled.
        $(".owui-wizard-nav-next").removeAttr("disabled");
    }
}
You can set the preferences to go faster...
// Step snippet: select every table row from index 50 to the end and fast-scrape
// the URLs they contain with the tempo preferences set to "100".
$(".owui-wizard-nav-next").removeAttr("disabled");
wizardKit.say(this.parentNode);

var startRow = 50;
var rows = [];
// The row count is re-read on every pass, exactly as a for-loop condition would.
var rowIndex = startRow;
while (rowIndex < views.tables.datasheet.getRowCount()) {
    rows.push(rowIndex);
    rowIndex++;
}

witscript.views.tables.datasheet.selectRows(rows);
witscript.views.tables.datasheet.setCurrentCell(startRow, 3);
var URLsToExplore = views.tables.datasheet.getSelectedCells();

// Scrape only when the scraped datasheet holds fewer than 60 rows.
var needsScraping = views.scraped.datasheet.getRowCount() < 60;
if (needsScraping) {
    witscript.views.scraped.bottomPanel.pageLoad.empty.setValue(false);
    witscript.setPreference("applyScraper.tempo.min", "100");
    witscript.setPreference("applyScraper.tempo.max", "100");
    witscript.scrapeURLs(URLsToExplore, userSpace.theScraper);
    witscript.views.scraped.display();
}
or even faster...
...depending on the available bandwidth and the terms of service of the site you are scraping. Please do not overuse server resources.
(Note that, to set a good example and avoid imposing this load on Wikipedia, these demo copies are hosted on OutWit servers; so if scraping runs any slower than 4 or 5 pages per second, it is because of our servers.)
// Closing step snippet: set the scraped view's pageLoad "empty" option back to
// true, announce the step, restyle the home link, and pick the closing title
// from the number of scraped rows.
witscript.views.scraped.bottomPanel.pageLoad.empty.setValue(true);
wizardKit.say(this.parentNode);
// wizardKit.restoreOriginalPrefs();
// XXX JC: This should not be here. Move to the close button (or event)
witscript.menutree.focus();

var homeLink = $(".owui-wizard-homelink");
homeLink.attr("style","color: #DFFFF9 !important; float:left;");
homeLink.html("More Tutorials");

// Loose equality is kept on purpose: the type returned by getRowCount() is not
// visible from here.
var allPagesScraped = views.scraped.datasheet.getRowCount() == 139;
$("#endTitle").html(allPagesScraped ? "The 139 Pages Were Scraped." : "Your Turn to Use This Powerful Tool");
Fast scraping doesn't apply to all cases, though.
AJAX pages, for instance, are generated dynamically by scripts. As the Fast Scraping mode doesn't execute scripts, these can only be scraped during a regular exploration (automatic Browse or Dig) in which pages are actually rendered and scripts executed.
All this is explained in the Help Center. Please read about the difference between Browsing and Fast Scraping.