Yet another thing on my to-do list has been either to get someone to start updating the product catalog on the Tech And House site, or to write a scraper to extract the information from the Frigidaire site. I didn’t realize how easy it was to whip up an extraction script using XPath! First, if you don’t already have the XPather add-on for Firefox, download it. It makes determining the XPath to elements on the page very easy:



Using the code below, scraping data from pretty much any site shouldn’t be too difficult. To build a scraper for another site, the main changes will be to the XPaths in frigidaire_scraper.php and to its extract_specs() method.

File 1: scraper.php

<?php
/**
 * Loads an HTML page into a DOMDocument and pulls values out of it with XPath.
 *
 * A page is loaded on construction; calling scraper() again replaces the
 * loaded page with another one.
 */
class Scraper {
  /** Value returned by libxml_use_internal_errors(true) on the latest call to scraper(). */
  var $oldSetting;
  /** DOMDocument holding the most recently loaded page. */
  var $html;

  /**
   * Real constructor. A method named after the class is no longer treated as
   * a constructor by PHP 8, so without this `new Scraper($url)` would never
   * load the page and extract() would fail on an unset $html.
   *
   * @param string $targetUrl URL or path handed to DOMDocument::loadHTMLFile().
   */
  function __construct($targetUrl) {
    $this->scraper($targetUrl);
  }

  /**
   * Loads (or re-loads) a page into $this->html.
   *
   * Kept as a public method under its original name because subclasses call
   * parent::scraper($url) to switch to another page.
   *
   * @param string $targetUrl URL or path handed to DOMDocument::loadHTMLFile().
   */
  function scraper($targetUrl) {
    // Buffer libxml parse errors internally instead of emitting PHP warnings
    // for malformed markup, and start from an empty error buffer.
    $this->oldSetting = libxml_use_internal_errors(true);
    libxml_clear_errors();

    $this->html = new DOMDocument();
    $this->html->loadHTMLFile($targetUrl);
  }

  /**
   * Runs $links_xpath against the loaded page and, for every node it matches,
   * evaluates $extract against a standalone copy of that node.
   *
   * @param string $links_xpath    XPath selecting the nodes to iterate over.
   * @param string $extract        XPath evaluated against each selected node
   *                               (e.g. "@href" or "text()[1]"); only the
   *                               first match is used.
   * @param bool   $include_blanks When false, empty results are left out.
   * @return array Trimmed string values, in document order. A node for which
   *               $extract matches nothing contributes "".
   */
  function extract($links_xpath, $extract, $include_blanks = true) {
    $return = array();

    $pageXpath = new DOMXPath($this->html);
    $items = $pageXpath->query($links_xpath);
    if ($items === false) {
      // Malformed expression: nothing can be extracted.
      return $return;
    }

    foreach ($items as $item) {
      // Copy the node into its own document so that $extract is evaluated
      // with this node as the root element.
      $newDom = new DOMDocument;
      $newDom->appendChild($newDom->importNode($item, true));

      $itemXpath = new DOMXPath($newDom);
      $matches = $itemXpath->query($extract);
      $first = ($matches === false) ? null : $matches->item(0);

      // item(0) is null when nothing matched; treat that as a blank value
      // instead of dereferencing null.
      $extraction = ($first === null) ? "" : trim((string) $first->nodeValue);

      if ($include_blanks || $extraction !== "") {
        $return[] = $extraction;
      }
    }

    return $return;
  }
}
?>

File 2: frigidaire_scraper.php

<?php
require ("scraper.php");
 
/**
 * Scraper with the XPaths for the Frigidaire site: collects category links,
 * then product links from each category page, then the model, price and
 * specification values from each product page.
 *
 * The three extract_* methods are meant to be called in that order, since
 * each one iterates over the links gathered by the previous one.
 */
class FrigidaireScraper extends Scraper {

  /** hrefs collected by extract_category_links(). */
  var $category_links = array();
  /** hrefs collected by extract_product_links(). */
  var $product_links = array();
  /** Map of model => (column name => value), filled by extract_specs(). */
  var $products = array();
  /** Every column name seen across all products, filled by extract_specs(). */
  var $keys = array();

  /**
   * Real constructor. PHP 8 no longer treats a method named after the class
   * as a constructor, so the page load has to be triggered from here.
   *
   * @param string $targetUrl Page to load first (the site's start page).
   */
  function __construct($targetUrl) {
    parent::scraper($targetUrl);
  }

  /**
   * Former PHP 4-style constructor, kept as a plain method so existing
   * explicit calls to it keep working.
   *
   * @param string $targetUrl Page to load.
   */
  function FrigidaireScraper($targetUrl) {
    parent::scraper($targetUrl);
  }

  /**
   * Stores the href of every sub-item link in the left navigation of the
   * currently loaded page in $this->category_links.
   */
  function extract_category_links() {
    $this->category_links = $this->extract("//div[@id='left-nav']/a[@class='left-nav-item-sub']", "@href");
  }

  /**
   * Loads every category page and accumulates the product hrefs found in its
   * product table into $this->product_links.
   *
   * NOTE(review): each href is passed to the loader as-is, so this assumes
   * the links are absolute (or otherwise directly loadable) - confirm.
   */
  function extract_product_links() {
    $this->product_links = array();

    foreach ($this->category_links as $link) {
      parent::scraper($link);
      $found = $this->extract("//table[@id='ProductTable']//h4/a", "@href");
      $this->product_links = array_merge($this->product_links, $found);
    }
  }

  /**
   * Loads every product page and records its price, model, link and
   * "name: value" specification lines in $this->products (keyed by model),
   * while collecting the union of all column names in $this->keys.
   */
  function extract_specs() {
    $this->keys = array();
    $this->products = array();

    foreach ($this->product_links as $link) {
      parent::scraper($link);

      // The <h5> in the main area carries the model in its first text node
      // and the price in its third; in both cases the wanted value is the
      // second token of that text.
      $model = $this->second_token($this->extract("//div[@id='main-inner']//h5", "text()[1]"), ": ");
      $msrp = $this->second_token($this->extract("//div[@id='main-inner']//h5", "text()[3]"), " ");

      $specs = $this->extract("//div[@id='ctl00_CPHMain_TabContainer1_TabSpecifications']/div[2]/div/div[1]/div", "text()", false);

      $this->products[$model]["Price"] = $msrp;
      $this->products[$model]["Model"] = $model;
      $this->products[$model]["Link"] = $link;

      foreach ($specs as $spec) {
        // Limit of 2 keeps values that themselves contain ": " intact.
        $pair = explode(": ", $spec, 2);
        // A line without a separator yields a column with an empty value
        // rather than an undefined-offset read.
        $this->products[$model][$pair[0]] = isset($pair[1]) ? $pair[1] : "";
      }

      $this->keys = array_unique(array_merge($this->keys, array_keys($this->products[$model])));
    }
  }

  /**
   * Splits the last entry of $values on $delimiter and returns the second
   * piece, or "" when there is no entry or no second piece.
   *
   * Works on a local variable, which avoids handing a function result to a
   * by-reference array function such as array_pop().
   *
   * @param array  $values    Values returned by extract().
   * @param string $delimiter Separator to split on.
   * @return string
   */
  private function second_token($values, $delimiter) {
    if (empty($values)) {
      return "";
    }

    $parts = explode($delimiter, (string) end($values));

    return isset($parts[1]) ? $parts[1] : "";
  }
}
?>

File 3: scrape_it.php (bring it all together)

<?php
require_once("frigidaire_scraper.php");
 
// Crawl the site in three passes: category links, then product links,
// then the specification values on every product page.
$scraper = new FrigidaireScraper("http://www.frigidaire.com");
$scraper->extract_category_links();
$scraper->extract_product_links();
$scraper->extract_specs();

// Emit a simple tab-delimited table: a header line with every column name,
// followed by one line per product. Each cell is followed by a tab.
$columns = $scraper->keys;

$header = "";
foreach ($columns as $column) {
  $header .= $column . "\t";
}
echo $header . "\n";

foreach ($scraper->products as $product) {
  $line = "";
  foreach ($columns as $column) {
    // Products that lack a given column get an empty cell.
    $line .= (isset($product[$column]) ? $product[$column] : "") . "\t";
  }
  echo $line . "\n";
}
?>

Scrape Output: