Changeset - 3c5df1fe0953
[Not reviewed]
default
0 1 0
Nathan Brink (binki) - 15 years ago 2011-01-31 20:47:48
ohnobinki@ohnopublishing.net
Dynamically calculate the list of departments when crawling Cedarville's registration data. Should fix bug 73 rather permanently, or until the format of the page changes.
1 file changed with 64 insertions and 8 deletions:
0 comments (0 inline, 0 general)
school.d/cedarville.inc
Show inline comments
 
@@ -99,18 +99,35 @@ function cedarville_crawl($semester, $ve
 

	
 
  $season = strtolower(substr($semester->season_get(), 0, 2));
 
  $year = $semester->year_get();
 
  $season_string = $year . $season;
 

	
 
  /* Current academic departments. Update as needed. */
 
  $departments = array('ad', 'be','ba','ca','ed','eg','es','hg','id','ll','ms','mu','ns','ph','py','sm','sw');
 
  $basepath = "http://cedarville.edu/courses/schedule/";
 
  $basepath = 'http://cedarville.edu/courses/schedule/';
 

	
 
  if ($verbosity)
 
    echo "cedarville_crawl(): Beginning crawl of Cedarville:\n";
 

	
 
  echo "cedarville_crawl(): Beginning crawl of Cedarville:\n";
 
  if ($verbosity > 1)
 
    echo "cedarville_crawl(): Determining list of departments.\n";
 
  /*
 
   * We need two passes because the first department's code name is
 
   * not available in the first pageload.
 
   */
 
  $departments = array();
 
  if (cedarville_crawl_departments_get($basepath . $year . $season . '_index.htm', $departments, $season_string))
 
    return 1;
 
  if (!count($departments))
 
    {
 
      echo "cedarville_crawl(): Unable to get a listing of departments.\n";
 
      return 1;
 
    }
 
  /* find the first department whose name we don't yet know */
 
  if (cedarville_crawl_departments_get($basepath . $year . $season . '_' . current(array_keys($departments)) . '_all.htm', $departments, $season_string))
 
    return 1;
 

	
 
  $season = strtolower($season);
 
  $tables = array();
 
  foreach($departments as $department)
 
  foreach ($departments as $department => $dept_name)
 
    {
 
      echo "cedarville_crawl(): Crawling department \"$department\"...\n";
 
      echo 'cedarville_crawl(): Crawling department ' . $department . ' (' . $dept_name . ")...\n";
 
      $html = file_get_contents($basepath . $year . $season . '_' . $department . '_' . 'all.htm');
 
      if (!$html)
 
	continue;
 
@@ -259,10 +276,49 @@ function cedarville_crawl($semester, $ve
 

	
 
/**
 * \brief
 *   Scan one of Cedarville's course listing pages for departments.
 *
 * Downloads the page at $dept_url, parses it as HTML and looks at the
 * links found in the first two <span> children of the element with
 * id="contenttext". Each link's href is expected to look like
 * "<season_string>_<code>_<suffix>.htm"; the <code> part is used as
 * the department code and the link's text as its friendly name.
 *
 * \param $dept_url
 *   The URL of the listing page to download and scan.
 * \param $departments
 *   An associative array mapping department codes onto department
 *   friendly names. Every department found on the page is added to
 *   this array (overwriting any existing entry with the same code).
 * \param $season_string
 *   The prefix every department link's href must start with (the
 *   caller builds this from the year followed by the season code).
 * \return
 *   0 on success, 1 on failure (an error message is printed first).
 */
function cedarville_crawl_departments_get($dept_url, array &$departments, $season_string)
{
  $html = file_get_contents($dept_url);
  /*
   * Bail out early on a failed or empty download instead of handing
   * a non-document to the HTML parser.
   */
  if ($html === false || $html === '')
    {
      echo 'cedarville_crawl(): Error determining list of available departments: Unable to download ' . $dept_url . ".\n";
      return 1;
    }

  $dept_dom = new DOMDocument();
  if (!$dept_dom->loadHTML(cedarville_html_fix($html)))
    {
      echo "cedarville_crawl(): Error determining list of available departments: Unable to parse HTML.\n";
      return 1;
    }
  $xpath = new DOMXPath($dept_dom);

  /* The department links live in the first two <span>s of the content area. */
  $dept_node_list = $xpath->query('/descendant::div[@id="contenttext"]/child::span[position()=1 or position()=2]/child::a');

  /* The season prefix is constant for the whole page, so build the pattern once. */
  $href_pattern = '/^' . preg_quote($season_string, '/') . '_([a-z]+)_[a-z]+\.htm$/';
  foreach ($dept_node_list as $dept_node)
    {
      $href = $dept_node->getAttribute('href');
      if (!preg_match($href_pattern, $href, $matches))
        {
          echo 'cedarville_crawl(): Error determining list of available departments: Unable to parse the department string out of href="' . $href . "\".\n";
          return 1;
        }

      /* $matches[1] is the department code captured from the href. */
      $dept = $matches[1];
      $departments[$dept] = $dept_node->textContent;
    }

  return 0;
}
 

	
 
/**
 * \brief
 *   Fix some incorrect usage of the HTML entity delimiter, the ampersand.
 *
 * Bare ampersands are rewritten as "&amp;" so that the markup can be
 * handed to an HTML parser, and ID="LINKS" / ID="HERE" attributes are
 * removed.
 *
 * \param $html
 *   The raw HTML text to clean up.
 * \return
 *   The cleaned-up HTML text.
 */
function cedarville_html_fix($html)
{
  /* In a doubled ampersand, the first one cannot be starting an entity. */
  $html = preg_replace('/&&/', '&amp;&', $html);
  /*
   * An ampersand followed by five characters without a terminating
   * semicolon is treated as a literal ampersand rather than the
   * start of an entity.
   */
  $html = preg_replace('/&([^;]{5})/', '&amp;$1', $html);
  /*
   * Drop the ID="LINKS" and ID="HERE" attributes.
   * NOTE(review): presumably these IDs are repeated or otherwise
   * upset the parser -- confirm against the crawled pages.
   */
  $html = preg_replace('/ID="(LINKS|HERE)"/', '', $html);

  return $html;
}
0 comments (0 inline, 0 general)