Changeset - fcd16eecfcf4
[Not reviewed]
default
0 1 0
Nathan Brink (binki) - 15 years ago 2011-02-09 00:03:21
ohnobinki@ohnopublishing.net
cedarville: Insert hack into crawler so that the few semesters it understands will be crawled properly.
1 file changed with 10 insertions and 0 deletions:
0 comments (0 inline, 0 general)
school.d/cedarville.crawl.inc
Show inline comments
 
@@ -51,82 +51,92 @@ function table_parse($html)
 
  }
 
  return $arr;
 
}
 

	
 
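/** Crawls Cedarville course listings for every semester linked from the schedule index page; each semester's year and season are parsed from its link text. $semesters is taken by reference (presumably the crawled semesters are added to it -- confirm against the rest of the function); $verbosity controls how much progress output is printed (0 is quiet, higher is chattier). Returns 0 on success and 1 on failure. */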
 
function cedarville_crawl(array &$semesters, $verbosity = 1)
 
{  
 
  $basepath = 'http://cedarville.edu/courses/schedule/';
 

	
 
  if ($verbosity)
 
    echo "cedarville_crawl(): Beginning crawl of Cedarville:\n";
 

	
 
  if ($verbosity > 1)
 
    echo "cedarville_crawl(): Determining list of departments.\n";
 

	
 
  if ($verbosity > 1)
 
    fprintf(STDERR, "cedarville_crawl(): Determining list of semesters.\n");
 
  $semesters_dom = new DOMDocument();
 
  $semesters_dom->loadHTML(file_get_contents($basepath));
 

	
 
  $content_div_dom = $semesters_dom->getElementById('contenttext');
 
  if (!$content_div_dom)
 
    {
 
      fprintf(STDERR, "cedarville_crawl(): Error finding location of the list of departments.\n");
 
      if (count($semesters))
 
	{
 
	  fprintf(STDERR, "cedarville_crawl(): Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.\n");
 
	  return 0;
 
	}
 
      return 1;
 
    }
 
  $departments_xpath = new DOMXPath($semesters_dom);
 
  foreach ($departments_xpath->query('.//li/a') as $department_a_dom)
 
    {
 
      $semester_href = $department_a_dom->getAttribute('href');
 
      $semester_href_parts = split('_', $semester_href);
 

	
 
      $semester_name = $department_a_dom->textContent;
 
      if (stripos($semester_name, 'graduate') !== FALSE
 
	  || strpos($semester_href, 'index') === FALSE)
 
	/* cedarville has about 1 graduate course, lol */
 
	continue;
 
      $semester_name_parts = split(' ', $semester_name);
 

	
 
      $semester_year = $semester_name_parts[0];
 
      $semester_season = strtolower($semester_name_parts[1]);
 

	
 
      $semester = new Semester($semester_year, $semester_season);
 

	
 
      if ($verbosity > 1)
 
	fprintf(STDERR, "cedarville_crawl(): Crawling semester: %s.\n",
 
		$semester_name);
 

	
 
  /*
 
   * We need two passes because the first department's code name is
 
   * not available in the first pageload.
 
   */
 
  $departments = array();
 
  if (cedarville_crawl_departments_get($basepath . $semester_href, $departments, $semester_href_parts[0]))
 
    return 1;
 
  if (!count($departments))
 
    {
 
      echo "cedarville_crawl(): Unable to get a listing of departments.\n";
 
      if (count($semesters))
 
	{
 
	  fprintf(STDERR, "cedarville_crawl(): Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.\n");
 
	  return 0;
 
	}
 
      return 1;
 
    }
 
  /* find the first department whose name we don't yet know */
 
  if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0]))
 
    return 1;
 

	
 
  $tables = array();
 
  foreach ($departments as $department => $dept_name)
 
    {
 
      if ($verbosity > 2)
 
	echo 'cedarville_crawl(): Crawling department ' . $department . ' (' . $dept_name . ")...\n";
 
      $html = file_get_contents($basepath . $semester_href_parts[0] . '_' . $department . '_' . 'all.htm');
 
      if (!$html)
 
	continue;
 
      $tables[$department] = table_parse(cedarville_html_fix($html));
 
    }
 

	
 
  $meeting_type_maps = array('LAB' => 'lab', 'LECT' => 'lecture');
 

	
 
  foreach ($tables as $dept_table)
 
    {
 
      /*
 
       * Discard the first row, which has the contents of the <th />
 
       * elements.
0 comments (0 inline, 0 general)