Changeset - fcd16eecfcf4
[Not reviewed]
default
0 1 0
Nathan Brink (binki) - 15 years ago 2011-02-09 00:03:21
ohnobinki@ohnopublishing.net
cedarville: Insert hack into crawler so that the few semesters it understands will be crawled properly.
1 file changed with 10 insertions and 0 deletions:
0 comments (0 inline, 0 general)
school.d/cedarville.crawl.inc
Show inline comments
 
@@ -63,24 +63,29 @@ function cedarville_crawl(array &$semest
 
  if ($verbosity > 1)
 
    echo "cedarville_crawl(): Determining list of departments.\n";
 

	
 
  if ($verbosity > 1)
 
    fprintf(STDERR, "cedarville_crawl(): Determining list of semesters.\n");
 
  $semesters_dom = new DOMDocument();
 
  $semesters_dom->loadHTML(file_get_contents($basepath));
 

	
 
  $content_div_dom = $semesters_dom->getElementById('contenttext');
 
  if (!$content_div_dom)
 
    {
 
      fprintf(STDERR, "cedarville_crawl(): Error finding location of the list of departments.\n");
 
      if (count($semesters))
 
	{
 
	  fprintf(STDERR, "cedarville_crawl(): Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.\n");
 
	  return 0;
 
	}
 
      return 1;
 
    }
 
  $departments_xpath = new DOMXPath($semesters_dom);
 
  foreach ($departments_xpath->query('.//li/a') as $department_a_dom)
 
    {
 
      $semester_href = $department_a_dom->getAttribute('href');
 
      $semester_href_parts = split('_', $semester_href);
 

	
 
      $semester_name = $department_a_dom->textContent;
 
      if (stripos($semester_name, 'graduate') !== FALSE
 
	  || strpos($semester_href, 'index') === FALSE)
 
	/* cedarville has about 1 graduate course, lol */
 
@@ -97,24 +102,29 @@ function cedarville_crawl(array &$semest
 
		$semester_name);
 

	
 
  /*
 
   * We need two passes because the first department's code name is
 
   * not available in the first pageload.
 
   */
 
  $departments = array();
 
  if (cedarville_crawl_departments_get($basepath . $semester_href, $departments, $semester_href_parts[0]))
 
    return 1;
 
  if (!count($departments))
 
    {
 
      echo "cedarville_crawl(): Unable to get a listing of departments.\n";
 
      if (count($semesters))
 
	{
 
	  fprintf(STDERR, "cedarville_crawl(): Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.\n");
 
	  return 0;
 
	}
 
      return 1;
 
    }
 
  /* find the first department whose name we don't yet know */
 
  if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0]))
 
    return 1;
 

	
 
  $tables = array();
 
  foreach ($departments as $department => $dept_name)
 
    {
 
      if ($verbosity > 2)
 
	echo 'cedarville_crawl(): Crawling department ' . $department . ' (' . $dept_name . ")...\n";
 
      $html = file_get_contents($basepath . $semester_href_parts[0] . '_' . $department . '_' . 'all.htm');
0 comments (0 inline, 0 general)