Changeset - 3db76bd5a41c
[Not reviewed]
default
0 6 0
Nathan Brink (binki) - 14 years ago 2011-10-08 01:31:20
ohnobinki@ohnopublishing.net
Refactor crawling to write out data a semester at a time instead of for all semesters at once, enabling PHP to use a reasonable memory limit. Convert calvin, cedarville, and ccbcmd to the new crawling method.
6 files changed with 592 insertions and 192 deletions:
0 comments (0 inline, 0 general)
inc/admin.inc
Show inline comments
 
@@ -152,39 +152,6 @@ function school_cache($schools)
 
						  'weight' => $semester_weights ++,
 
						  'name' => $semester->name_get(),
 
						  );
 

	
 
	      $cache_auto_school_semester_dir_name = $cache_auto_school_dir_name . $semester->id() . DIRECTORY_SEPARATOR;
 
	      if (!is_dir($cache_auto_school_semester_dir_name))
 
		{
 
		  if (!mkdir($cache_auto_school_semester_dir_name, 0755, TRUE))
 
		    error_log('Unable to create needed directory: `' . $cache_auto_school_semester_dir_name . '\'');
 
		}
 

	
 
	      $departments = $semester->departments_get();
 
	      sort($departments);
 

	
 
	      $dept_file = fopen($cache_auto_school_semester_dir_name . '-depts', 'wb');
 
	      fwrite($dept_file, serialize($departments));
 
	      fclose($dept_file);
 

	
 
	      /* now per-department autocomplete */
 
	      foreach ($departments as $department)
 
		{
 
		  $classes = $semester->department_classes_get($department);
 
		  $classes_file = fopen($cache_auto_school_semester_dir_name . $department . '.sects', 'wb');
 
		  fwrite($classes_file, serialize($classes));
 
		  fclose($classes_file);
 

	
 
		  /* now individual section informations, pre-JSON-ized */
 
		  foreach ($classes as $class)
 
		    {
 
		      if (!is_dir($cache_auto_school_semester_dir_name . $department))
 
			mkdir($cache_auto_school_semester_dir_name . $department);
 
		      $class_file = fopen($cache_auto_school_semester_dir_name . $department . DIRECTORY_SEPARATOR . $class, 'wb');
 
		      fwrite($class_file, json_encode($semester->class_get($department, $class)->to_json_array()));
 
		      fclose($class_file);
 
		    }
 
		}
 
	    } /* foreach ( => $semester) */
 
	  /*
 
	   * Store/cache the semester metadata:
 
@@ -260,10 +227,20 @@ function school_cache_semesters_sort (Se
 
 */
 
function school_crawl(array &$school, Page $page = NULL, $verbosity = 1)
 
{
 
  $cache_dir_name = dirname(__FILE__) . DIRECTORY_SEPARATOR . '..'
 
    . DIRECTORY_SEPARATOR . 'cache' . DIRECTORY_SEPARATOR;
 
  $cache_auto_dir_name = $cache_dir_name . 'auto' . DIRECTORY_SEPARATOR;
 
  $cache_auto_school_dir_name = $cache_auto_dir_name . $school['id'] . DIRECTORY_SEPARATOR;
 
  if (!is_dir($cache_auto_school_dir_name))
 
    {
 
      if (!mkdir($cache_auto_school_dir_name, 0755, TRUE))
 
	error_log('Unable to create needed directory: `' . $cache_auto_dir_name . '\'');
 
    }
 

	
 
  $school['crawled'] = FALSE;
 

	
 
  $school_crawl_func = $school['id'] . '_crawl';
 
  if (!function_exists($school_crawl_func))
 
  $school_crawl_semesters_list_func = $school['id'] . '_crawl_semester_list';
 
  if (!function_exists($school_crawl_semesters_list_func))
 
    return;
 

	
 
  $school_crawl_log_opts = array('verbosity' => $verbosity);
 
@@ -276,20 +253,81 @@ function school_crawl(array &$school, Pa
 
  $semesters = array();
 

	
 
  if ($verbosity > 0)
 
    school_crawl_logf($school_crawl_log, 2, "Calling crawler...");
 
    school_crawl_logf($school_crawl_log, 2, "Calling %s()...", $school_crawl_semesters_list_func);
 

	
 
  $ret = $school_crawl_func($semesters, $school_crawl_log, $verbosity);
 
  $ret = $school_crawl_semesters_list_func($school, $semesters, $school_crawl_log);
 
  if ($ret)
 
    {
 
      school_crawl_logf($school_crawl_log, 1, "Crawling %s failed: %s() returned nonzero",
 
			$school['id'], $school_crawl_func);
 
			$school['id'], $school_crawl_semesters_list_func);
 
      school_crawl_logf($school_crawl_log, 6, "");
 
      return;
 
    }
 

	
 
  $successful_semesters = array();
 
  $school_crawl_semester_func = $school['id'] . '_crawl_semester';
 
  if (!function_exists($school_crawl_semester_func))
 
    {
 
      school_crawl_logf($school_crawl_log, 3, "%s() is defined but %s() isn't.",
 
			$school_crawl_semesters_list_func, $school_crawl_semester_func);
 
      return;
 
    }
 

	
 
  foreach ($semesters as $semester)
 
    {
 
      school_crawl_logf($school_crawl_log, 2, "Calling %s(%s)...", $school_crawl_semester_func, $semester);
 
      $ret = $school_crawl_semester_func($school, $semester, $school_crawl_log);
 
      if ($ret)
 
	{
 
	  school_crawl_logf($school_crawl_log, 1, "Failed to crawl semester %s. Skipping semester.", $semester);
 
	  continue;
 
	}
 

	
 
      /*
 
       * Write out this semester's cache now that we're here.
 
       */
 
      $cache_auto_school_semester_dir_name = $cache_auto_school_dir_name . $semester->id() . DIRECTORY_SEPARATOR;
 
	      if (!is_dir($cache_auto_school_semester_dir_name))
 
		{
 
		  if (!mkdir($cache_auto_school_semester_dir_name, 0755, TRUE))
 
		    error_log('Unable to create needed directory: `' . $cache_auto_school_semester_dir_name . '\'');
 
		}
 

	
 
	      $departments = $semester->departments_get();
 
	      sort($departments);
 

	
 
	      $dept_file = fopen($cache_auto_school_semester_dir_name . '-depts', 'wb');
 
	      fwrite($dept_file, serialize($departments));
 
	      fclose($dept_file);
 

	
 
	      /* now per-department autocomplete */
 
	      foreach ($departments as $department)
 
		{
 
		  $classes = $semester->department_classes_get($department);
 
		  $classes_file = fopen($cache_auto_school_semester_dir_name . $department . '.sects', 'wb');
 
		  fwrite($classes_file, serialize($classes));
 
		  fclose($classes_file);
 

	
 
		  /* now individual section informations, pre-JSON-ized */
 
		  foreach ($classes as $class)
 
		    {
 
		      if (!is_dir($cache_auto_school_semester_dir_name . $department))
 
			mkdir($cache_auto_school_semester_dir_name . $department);
 
		      $class_file = fopen($cache_auto_school_semester_dir_name . $department . DIRECTORY_SEPARATOR . $class, 'wb');
 
		      fwrite($class_file, json_encode($semester->class_get($department, $class)->to_json_array()));
 
		      fclose($class_file);
 
		    }
 
		}
 

	
 
      /* Purge the data written to disk from memory */
 
      $semester->purge();
 

	
 
      school_crawl_logf($school_crawl_log, 6, "");
 
      $successful_semesters[] = $semester;
 
    }
 

	
 
  $school['crawled'] = TRUE;
 
  $school['crawled_semesters'] = $semesters;
 

	
 
  school_crawl_logf($school_crawl_log, 6, "");
 
  $school['crawled_semesters'] = $successful_semesters;
 

	
 
  return $school_crawl_log;
 
}
 
@@ -319,7 +357,7 @@ function school_cache_recreate($crawl_on
 
    foreach ($crawl_only as $crawl_only_school_id)
 
      if (!in_array($crawl_only_school_id, $school_id_list))
 
	{
 
	  fprintf(STDERR, "error: Invalid school_id specified for crawling: %s",
 
	  fprintf(STDERR, "error: Invalid school_id specified for crawling: %s\n",
 
		  $crawl_only_school_id);
 
	  return 1;
 
	}
inc/class.semester.inc
Show inline comments
 
@@ -369,4 +369,13 @@ class Semester
 
  {
 
    return array(self::SEASON_SPRING, self::SEASON_SUMMER, self::SEASON_FALL);
 
  }
 

	
 
  /**
   * \brief
   *   Clean the semester of all sections, keeping metadata intact.
   *
   * Resets the departments member to an empty array; no other member
   * of the object is touched here.
   */
 
  public function purge()
 
  {
 
    $this->departments = array();
 
  }
 
}
inc/school.crawl.inc
Show inline comments
 
@@ -126,18 +126,24 @@ function school_crawl_time_format($time)
 

	
 
/**
 
 * \brief
 
 *   Equivalent of mktime() except that it accepts strptime()'s output
 
 *   format as an input.
 
 *   Equivalent of gmmktime() except that it accepts strptime()'s
 
 *   output format as an input.
 
 *
 
 * \param $tm
 
 *   An array formatted as the output of strptime().
 
 * \param $timezone_offset
 
 *   Optional offset of the school's timezone in seconds from
 
 *   UTC. This offset gets _added_ to the resulting timestamp. So, for
 
 *   example, Eastern Daylight Time would use a value of 60*60 * -4
 
 *   since it is -0400 during Daylight time.
 
 * \return
 
 *   A unix timestamp.
 
 */
 
function school_crawl_mktime(array $tm)
 
function school_crawl_gmmktime(array $tm, $timezone_offset = 0)
 
{
 
  return mktime($tm['tm_hour'], $tm['tm_min'], $tm['tm_sec'],
 
		$tm['tm_mon'] + 1, $tm['tm_mday'], $tm['tm_year'] + 1900);
 
  return gmmktime($tm['tm_hour'], $tm['tm_min'], $tm['tm_sec'],
 
		$tm['tm_mon'] + 1, $tm['tm_mday'], $tm['tm_year'] + 1900)
 
    + $timezone_offset;
 
}
 

	
 
/**
school.d/calvin.crawl.inc
Show inline comments
 
@@ -20,16 +20,26 @@
 

	
 
/**
 
 * \brief
 
 *   Crawl's Calvin's registration course listing pages.
 
 *   Retrieve a list of crawlable semesters from Calvin College.
 
 *
 
 * \param $school
 
 *   The calvin school handle.
 
 * \param $semesters
 
 *   An array to be filled with Semester objects which I should
 
 *   populate.
 
 *   The array to populate with empty Semester objects.
 
 * \param $school_crawl_log
 
 *   A school_crawl_log handle.
 
 *   A school_crawl_log handle for informing the user/developer of
 
 *   progress.
 
 */
 
function calvin_crawl(array &$semesters, &$school_crawl_log)
 
function calvin_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
 
{
 
  $season_map = array(
 
		      'FA' => Semester::SEASON_FALL,
 
		      'IN' => 'interim',
 
		      'SP' => Semester::SEASON_SPRING,
 
		      'MA' => 'may',
 
		      /* I don't know if SU is a valid Calvin Semester ID or not */
 
		      'SU' => Semester::SEASON_SUMMER);
 

	
 
  /**
 
   * The first link we start at is the one from KV into WebAdvisor.
 
   *
 
@@ -48,95 +58,68 @@ function calvin_crawl(array &$semesters,
 
   */
 

	
 
  $cookies = array();
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $semesters_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
 

	
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $departments_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
 

	
 
  $departments_dom = new DOMDocument();
 
  $departments_dom->loadHTML($departments_html);
 
  $semesters_dom = new DOMDocument();
 
  $semesters_dom->loadHTML($semesters_html);
 

	
 
  /*
 
   * Discover the available semesters
 
   */
 
  $semesters_select_nodes = $departments_dom->getElementById('VAR1')->childNodes;
 
  $semester_strs = array();
 
  $semesters_var1 = $semesters_dom->getElementById('VAR1');
 
  if (empty($semesters_var1))
 
    {
 
      school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters.");
 
      return 1;
 
    }
 
  $semesters_select_nodes = $semesters_var1->childNodes;
 
  foreach ($semesters_select_nodes as $semester_node)
 
    {
 
      if ($semester_node->tagName != 'option'
 
	  || !$semester_node->hasAttribute('value')
 
	  || !strlen($semester_node->getAttribute('value')))
 
	continue;
 
      $semester_strs[$semester_node->getAttribute('value')] =
 
	$semester_node->nodeValue;
 
    }
 
  $semester_strs = array_reverse($semester_strs, TRUE);
 

	
 
  $departments_select_nodes = $departments_dom->getElementById('LIST_VAR1_1')->childNodes;
 
  $departments = array();
 
  foreach ($departments_select_nodes as $dept_node)
 
    {
 
      if ($dept_node->tagName != 'option'
 
	  || !$dept_node->hasAttribute('value'))
 
	continue;
 
      $departments[$dept_node->getAttribute('value')] =
 
	$dept_node->nodeValue;
 
    }
 

	
 

	
 
  /*
 
   * get all of the different possible course levels... dynamically
 
   * rather than hardcodedly ;-).
 
   */
 
  $departments_select_nodes = $departments_dom->getElementById('LIST_VAR1_2')->childNodes;
 
  $course_levels = array();
 
  foreach ($departments_select_nodes as $courselevel_node)
 
    {
 
      if ($courselevel_node->tagName != 'option'
 
	  || !$courselevel_node->hasAttribute('value'))
 
	continue;
 
      $course_levels[] = $courselevel_node->getAttribute('value');
 
    }
 
      $semester_str = $semester_node->getAttribute('value');
 

	
 
  $return_url = dom_input_value($departments_dom, 'RETURN.URL');
 

	
 

	
 
  school_crawl_logf($school_crawl_log, 7, "Available semesters: %s.", implode($semester_strs, ', '));
 

	
 
  $semester_start_uri = $uri;
 

	
 
  $season_map = array(
 
		      'FA' => Semester::SEASON_FALL,
 
		      'IN' => 'interim',
 
		      'SP' => Semester::SEASON_SPRING,
 
		      'MA' => 'may',
 
		      /* I don't know if SU is a valid Calvin Smester ID or not */
 
		      'SU' => Semester::SEASON_SUMMER);
 
  foreach ($semester_strs as $semester_str => $semester_info)
 
    {
 
      if (empty($season_map[substr($semester_str, 3)]))
 
	{
 
	  school_crawl_logf($school_crawl_log, 6, "Warning: Unknown semester identification chars: %s. Skipping this semester.",
 
		  $semester_str);
 
			    $semester_str);
 
	  continue;
 
	}
 
      $season = $season_map[substr($semester_str, 3)];
 
      $year_timespec = strptime(substr($semester_str, 0, 2), '%y');
 
      $year =  $year_timespec['tm_year'] + 1900;
 
      $year = $year_timespec['tm_year'] + 1900;
 

	
 
      $semester = new Semester($year, $season);
 
      $semesters[$semester_str] = $semester;
 
    }
 
  $semester = array_reverse($semesters, TRUE);
 

	
 
      /* useful and necessary stats */
 
      $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0);
 

	
 
      $semester_start_min = 0;
 
      $semester_end_max = 0;
 
  return 0;
 
}
 

	
 
      $dept = '';
 
      $course_level = '';
 
      $uri = $semester_start_uri;
 

	
 
      school_crawl_logf($school_crawl_log, 6, "Crawling semester %s->%s.",
 
			$semester_str, $semester_info);
 
/**
 
 * \brief
 
 *   Crawl the courses for a semester from Calvin College.
 
 *
 
 * \param $school
 
 *   The calvin school handle.
 
 * \param $semester
 
 *   The Semester object to populate with courses.
 
 * \param $school_crawl_log
 
 *   The logger handle.
 
 */
 
function calvin_crawl_semester(array $school, Semester $semester, &$school_crawl_log)
 
{
 
  $cookies = array();
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
 
  $seed_dom = new DOMDocument();
 
  $seed_dom->loadHTML($html);
 
  $return_url = dom_input_value($seed_dom, 'RETURN.URL');
 

	
 
  /*
 
   * LIST.VAR<X>_<N>: <X> is the column, <N> is the row. There
 
@@ -150,9 +133,12 @@ function calvin_crawl(array &$semesters,
 
   * LIST.VAR4: I forget
 
   *
 
   */
 
  $semester_str = sprintf("%02d/%s", $semester->year_get() % 100, strtoupper(substr($semester->season, 0, 2)));
 
  school_crawl_logf($school_crawl_log, 6, 'Using %s for a semester string.',
 
		    $semester_str);
 
  $form = array('VAR1' => $semester_str,
 
		'LIST.VAR1_1' => $dept,
 
		'LIST.VAR2_1' => $course_level,
 
		'LIST.VAR1_1' => '',
 
		'LIST.VAR2_1' => '',
 

	
 
		/*
 
		 * Other form items we're not querying but which need
 
@@ -216,10 +202,14 @@ function calvin_crawl(array &$semesters,
 
    $form['VAR' . $day] = '';
 
  */
 

	
 
  $semester_start_min = 0;
 
  $semester_end_max = 0;
 

	
 
  $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0);
 
  /*
 
   * pages is populated by preg_match() below after the first looping.
 
   */
 
  $pages = array(1 => 0, 2=> 1);
 
  $pages = array(1 => 0, 2 => 1);
 
  while ($pages[1] < $pages[2])
 
    {
 
      $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, $form));
 
@@ -243,7 +233,7 @@ function calvin_crawl(array &$semesters,
 
	    }
 

	
 
	  /*
 
	   * the same info below should be gettable with 
 
	   * The same info below should be retrievable with 
 
	   * dom_id_content($results_dom, 'SEC_FACULTY_INFO_' . $list_row);
 
	   */
 
	  $faculty_name = dom_input_value($results_dom, 'SEC.FACULTY.INFO_' . $list_row);
 
@@ -396,13 +386,16 @@ function calvin_crawl(array &$semesters,
 
	  $date_end_time = strptime($date_end, '%m/%d/%Y');
 
	  if ($date_start_time !== FALSE)
 
	    {
 
	      $date_start_time = school_crawl_mktime($date_start_time);
 
	      $date_start_time = school_crawl_gmmktime($date_start_time, -5 * 60*60);
 
	      if (!$semester_start_min || $semester_start_min > $date_start_time)
 
		$semester_start_min = $date_start_time;
 
		{
 
		  school_crawl_logf($school_crawl_log, 1, "Using section %s for the minimum start time.", $section_id['department'] . '-' . $section_id['course'] . '-' . $section_id['section']);
 
		  $semester_start_min = $date_start_time;
 
		}
 
	    }
 
	  if ($date_end_time !== FALSE)
 
	    {
 
	      $date_end_time = school_crawl_mktime($date_end_time);
 
	      $date_end_time = school_crawl_gmmktime($date_end_time, -5 * 60*60);
 
	      if ($semester_end_max < $date_end_time)
 
		$semester_end_max = $date_end_time;
 
	    }
 
@@ -432,29 +425,24 @@ function calvin_crawl(array &$semesters,
 
      school_crawl_logf($school_crawl_log, 7, "%s: %d", $reason, $num);
 
    }
 

	
 
    $semester->time_end_set($semester_end_max);
 
    $semester->time_start_set($semester_start_min);
 
  $semester->time_end_set($semester_end_max);
 
  $semester->time_start_set($semester_start_min);
 

	
 
    /*
 
     * Calculate lab-based course dependencies.
 
     */
 
    school_crawl_logf($school_crawl_log, 7, 'Adding implicit lab dependencies.');
 
    foreach ($semester->departments_get() as $department)
 
      foreach ($semester->department_classes_get($department) as $course)
 
        {
 
	  $the_course = $semester->class_get($department, $course);
 
	  $lab_course = $semester->class_get($department, $course . 'L');
 
	  if (!empty($lab_course))
 
	    {
 
	      $the_course->dependency_add($lab_course);
 
	      school_crawl_logf($school_crawl_log, 8, "Adding dependency of %s-%s for %s-%s.",
 
				$department, $course . 'L', $department, $course);
 
	    }
 
  /*
 
   * Calculate lab-based course dependencies.
 
   */
 
  school_crawl_logf($school_crawl_log, 7, 'Adding implicit lab dependencies.');
 
  foreach ($semester->departments_get() as $department)
 
    foreach ($semester->department_classes_get($department) as $course)
 
    {
 
      $the_course = $semester->class_get($department, $course);
 
      $lab_course = $semester->class_get($department, $course . 'L');
 
      if (!empty($lab_course))
 
	{
 
	  $the_course->dependency_add($lab_course);
 
	  school_crawl_logf($school_crawl_log, 8, "Adding dependency of %s-%s for %s-%s.",
 
			    $department, $course . 'L', $department, $course);
 
	}
 

	
 
    $semesters[] = $semester;
 

	
 
    school_crawl_logf($school_crawl_log, 6, "");
 
    }
 

	
 
  return 0;
school.d/ccbcmd.crawl.inc
Show inline comments
 
@@ -18,6 +18,282 @@
 
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 
 */
 

	
 
/*
 * The URI at which crawling of CCBCMD begins; both
 * ccbcmd_crawl_semester_list() and ccbcmd_crawl_semester() fetch this
 * page first to find the semester selection list.
 */
define('CCBCMD_CRAWL_URI', 'http://ccbcmd.edu/schedule/sched.html');
 

	
 
/**
 * \brief
 *   Obtain list of crawlable semesters offered by CCBCMD.
 *
 * \param $school
 *   The CCBCMD school handle.
 * \param $semesters
 *   Array to populate with available semesters. One empty Semester
 *   object is appended for each usable entry of the semester
 *   selection list.
 * \param $school_crawl_log
 *   The school_crawl_log handle used to report progress and errors.
 * \return
 *   0 on success, 1 if the semester selection list cannot be found.
 */
function ccbcmd_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
{
  $cookies = array();

  /*
   * It seems that http://ccbcmd.edu/schedule/sched.html is what we're
   * meant to start from. That's just a redirect to some other page
   * from which we get a listing of available semesters and choose
   * one.
   */
  $uri = CCBCMD_CRAWL_URI;
  $semesters_dom = new DOMDocument();
  $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, TRUE, 'ccbcmd_crawl_curlhook'));
  $semesters_select_node = $semesters_dom->getElementById('term_input_id');
  if ($semesters_select_node === NULL)
    {
      school_crawl_logf($school_crawl_log, 0, "Could not get list of available semesters to choose from.");
      return 1;
    }

  foreach ($semesters_select_node->childNodes as $semesters_option_node)
    {
      /*
       * childNodes may also hand us text or comment nodes, which have
       * no getAttribute() method. Only elements can be <option/>s.
       */
      if (!($semesters_option_node instanceof DOMElement))
        continue;

      $semester_text = $semesters_option_node->textContent;
      $semester_value = $semesters_option_node->getAttribute('value');
      if (empty($semester_value))
        /* skip the empty ``None'' semester */
        continue;

      if (stripos($semester_text, 'continuing') !== FALSE)
        /* skip the year-long semesters dedicated to continuing education */
        continue;

      /* The option text is expected to begin with ``<season> <year>''. */
      $semester_text_parts = explode(' ', $semester_text);
      if (count($semester_text_parts) < 2)
        {
          school_crawl_logf($school_crawl_log, 6, "Warning: Unable to parse semester name ``%s''. Skipping this semester.",
                            $semester_text);
          continue;
        }
      list($semester_season, $semester_year) = $semester_text_parts;

      /* the college has two separate summer sessions, so distinguish between them */
      if (preg_match(';session ([0-9]+);i', $semester_text, $matches))
        $semester_season .= '_' . $matches[1];

      $semesters[] = new Semester($semester_year, strtolower($semester_season));
    }

  return 0;
}
 

	
 
/**
 * \brief
 *   Crawl a CCBCMD semester.
 *
 * Selects the requested semester on CCBCMD's schedule pages, asks for
 * the listing of every subject and adds each section found in the
 * resulting table to the semester.
 *
 * \param $school
 *   The CCBCMD school handle.
 * \param $semester
 *   The semester to fill with courses.
 * \param $school_crawl_log
 *   The school_crawl_log handle used to report progress and errors.
 * \return
 *   0 on success, 1 if a needed page element is missing, the semester
 *   is not offered or a section's meeting time cannot be parsed.
 */
function ccbcmd_crawl_semester($school, $semester, &$school_crawl_log)
{
  $cookies = array();
  $uri = CCBCMD_CRAWL_URI;
  $semesters_dom = new DOMDocument();
  $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, TRUE, 'ccbcmd_crawl_curlhook'));
  $semesters_select_node = $semesters_dom->getElementById('term_input_id');
  if (empty($semesters_select_node))
    {
      school_crawl_logf($school_crawl_log, 0, "Could not locate the list of semesters from which to choose.");
      return 1;
    }

  $semesters_form = school_crawl_element_ancestor($semesters_select_node, 'form');
  if ($semesters_form === NULL)
    {
      school_crawl_logf($school_crawl_log, 0, "Unable to find <form /> associated with semester.");
      return 1;
    }
  $semesters_post = school_crawl_form($semesters_form);

  /*
   * Find the <option/> whose text describes the semester we were
   * asked to crawl. $semester_value holds that option's value once
   * the loop breaks.
   */
  $semester_found = FALSE;
  $semester_value = '';
  foreach ($semesters_select_node->childNodes as $semesters_option_node)
    {
      /* text and comment nodes have no getAttribute() method */
      if (!($semesters_option_node instanceof DOMElement))
        continue;

      $semester_text = $semesters_option_node->textContent;
      $semester_value = $semesters_option_node->getAttribute('value');
      if (empty($semester_value))
        continue;

      /* an entry lacking the ``<season> <year>'' form cannot match */
      $semester_text_parts = explode(' ', $semester_text);
      if (count($semester_text_parts) < 2)
        continue;
      list($semester_season, $semester_year) = $semester_text_parts;
      if (preg_match(';session ([0-9]+);i', $semester_text, $matches))
        $semester_season .= '_' . $matches[1];
      $semester_season = strtolower($semester_season);

      if ($semester_year == $semester->year_get()
          && $semester_season == $semester->season_get())
        {
          $semester_found = TRUE;
          break;
        }
    }
  if (!$semester_found)
    {
      school_crawl_logf($school_crawl_log, 1, "Unable to find the entry for semester %s.", $semester);
      return 1;
    }

  $semesters_post[$semesters_select_node->getAttribute('name')] = $semester_value;

  $subjects_dom = new DOMDocument();
  $uri = school_crawl_url($uri, $semesters_form->getAttribute('action'));
  $subjects_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $semesters_post, TRUE, 'ccbcmd_crawl_curlhook'));

  $subjects_form_nodelist = $subjects_dom->getElementsByTagName('form');
  if (!$subjects_form_nodelist->length)
    {
      school_crawl_logf($school_crawl_log, 0, "Unable to find <form /> to submit for the subjects-choosing page.");
      return 1;
    }
  $subjects_form_node = $subjects_form_nodelist->item(0);
  $subjects_post = school_crawl_form($subjects_form_node);

  $subjects_select_node = $subjects_dom->getElementById('subj_id');
  if (empty($subjects_select_node))
    {
      school_crawl_logf($school_crawl_log, 0, "Unable to find the list of subjects on the subjects-choosing page.");
      return 1;
    }
  /* Ask for every subject at once by choosing the ``all'' entry. */
  foreach ($subjects_select_node->childNodes as $subjects_option_node)
    if ($subjects_option_node instanceof DOMElement
        && !strcasecmp('all', trim($subjects_option_node->textContent)))
      $subjects_post[$subjects_select_node->getAttribute('name')][] = $subjects_option_node->getAttribute('value');

  $courses_dom = new DOMDocument();
  $uri = school_crawl_url($uri, $subjects_form_node->getAttribute('action'));
  $courses_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $subjects_post, TRUE, 'ccbcmd_crawl_curlhook'));

  $courses_xpath = new DOMXPath($courses_dom);

  /* The second row of the table has all of the headers in it */
  $tr_header_nodelist = $courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr[position()=2]');
  if (!$tr_header_nodelist->length)
    {
      school_crawl_logf($school_crawl_log, 0, "Unable to find the row of the course/section data table which gives us the mappings of column names onto columns.");
      return 1;
    }
  $tr_header_node = $tr_header_nodelist->item(0);

  $section_offsets = array(
                           'registration_number' => school_crawl_table_resolve_column($tr_header_node, 'CRN'),
                           'section_id' => school_crawl_table_resolve_column($tr_header_node, 'subj/crse/sec'),
                           /* there's a boolean column which says whether or not the course has any prerequisites/corequisites.... */
                           'credits' => school_crawl_table_resolve_column($tr_header_node, 'credhrs'),
                           /* there's a column for the number of contact hours, vs. credit hours */
                           'dates' => school_crawl_table_resolve_column($tr_header_node, 'sessiondates'),
                           );
  foreach (array('title', 'days', 'times', 'instructor', 'location') as $column_key)
    $section_offsets[$column_key] = school_crawl_table_resolve_column($tr_header_node, $column_key);

  /*
   * Error check the column offsets and find the largest one. The
   * offsets are used below as indexes into a row's cells, so a row
   * must have more than $max_offset cells to hold section data.
   */
  $max_offset = 0;
  foreach ($section_offsets as $name => $value)
    {
      if ($value === FALSE)
        {
          school_crawl_logf($school_crawl_log, 0, "Unable to find column offset for `%s'.",
                            $name);
          return 1;
        }
      else
        school_crawl_logf($school_crawl_log, 9, "%s -> %s", $name, $value);

      $max_offset = max($max_offset, $value);
    }

  foreach ($courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr') as $tr_node)
    {
      $children = school_crawl_table_rownodes($tr_node);
      if ($children->length <= $max_offset)
        /*
         * Skip this row because it doesn't have all of the columns we
         * want and thus it can't be a row containing information
         * about a section. ``<='' rather than ``<'' because
         * item($max_offset) only exists when there are at least
         * $max_offset + 1 cells.
         */
        continue;
      if (!strcmp($children->item($section_offsets['section_id'])->tagName, 'th'))
        /*
         * We've hit one of the <tr/>s filled with <th/>s. Skip this one.
         */
        continue;

      /*
       * There are some rows with the time set to TBA and with empty
       * section_id columns. Respond to this by skipping empty
       * section_id columns since there's no useful data in these
       * rows. We use strlen() < 3 because trim() doesn't take care of
       * &nbsp; :-/
       */
      $section_id = trim($children->item($section_offsets['section_id'])->textContent);
      if (strlen($section_id) < 3)
        continue;

      $section_id_parts = Section::parse($section_id);

      $registration_number = $children->item($section_offsets['registration_number'])->textContent;
      $instructor = $children->item($section_offsets['instructor'])->textContent;

      $section_meetings = array();

      $time_range_text = $children->item($section_offsets['times'])->textContent;
      if (strpos($time_range_text, 'TBA') !== FALSE)
        {
          /*
           * Add the section to the autocomplete list, just without
           * any meeting info (i.e., $section_meetings is still
           * empty now).
           */
          $semester->section_add($section_id_parts['department'], $section_id_parts['course'],
                                 new Section($section_id_parts['section'], $section_meetings, $registration_number));
          continue;
        }
      if (($dash_pos = strpos($time_range_text, '-')) === FALSE)
        {
          school_crawl_logf($school_crawl_log, 0, "Unable to understand course's time range format, cannot find dash: ``%s''.",
                            $time_range_text);
          return 1;
        }

      $time_start_text = substr($time_range_text, 0, $dash_pos);
      $time_start = strptime($time_start_text, '%I:%M %p');
      $time_end_text = substr($time_range_text, $dash_pos + 1);
      /*
       * Make sure that _only_ one date range is specified to ensure
       * data integrity. I.e., make sure that the college doesn't
       * suddenly support multiple meeting times without our
       * anticipating that and then cause us to have invalid
       * data. ;-). --binki
       */
      if (strpos($time_end_text, '-') !== FALSE)
        {
          school_crawl_logf($school_crawl_log, 0, "College seems to support multiple meeting times per semester which we don't know how to parse (even though slate_permutate itself can handle this situation): ``%s'' time_end_text: ``%s''.",
                            $time_range_text, $time_end_text);
          return 1;
        }
      $time_end = strptime($time_end_text, '%I:%M %p');
      if ($time_end === FALSE || $time_start === FALSE)
        {
          school_crawl_logf($school_crawl_log, 0, "Error parsing start or end time: start: ``%s'' end: ``%s''.",
                            $time_start_text, $time_end_text);
          return 1;
        }

      $days = school_crawl_days_str_format($school_crawl_log, $children->item($section_offsets['days'])->textContent);

      $section_meetings[] = new SectionMeeting($days, school_crawl_time_format($time_start), school_crawl_time_format($time_end),
                                               $children->item($section_offsets['location'])->textContent,
                                               $instructor);

      /*
       * Check if the semester's date range should be increased. The
       * session dates column only gives month/day pairs, so the
       * semester's own year is used for both ends.
       */
      $section_dates = $children->item($section_offsets['dates'])->textContent;
      if (preg_match(';^([0-9]+)/([0-9]+)-([0-9]+)/([0-9]+)$;', $section_dates, $section_dates_matches))
        {
          $semester->time_start_set_test(gmmktime(0, 0, 0, $section_dates_matches[1], $section_dates_matches[2], $semester->year_get()));
          $semester->time_end_set_test(gmmktime(0, 0, 0, $section_dates_matches[3], $section_dates_matches[4], $semester->year_get()));
        }

      $semester->section_add($section_id_parts['department'], $section_id_parts['course'],
                             new Section($section_id_parts['section'], $section_meetings, $registration_number));
    }

  return 0;
}
 

	
 
/**
 
 * \brief
 
 *   Crawl CCBCMD's registration stuffage.
school.d/cedarville.crawl.inc
Show inline comments
 
@@ -52,81 +52,107 @@ function table_parse($html)
 
  return $arr;
 
}
 

	
 
/** Crawls Cedarville course listings. $season is "fa" or "sp", year is 4-digit year */
 
function cedarville_crawl(array &$semesters, &$school_crawl_log)
 
{  
 
  $basepath = 'http://cedarville.edu/courses/schedule/';
 

	
 
  school_crawl_logf($school_crawl_log, 6, "Beginning crawl of Cedarville:");
 

	
 
  school_crawl_logf($school_crawl_log, 7, "Determining list of departments.");
 
define('CEDARVILLE_BASE_URI', 'http://cedarville.edu/courses/schedule/');
 
define('CEDARVILLE_TIMEZONE_OFFSET', 60*60 * -4);
 

	
 
  school_crawl_logf($school_crawl_log, 8, "Determining list of semesters.");
 
  $semesters_dom = new DOMDocument();
 
  $semesters_dom->loadHTML(file_get_contents($basepath));
 

	
 
  $content_div_dom = $semesters_dom->getElementById('contenttext');
 
  if (!$content_div_dom)
 
/**
 * \brief
 *   Obtain the list of crawlable semesters offered by Cedarville.
 *
 * Fetches the schedule index page at CEDARVILLE_BASE_URI and appends
 * one Semester to $semesters for every semester link found inside
 * the element whose id is "contenttext". Links whose text mentions
 * "graduate" or whose href does not contain "index" are skipped.
 *
 * \param $school
 *   The school's info array/handle.
 * \param $semesters
 *   An array to insert the semesters into.
 * \param $school_crawl_log
 *   The crawl log handle, handed on to school_crawl_geturi() and
 *   school_crawl_logf().
 * \return
 *   0 on success; 1 if the index page could not be fetched or if it
 *   listed no usable semesters.
 */
function cedarville_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
{
  $uri = CEDARVILLE_BASE_URI;
  $cookies = array();
  $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
  if (empty($html))
    {
      /*
       * Without the index page there is nothing to list; this is
       * always a failure regardless of what the caller already had
       * in $semesters.
       */
      school_crawl_logf($school_crawl_log, 1, "Unable to fetch %s.", CEDARVILLE_BASE_URI);
      return 1;
    }

  $semesters_dom = new DOMDocument();
  $semesters_dom->loadHTML($html);

  $semesters_xpath = new DOMXPath($semesters_dom);
  /* Initialized up front so that it is defined even if no link matches. */
  $have_semesters = FALSE;
  foreach ($semesters_xpath->query('//*[@id="contenttext"]//li/a') as $semester_a_dom)
    {
      $semester_href = $semester_a_dom->getAttribute('href');
      $semester_name = $semester_a_dom->textContent;

      if (stripos($semester_name, 'graduate') !== FALSE
          || strpos($semester_href, 'index') === FALSE)
        /* cedarville has about 1 graduate course, lol */
        continue;

      /* The link text is expected to look like "<year> <season>". */
      $semester_name_parts = explode(' ', $semester_name);
      if (count($semester_name_parts) < 2)
        /* Not a "<year> <season>" link; nothing sensible to register. */
        continue;

      $semester_year = $semester_name_parts[0];
      $semester_season = strtolower($semester_name_parts[1]);

      school_crawl_logf($school_crawl_log, 6, "Crawling semester: %s.",
                        $semester_name);
      $semesters[] = new Semester($semester_year, $semester_season);
      $have_semesters = TRUE;
    }

  /*
   * The page parsed above is the same one cedarville_semester_uri()
   * would otherwise have to download. Prime its cache with it to
   * have one fewer page load.
   */
  cedarville_semester_uri(NULL, $school_crawl_log, $semesters_dom);

  return $have_semesters ? 0 : 1;
}
 

	
 
/**
 
 * \brief
 
 *   Crawl a given Cedarville semester.
 
 *
 
 * \param $school
 
 *   The school handle.
 
 * \param $semester
 
 *   The semester to populate with courses.
 
 */
 
function cedarville_crawl_semester(array $school, Semester $semester, &$school_crawl_log)
 
{
 
  $semester_uri = cedarville_semester_uri($semester, $school_crawl_log);
 
  if (empty($semester_uri))
 
    return 1;
 
  list($season_string) = explode('_', $semester_uri);
 

	
 
  /*
   * Two passes are needed to determine the listing of departments
   * because the first department's code name is not available in
   * the first page load.
   */
 
  $departments = array();
 
  if (cedarville_crawl_departments_get($basepath . $semester_href, $departments, $semester_href_parts[0], $school_crawl_log))
 
  if (cedarville_crawl_departments_get(CEDARVILLE_BASE_URI . $semester_uri, $departments, $season_string, $school_crawl_log))
 
    return 1;
 
  if (!count($departments))
 
    {
 
      school_crawl_logf($school_crawl_log, 6, "Unable to get a listing of departments.");
 
      if (count($semesters))
 
	{
 
	  school_crawl_logf($school_crawl_log, 6, "Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.");
 
	  return 0;
 
	}
 
      school_crawl_logf($school_crawl_log, 0, "Unable to get listing of departments.");
 
      school_crawl_logf($school_crawl_log, 2, "Unable to get a listing of departments.");
 
      return 1;
 
    }
 

	
 
  /* find the first department whose name we don't yet know */
 
  if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0], $school_crawl_log))
 
  if (cedarville_crawl_departments_get(CEDARVILLE_BASE_URI . $season_string . '_' . current(array_keys($departments)) . '_all.htm', $departments, $season_string, $school_crawl_log))
 
    return 1;
 

	
 
  $tables = array();
 
  $cookies = array();
 
  foreach ($departments as $department => $dept_name)
 
    {
 
      school_crawl_logf($school_crawl_log, 7, "Crawling department %s (%s).", $department, $dept_name);
 
      $html = file_get_contents($basepath . $semester_href_parts[0] . '_' . $department . '_' . 'all.htm');
 

	
 
      $uri = CEDARVILLE_BASE_URI . $season_string . '_' . $department . '_all.htm';
 
      $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
 
      if (!$html)
 
	continue;
 
      $tables[$department] = table_parse(cedarville_html_fix($html));
 
@@ -239,8 +265,8 @@ function cedarville_crawl(array &$semest
 
	      /* check for daterange information -- i.e., if the first regex successfully matched: */
 
	      if (count($meeting_matches) > 7)
 
		{
 
		  $date_start = school_crawl_mktime(strptime($meeting_matches[6], '%m/%d/%y'));
 
		  $date_end = school_crawl_mktime(strptime($meeting_matches[7], '%m/%d/%y'));
 
		  $date_start = school_crawl_gmmktime(strptime($meeting_matches[6], '%m/%d/%y'), CEDARVILLE_TIMEZONE_OFFSET);
 
		  $date_end = school_crawl_gmmktime(strptime($meeting_matches[7], '%m/%d/%y'), CEDARVILLE_TIMEZONE_OFFSET);
 
		  if (!empty($date_start) && !empty($date_end))
 
		    {
 
		      $semester->time_start_set_test($date_start);
 
@@ -258,10 +284,66 @@ function cedarville_crawl(array &$semest
 
	}
 
    }
 

	
 
  $semesters[] = $semester;
 
  return 0;
 
}
 

	
 
/**
 * \brief
 *   Look up the URI used to access information about a particular
 *   Cedarville semester.
 *
 * The mapping from (year, season) to URI is built once from the
 * semester listing page and kept in a function-static cache. The
 * listing page is only downloaded when the cache is empty and no
 * $document was supplied.
 *
 * \param $semester
 *   The semester whose URI is being retrieved.
 * \param $school_crawl_log
 *   The crawl log handle, handed on to school_crawl_geturi().
 * \param $document
 *   Optional DOMDocument of the Cedarville semester listing page, to
 *   aid seeding the cache. To prime the cache, just set $semester to
 *   NULL and pass in $document.
 * \return
 *   The URI for that semester's courses relative to
 *   CEDARVILLE_BASE_URI. NULL if $semester is NULL, if the listing
 *   page could not be fetched, or if the semester is not listed.
 */
function cedarville_semester_uri(Semester $semester = NULL, &$school_crawl_log, DOMDocument $document = NULL)
{
  /* Maps year => season (lowercase) => href, filled on first use. */
  static $semester_to_uri = array();

  if (empty($semester_to_uri))
    {
      if (empty($document))
        {
          $uri = CEDARVILLE_BASE_URI;
          $cookies = array();
          $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
          if (empty($html))
            return NULL;

          $document = new DOMDocument();
          $document->loadHTML($html);
        }

      $semesters_xpath = new DOMXPath($document);
      foreach ($semesters_xpath->query('//*[@id="contenttext"]//li/a') as $semester_a_dom)
        {
          $semester_href = $semester_a_dom->getAttribute('href');

          /* The link text is expected to look like "<year> <season>". */
          $semester_name_parts = explode(' ', $semester_a_dom->textContent);
          if (count($semester_name_parts) < 2)
            /* No season present; such a link cannot be looked up. */
            continue;

          $semester_year = $semester_name_parts[0];
          $semester_season = strtolower($semester_name_parts[1]);

          /* Create the per-year bucket only if it does not exist yet. */
          $semester_to_uri += array($semester_year => array());
          $semester_to_uri[$semester_year][$semester_season] = $semester_href;
        }
    }

  /* A NULL semester means the caller only wanted to prime the cache. */
  if (empty($semester))
    return NULL;

  $year = $semester->year_get();
  $season = $semester->season_get();
  if (empty($semester_to_uri[$year][$season]))
    return NULL;

  return $semester_to_uri[$year][$season];
}
 

	
 
/**
 
@@ -274,7 +356,8 @@ function cedarville_crawl(array &$semest
 
 */
 
function cedarville_crawl_departments_get($dept_url, array &$departments, $season_string, $school_crawl_log)
 
{
 
  $html = file_get_contents($dept_url);
 
  $cookies = array();
 
  $html = school_crawl_geturi($dept_url, $cookies, $school_crawl_log);
 
  $dept_dom = new DOMDocument();
 
  if (!$dept_dom->loadHTML(cedarville_html_fix($html)))
 
    {
0 comments (0 inline, 0 general)