Changeset - b5e0a0fe027d
[Not reviewed]
default
0 7 0
Nathan Brink (binki) - 15 years ago 2011-02-12 01:03:57
ohnobinki@ohnopublishing.net
Use an internal interface for logging information when crawling a school.
7 files changed with 213 insertions and 138 deletions:
0 comments (0 inline, 0 general)
admin/rehash.php
Show inline comments
 
@@ -60,7 +60,7 @@ function main($argc, $argv)
 
  if (isset($opts['crawl-only']))
 
    $crawl_only = split(',', $opts['crawl-only']);
 

	
 
  $verbosity = 1;
 
  $verbosity = 5;
 
  if (isset($opts['verbosity']))
 
    $verbosity = (int)$opts['verbosity'];
 
  if (isset($opts['V']))
 
@@ -75,7 +75,7 @@ function main($argc, $argv)
 

	
 
  if ($crawl)
 
    {
 
      $ret = school_cache_recreate($crawl_only, $verbosity);
 
      $ret = school_cache_recreate($crawl_only, NULL, $verbosity);
 
      if ($ret)
 
	{
 
	  fprintf(STDERR, "error: Unable to successfully crawl schools.\n");
inc/admin.inc
Show inline comments
 
@@ -250,13 +250,15 @@ function school_cache_semesters_sort (Se
 
 * \param $school
 
 *   The school which should be checked for crawl functionality and
 
 *   crawled.
 
 * \param $semester_year
 
 *   The year of the semester for which we should grab data.
 
 * \param $semester_season
 
 *   The season of the year of the semester for which we should grab
 
 *   data.
 
 * \param $page
 
 *   The Page object for which HTML formatted logs should be outputted
 
 * \param $verbosity
 
 *   How verbose to be. Sane values are from 0 through 10.
 
 * \return
 
 *   A school_crawl_log handle, upon which school_crawl_log_fetch()
 
 *   may be used.
 
 */
 
function school_crawl(array &$school, $verbosity = 1)
 
function school_crawl(array &$school, Page $page = NULL, $verbosity = 1)
 
{
 
  $school['crawled'] = FALSE;
 

	
 
@@ -264,23 +266,32 @@ function school_crawl(array &$school, $v
 
  if (!function_exists($school_crawl_func))
 
    return;
 

	
 
  $school_crawl_log_opts = array('verbosity' => $verbosity);
 
  if (defined('STDERR'))
 
    $school_crawl_log_opts['stream'] = STDERR;
 
  if ($page !== NULL)
 
    $school_crawl_log_opts['page'] = $page;
 
  $school_crawl_log = school_crawl_log_init($school, $school_crawl_log_opts);
 

	
 
  $semesters = array();
 

	
 
  if ($verbosity > 0)
 
    fprintf(STDERR, "%s()\n", $school_crawl_func);
 
  $ret = $school_crawl_func($semesters, $verbosity);
 
    school_crawl_logf($school_crawl_log, 2, "Calling crawler...");
 

	
 
  $ret = $school_crawl_func($semesters, $school_crawl_log, $verbosity);
 
  if ($ret)
 
    {
 
      fprintf(STDERR, "Crawling %s failed: %s() returned nonzero\n",
 
      school_crawl_logf($school_crawl_log, 1, "Crawling %s failed: %s() returned nonzero",
 
	      $school['id'], $school_crawl_func);
 
      fwrite(STDERR, "\n");
 
      school_crawl_logf($school_crawl_log, 6, "");
 
      return;
 
    }
 
  $school['crawled'] = TRUE;
 
  $school['crawled_semesters'] = $semesters;
 

	
 
  if ($verbosity > 0)
 
    fwrite(STDERR, "\n");
 
  school_crawl_logf($school_crawl_log, 6, "");
 

	
 
  return $school_crawl_log;
 
}
 

	
 

	
 
@@ -295,7 +306,7 @@ function school_crawl(array &$school, $v
 
 * \param $verbosity
 
 *   An integer indicating how loud to be.
 
 */
 
function school_cache_recreate($crawl_only = NULL, $verbosity = 1)
 
function school_cache_recreate($crawl_only = NULL, Page $page = NULL, $verbosity = 5)
 
{
 
  $school_id_list = school_list();
 
  if (!$school_id_list)
 
@@ -330,7 +341,7 @@ function school_cache_recreate($crawl_on
 

	
 
      if ($crawl_only === NULL || in_array($school['id'], $crawl_only))
 
	{
 
	  school_crawl($school, $verbosity);
 
	  $school_crawl_log = school_crawl($school, $page, $verbosity);
 
	}
 
      else
 
	{
inc/class.page.php
Show inline comments
 
@@ -577,4 +577,19 @@ class page
 
    }
 
  }
 

	
 
  /**
   * \brief
   *   Produce the characters that terminate a self-closing element
   *   in the markup dialect this page emits.
   *
   * \return
   *   ' /' when $this->xhtml is truthy (so a tag renders like
   *   <br />), otherwise the empty string.
   */
  public function element_self_close()
  {
    return $this->xhtml ? ' /' : '';
  }
 
}
inc/school.crawl.inc
Show inline comments
 
@@ -26,6 +26,91 @@
 

	
 
/**
 * \brief
 *   Build a fresh school_crawl_log handle.
 *
 * The handle is a plain array holding the school, two empty output
 * buffers ('out' => 'html' and 'plain') and every caller-supplied
 * option.
 *
 * \param $school
 *   The school this handle logs for.
 * \param $opts
 *   An optional array which may carry any of these keys:
 *   - stream: an fopen()-compatible stream to fwrite()/fprintf() output to.
 *   - page: a Page object used to help format HTML output.
 *   - verbosity: a number from 0 through 10 describing the desired
 *       verbosity; 5 is used when the key is absent.
 * \return
 *   The new handle. The 'school' and 'out' keys always hold the
 *   values set up here, even if $opts carries keys of the same name.
 */
function school_crawl_log_init(array $school, $opts = array())
{
  $defaults = array('verbosity' => 5);
  $handle = array(
		  'school' => $school,
		  'out' => array('html' => array(), 'plain' => array()),
		  );

  /* With +, the left operand wins: handle keys over opts, opts over defaults. */
  return $handle + ($opts + $defaults);
}
 

	
 
/**
 * \brief
 *   Log progress of a crawler.
 *
 * This function's arguments take the same style as fprintf() does:
 * any arguments following $format are substituted into it.
 *
 * \param $school_crawl_log
 *   The logging handle, as created by school_crawl_log_init(). It is
 *   taken by reference because each accepted message is appended to
 *   its 'out' buffers; with a by-value handle those lines would be
 *   thrown away on return and school_crawl_log_fetch() would always
 *   see empty buffers.
 * \param $verboseness
 *   The verbosity level at which to log the message. Should be a
 *   value from 0 to 10. The message is dropped when this is greater
 *   than the handle's 'verbosity'.
 * \param $format
 *   The printf()-style format string.
 * \return
 *   0 after the message has been logged; nothing (NULL) when the
 *   message was discarded for being too detailed.
 */
function school_crawl_logf(array &$school_crawl_log, $verboseness, $format)
{
  if ($verboseness > $school_crawl_log['verbosity'])
    {
      /*
       * The given message gives us more detail than we want. Therefore,
       * discard it.
       */
      return;
    }

  /*
   * Drop the handle and the level; what remains (the format string
   * followed by its values) is exactly sprintf()'s argument list.
   */
  $args = array_slice(func_get_args(), 2);
  $log_line = call_user_func_array('sprintf', $args);

  $school_id = $school_crawl_log['school']['id'];

  /* store output in a place where it's retrievable */
  $school_crawl_log['out']['plain'][] = sprintf("%s_crawl(): %s\n",
						$school_id, $log_line);

  /* HTML rendering is only possible when a Page was supplied */
  if (isset($school_crawl_log['page']))
    $school_crawl_log['out']['html'][] = sprintf("<div class=\"logline\"><tt>%s_crawl()</tt>: %s</div><br class=\"logline\"%s>\n",
						 $school_id, htmlentities($log_line),
						 $school_crawl_log['page']->element_self_close());

  /* print to a stream potentially */
  if (isset($school_crawl_log['stream']))
    fprintf($school_crawl_log['stream'], "%s_crawl(): %s\n", $school_id, $log_line);

  return 0;
}
 

	
 
/**
 * \brief
 *   Retrieve the log lines stored in a school_crawl_log handle.
 *
 * \param $school_crawl_log
 *   The handle whose stored output should be returned.
 * \param $html
 *   Whether to retrieve HTML output. When the handle has a 'page',
 *   the HTML lines recorded at logging time are returned; otherwise
 *   the plain lines are converted on the fly.
 * \return
 *   An array of output lines.
 */
function school_crawl_log_fetch(array $school_crawl_log, $html = FALSE)
{
  if (!$html)
    return $school_crawl_log['out']['plain'];

  if (isset($school_crawl_log['page']))
    return $school_crawl_log['out']['html'];

  /*
   * No Page was available while logging, so no HTML lines were
   * stored. htmlentities() and nl2br() operate on strings, not
   * arrays, so convert each plain line individually.
   */
  $html_lines = array();
  foreach ($school_crawl_log['out']['plain'] as $plain_line)
    $html_lines[] = nl2br(htmlentities($plain_line));
  return $html_lines;
}
 

	
 
/**
 
 * \brief
 
 *   Parse a simple time string into slate_permutate's time
 
 *   representation.
 
 *
 
@@ -150,6 +235,8 @@ function school_crawl_days_str_format($d
 
 *   The URL to fetch. If a redirect occurs, this is updated.
 
 * \param $cookies
 
 *   An associative array of cookies and where to save new cookies.
 
 * \param $school_crawl_log
 
 *   The school_crawl_log handle to use.
 
 * \param $post
 
 *   If not NULL, causes an HTTP POST. In that case, should be an
 
 *   associative array of form keys/values.
 
@@ -161,8 +248,6 @@ function school_crawl_days_str_format($d
 
 *   A function which is passed a curl handle which allows the caller
 
 *   to do silly things like setting CURLOPT_SSLVERSION for silly
 
 *   sites like ccbcmd's registration site.
 
 * \param $verbosity
 
 *   How verbose to be.
 
 * \param $loopspin
 
 *   An internal variable to prevent us from following perpetual
 
 *   redirects.
 
@@ -171,23 +256,17 @@ function school_crawl_days_str_format($d
 
 *   malformed HTML, especially with Calvin's WebAdvisor
 
 *   installation).
 
 */
 
function school_crawl_geturi(&$uri, &$cookies, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $verbosity = 0, $loopspin = 0)
 
function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $loopspin = 0)
 
{
 
  global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf, $school_crawl_geturi_verbosity;
 
  global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf;
 

	
 
  if ($verbosity > 5)
 
    {
 
      echo "\n";
 
      echo 'school_crawl_geturi(' . $uri . ")\n";
 
      echo "\n";
 
    }
 
  school_crawl_logf($school_crawl_log, 7, "school_crawl_geturi('%s').", $uri);
 

	
 
  $curl = curl_init();
 

	
 
  if ($curlsetup_hook !== NULL)
 
    $curlsetup_hook($curl);
 

	
 
  $school_crawl_geturi_verbosity = $verbosity;
 
  $school_crawl_geturi_write_buf = '';
 
  $school_crawl_geturi_headers_buf = '';
 
  curl_setopt($curl, CURLOPT_URL, $uri);
 
@@ -200,8 +279,7 @@ function school_crawl_geturi(&$uri, &$co
 
      $cookies_str .= $key . '=' . $val;
 
    }
 

	
 
  if ($verbosity > 8)
 
    echo 'cookies sent: ' . $cookies_str . "\n";
 
  school_crawl_logf($school_crawl_log, 10, "cookies sent: %s", $cookies_str);
 
  curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
 
  curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'school_crawl_geturi_header_cb');
 
  curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'school_crawl_geturi_write_cb');
 
@@ -236,8 +314,7 @@ function school_crawl_geturi(&$uri, &$co
 
	    $posttxt .= (strlen($posttxt) ? '&' : '')
 
	    . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
 
	}
 
      if ($verbosity > 8)
 
	echo 'setting POST to ' . $posttxt . "\n";
 
      school_crawl_logf($school_crawl_log, 10, "Setting POST to %s", $posttxt);
 

	
 
      /* curl_setopt($curl, CURLOPT_POST, TRUE); */
 
      curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
 
@@ -256,20 +333,15 @@ function school_crawl_geturi(&$uri, &$co
 
	continue;
 
      list($header_name, $header_val) = explode(': ', $header, 2);
 

	
 
      if ($verbosity > 8)
 
	echo $header_name . ' : ' . $header_val . "\n";
 
      school_crawl_logf($school_crawl_log, 9, "%s: %s", $header_name, $header_val);
 

	
 
      switch($header_name)
 
	{
 
	case 'Set-Cookie':
 
	  list($cookie_name, $cookie_val) = explode('=', $header_val, 2);
 
	  if ($verbosity > 9)
 
	    {
 
	      if (isset($cookies[$cookie_name]))
 
		echo 'Replacing cookie ' . $cookie_name . '=' . $cookies[$cookie_name]
 
		  . ' with ';
 
	      echo 'new cookie ' . $cookie_name . '=' . $cookie_val . "\n";
 
	    }
 
	    school_crawl_logf($school_crawl_log, 10, "Replacing cookie %s=%s with...", $cookie_name, $cookies[$cookie_name]);
 
	  school_crawl_logf($school_crawl_log, 10, "...new cookie %s=%s.", $cookie_name, $cookie_val);
 
	  $cookies[$cookie_name] = $cookie_val;
 
	  break;
 

	
 
@@ -291,11 +363,10 @@ function school_crawl_geturi(&$uri, &$co
 
	    && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv')))
 
	  {
 
	    $meta_content = $meta_node->getAttribute('content');
 
	    if ($verbosity > 2)
 
	      echo 'Following http-equiv Refresh: ' . $meta_content . PHP_EOL;
 
	    school_crawl_logf($school_crawl_log, 7, "Following http-equiv Refresh: %s", $meta_content);
 
	    if (!(preg_match('/^[0-9]+; *url=(.*)$/', $meta_content, $meta_matches)))
 
	      {
 
		echo 'Error following http-equiv Refresh: ' . $meta_content . PHP_EOL;
 
		school_crawl_logf($school_crawl_log, 0, "Error following http-equiv Refresh: %s", $meta_content);
 
	      }
 
	    else
 
	      {
 
@@ -305,12 +376,11 @@ function school_crawl_geturi(&$uri, &$co
 
	  }
 
    }
 

	
 
  if ($verbosity > 9)
 
    echo $school_crawl_geturi_write_buf;
 
  school_crawl_logf($school_crawl_log, 10, "%s", $school_crawl_geturi_write_buf);
 
  if ($location && $loopspin < 6)
 
    {
 
      $uri = $location;
 
      return school_crawl_geturi($uri, $cookies, $post, $follow_meta_refresh, $curlsetup_hook, $verbosity, $loopspin + 1);
 
      return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $loopspin + 1);
 
    }
 
  return $school_crawl_geturi_write_buf;
 
}
school.d/calvin.crawl.inc
Show inline comments
 
@@ -25,10 +25,10 @@
 
 * \param $semesters
 
 *   An array to be filled with Semester objects which I should
 
 *   populate.
 
 * \param $verbosity
 
 *   How verbose I should be. Sensicle range is from 0 through 10.
 
 * \param $school_crawl_log
 
 *   A school_crawl_log handle.
 
 */
 
function calvin_crawl(array &$semesters, $verbosity = 1)
 
function calvin_crawl(array &$semesters, &$school_crawl_log)
 
{
 
  /**
 
   * The first link we start at is the one from KV into WebAdvisor.
 
@@ -52,19 +52,16 @@ function calvin_crawl(array &$semesters,
 
  $baseuri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 

	
 
  $token_uri = $baseuri . '&TOKENIDX=NULL';
 
  $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($token_uri, $cookies));
 
  $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($token_uri, $cookies, $school_crawl_log));
 
  if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches))
 
    {
 
      fprintf(STDERR, "Could not steal the token\n");
 
      school_crawl_logf($school_crawl_log, 1, "Could not steal the token: crawling failed.");
 
      return 1;
 
    }
 
  $token = $matches[1];
 

	
 
  if ($verbosity > 5)
 
    {
 
      echo 'token: ' . $token . "\n";
 
      echo "\n";
 
    }
 
  school_crawl_logf($school_crawl_log, 7, "token: %s.", $token);
 
  school_crawl_logf($school_crawl_log, 7, "");
 

	
 
  /*
 
   * here we have arrived at the main webadvisor screen which lists the
 
@@ -73,7 +70,7 @@ function calvin_crawl(array &$semesters,
 
   * individual department for courses.
 
   */
 
  $uri = $baseuri . '&TOKENIDX=' . $token;
 
  $departments_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies));
 
  $departments_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));
 

	
 
  $departments_dom = new DOMDocument();
 
  $departments_dom->loadHTML($departments_html);
 
@@ -123,8 +120,7 @@ function calvin_crawl(array &$semesters,
 
  $return_url = dom_input_value($departments_dom, 'RETURN.URL');
 

	
 

	
 
  if ($verbosity > 4)
 
    fprintf(STDERR, "Available semesters: %s\n", implode($semester_strs, ', '));
 
  school_crawl_logf($school_crawl_log, 7, "Available semesters: %s.", implode($semester_strs, ', '));
 

	
 
  $semester_start_uri = $uri;
 

	
 
@@ -139,7 +135,7 @@ function calvin_crawl(array &$semesters,
 
    {
 
      if (empty($season_map[substr($semester_str, 3)]))
 
	{
 
	  fprintf(STDERR, "Warning: Unknown semester identification chars: %s. Skipping this semester.\n",
 
	  school_crawl_logf($school_crawl_log, 6, "Warning: Unknown semester identification chars: %s. Skipping this semester.",
 
		  $semester_str);
 
	  continue;
 
	}
 
@@ -159,8 +155,7 @@ function calvin_crawl(array &$semesters,
 
      $course_level = '';
 
      $uri = $semester_start_uri;
 

	
 
      if ($verbosity)
 
	fprintf(STDERR, "Crawling semester %s->%s\n",
 
      school_crawl_logf($school_crawl_log, 6, "Crawling semester %s->%s.",
 
		$semester_str, $semester_info);
 

	
 
  /*
 
@@ -247,7 +242,7 @@ function calvin_crawl(array &$semesters,
 
  $pages = array(1 => 0, 2=> 1);
 
  while ($pages[1] < $pages[2])
 
    {
 
      $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $form));
 
      $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, $form));
 

	
 
      $results_dom = new DOMDocument();
 
      $results_dom->loadHTML($html);	
 
@@ -281,16 +276,13 @@ function calvin_crawl(array &$semesters,
 
	  if (preg_match(';\(([0-9]+)\);', $sec_short_title, $matches))
 
	    $synonym = $matches[1];
 

	
 
	  if ($verbosity > 6)
 
	    {
 
	      echo "\n";
 
	      echo implode('-', $section_id) . ': ' . $sec_short_title . "\n";
 
	      echo $openness . "\n";
 
	      echo $sec_meeting_info . "\n";
 
	      echo $faculty_name . "\n";
 
	      echo $credits . "\n";
 
	      echo $comment . "\n";
 
	    }
 
	  school_crawl_logf($school_crawl_log, 10, "");
 
	  school_crawl_logf($school_crawl_log, 10, implode('-', $section_id) . ': ' . $sec_short_title);
 
	  school_crawl_logf($school_crawl_log, 10, $openness);
 
	  school_crawl_logf($school_crawl_log, 10, $sec_meeting_info);
 
	  school_crawl_logf($school_crawl_log, 10, $faculty_name);
 
	  school_crawl_logf($school_crawl_log, 10, $credits);
 
	  school_crawl_logf($school_crawl_log, 10, $comment);
 

	
 
	  /*
 
	   * The input format for this is, thankfully, pretty rigid
 
@@ -319,8 +311,7 @@ function calvin_crawl(array &$semesters,
 
	  if (strpos($sec_meeting_info, 'Times to be Announced') !== FALSE
 
	      || strpos($sec_meeting_info, 'Days to be Announced') !== FALSE)
 
	    {
 
	      if ($verbosity > 2)
 
		error_log('Skipping class because of incomplete meeting time information: '
 
	      school_crawl_logf($school_crawl_log, 8, 'Skipping class because of incomplete meeting time information: '
 
			  . implode('-', $section_id) . ' has meeting info of `'
 
			  . $sec_meeting_info . '\'');
 
	      $skipped_sections['incomplete meeting info'] ++;
 
@@ -331,7 +322,7 @@ function calvin_crawl(array &$semesters,
 

	
 
	  if (!preg_match(';^([0-9]{2}/[0-9]{2}/[0-9]{4})-([0-9]{2}/[0-9]{2}/[0-9]{4}) (([^ ,]+ )+)([^0-9]+) ([^ ]+) - ([^ ]+), (.*)$;', $sec_meeting_info, $meeting_info_matches))
 
	    {
 
	      error_log('Unable to parse calvin section meeting info string into start/end/days information for '
 
	      school_crawl_logf($school_crawl_log, 8, 'Unable to parse calvin section meeting info string into start/end/days information for '
 
			. implode('-', $section_id) . ': ``' . $sec_meeting_info . '\'\'');
 
	      $skipped_sections['invalid meeting info format'] ++;
 
	      /*
 
@@ -351,9 +342,8 @@ function calvin_crawl(array &$semesters,
 
	  $time_end = school_crawl_time_format(strptime($meeting_info_matches[7], '%I:%M%p'));
 
	  $meeting_place = $meeting_info_matches[8];
 

	
 
	  if ($verbosity > 5)
 
	    foreach (array('date_start', 'date_end', 'meeting_type', 'days', 'time_start', 'time_end', 'meeting_place', 'meeting_type') as $var)
 
	      echo $var . ':' . ${$var} . "\n";
 
	    school_crawl_logf($school_crawl_log, 10, "%s:%s", $var, ${$var});
 

	
 
	  $section = new Section($section_id['section'], array(new SectionMeeting($days, $time_start, $time_end, $meeting_place, $meeting_type)), $synonym, $faculty_name);
 
	  $semester->section_add($section_id['department'], $section_id['course'], $section);
 
@@ -380,14 +370,11 @@ function calvin_crawl(array &$semesters,
 

	
 
      if (!preg_match(';Page ([0-9]+) of ([0-9]+)\</td\>$;m', $html, $pages))
 
	{
 
	  error_log('Unable to determine the number of pages in this Calvin resultset');
 
	  school_crawl_logf($school_crawl_log, 0, 'Unable to determine the number of pages in this Calvin resultset');
 
	  break;
 
	}
 

	
 
      if ($verbosity > 0)
 
	{
 
	  echo 'calvin_crawl(): finished page ' . $pages[1] . ' of ' . $pages[2] . ' with ' . ($list_row - 1) . " courses.\n";
 
	}
 
      school_crawl_logf($school_crawl_log, 8, "calvin_crawl(): finished page %d of %d with %d courses.", $pages[1], $pages[2], $list_row - 1);
 

	
 
      $form = array(
 
		    'ACTION*Grp:WSS.COURSE.SECTIONS' => 'NEXT',
 
@@ -395,14 +382,13 @@ function calvin_crawl(array &$semesters,
 
    }
 

	
 
  $has_stat = FALSE;
 
  if ($verbosity > 1)
 
    foreach ($skipped_sections as $reason => $num)
 
      {
 
	if (!$num)
 
	  continue;
 
	if (!$has_stat)
 
	  error_log('Skipped some sections for <reason>: <number skipped>:');
 
	error_log($reason . ': ' . $num);
 
	school_crawl_logf($school_crawl_log, 7, 'Skipped some sections for <reason>: <number skipped>:');
 
      school_crawl_logf($school_crawl_log, 7, "%s: %d", $reason, $num);
 
      }
 

	
 
    $semester->time_end_set($semester_end_max);
 
@@ -410,8 +396,7 @@ function calvin_crawl(array &$semesters,
 

	
 
    $semesters[] = $semester;
 

	
 
    if ($verbosity)
 
      fprintf(STDERR, "\n");
 
    school_crawl_logf($school_crawl_log, 6, "");
 
    }
 

	
 
  return 0;
school.d/ccbcmd.crawl.inc
Show inline comments
 
@@ -24,12 +24,12 @@
 
 *
 
 * \param $semester
 
 *   The Semester object which I should populate.
 
 * \param $verbosity
 
 *   A scale from 0 to 10 determining how loud I should be.
 
 * \param $school_crawl_log
 
 *   The school_crawl_log handle.
 
 * \return
 
 *   1 on failure, 0 on success.
 
 */
 
function ccbcmd_crawl(array &$semesters, $verbosity = 1)
 
function ccbcmd_crawl(array &$semesters, &$school_crawl_log)
 
{
 
  $cookies = array();
 

	
 
@@ -41,11 +41,11 @@ function ccbcmd_crawl(array &$semesters,
 
   */
 
  $uri = 'http://ccbcmd.edu/schedule/sched.html';
 
  $semesters_dom = new DOMDocument();
 
  $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, NULL, TRUE, 'ccbcmd_crawl_curlhook', $verbosity));
 
  $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, TRUE, 'ccbcmd_crawl_curlhook'));
 
  $semesters_select_node = $semesters_dom->getElementById('term_input_id');
 
  if ($semesters_select_node === NULL)
 
    {
 
      fprintf(STDERR, "Could not get list of available semesters to choose from\n");
 
      school_crawl_logf($school_crawl_log, 0, "Could not get list of available semesters to choose from.");
 
      return 1;
 
    }
 

	
 
@@ -54,7 +54,7 @@ function ccbcmd_crawl(array &$semesters,
 
  $semesters_form = school_crawl_element_ancestor($semesters_select_node, 'form');
 
  if ($semesters_form === NULL)
 
    {
 
      fprintf(STDERR, "Unable to find <form /> associated with semester.\n");
 
      school_crawl_logf($school_crawl_log, 0, "Unable to find <form /> associated with semester.");
 
      return 1;
 
    }
 
  $semesters_post_save = school_crawl_form($semesters_form);
 
@@ -79,12 +79,10 @@ function ccbcmd_crawl(array &$semesters,
 
      if (preg_match(';session ([0-9]+);i', $semester_text, $matches))
 
	$semester_season .= '_' . $matches[1];
 

	
 
      if ($verbosity)
 
	fprintf(STDERR, "Crawling semester %s:%s -> %s.\n", $semester_year, $semester_season, $semester_text);
 
      school_crawl_logf($school_crawl_log, 6, "Crawling semester %s:%s -> %s.", $semester_year, $semester_season, trim($semester_text));
 
      $semester = new Semester($semester_year, strtolower($semester_season));
 

	
 
  if ($verbosity > 1)
 
    fprintf(STDERR, "Found semester: %s=``%s''=``%s''.\n",
 
      school_crawl_logf($school_crawl_log, 8, "Found semester: %s=``%s''=``%s''.",
 
  	    $semester_value, $semester->id(), trim($semesters_option_node->textContent));
 
  /* load stored semester-page URI / form data */
 
  $semesters_post = $semesters_post_save;
 
@@ -93,12 +91,12 @@ function ccbcmd_crawl(array &$semesters,
 

	
 
  $subjects_dom = new DOMDocument();
 
  $uri = school_crawl_url($uri, $semesters_form->getAttribute('action'));
 
  $subjects_dom->loadHTML(school_crawl_geturi($uri, $cookies, $semesters_post, TRUE, 'ccbcmd_crawl_curlhook', $verbosity));
 
  $subjects_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $semesters_post, TRUE, 'ccbcmd_crawl_curlhook'));
 

	
 
  $subjects_form_nodelist = $subjects_dom->getElementsByTagName('form');
 
  if (!$subjects_form_nodelist->length)
 
    {
 
      fprintf(STDERR, "Unable to find <form /> to submit for the subjects choosing page.\n");
 
      school_crawl_logf($school_crawl_log, 0, "Unable to find <form /> to submit for the subjects-choosing page.");
 
      return 1;
 
    }
 
  $subjects_form_node = $subjects_form_nodelist->item(0);
 
@@ -111,7 +109,7 @@ function ccbcmd_crawl(array &$semesters,
 

	
 
  $courses_dom = new DOMDocument();
 
  $uri = school_crawl_url($uri, $subjects_form_node->getAttribute('action'));
 
  $courses_dom->loadHTML(school_crawl_geturi($uri, $cookies, $subjects_post, TRUE, 'ccbcmd_crawl_curlhook', $verbosity));
 
  $courses_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $subjects_post, TRUE, 'ccbcmd_crawl_curlhook'));
 

	
 
  $courses_xpath = new DOMXPath($courses_dom);
 

	
 
@@ -119,7 +117,7 @@ function ccbcmd_crawl(array &$semesters,
 
  $tr_header_nodelist = $courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr[position()=2]');
 
  if (!$tr_header_nodelist->length)
 
    {
 
      fprintf(STDERR, "Unable to find the row of the course/section data table which gives us the mappings of column names onto columns.\n");
 
      school_crawl_logf($school_crawl_log, 0, "Unable to find the row of the course/section data table which gives us the mappings of column names onto columns.");
 
      return 1;
 
    }
 
  $tr_header_node = $tr_header_nodelist->item(0);
 
@@ -142,13 +140,12 @@ function ccbcmd_crawl(array &$semesters,
 
    {
 
      if ($value === FALSE)
 
	{
 
	  fprintf(STDERR, "Unable to find column offset for `%s'.\n",
 
	  school_crawl_logf($school_crawl_log, 0, "Unable to find column offset for `%s'.",
 
		  $name);
 
	  return 1;
 
	}
 
      else
 
	if ($verbosity > 6)
 
	  echo $name . ' -> ' . $value . PHP_EOL;
 
	school_crawl_logf($school_crawl_log, 9, "%s -> %s", $name, $value);
 

	
 
      $max_offset = max($max_offset, $value);
 
    }
 
@@ -202,7 +199,7 @@ function ccbcmd_crawl(array &$semesters,
 
	  }
 
	if (($dash_pos = strpos($time_range_text, '-')) === FALSE)
 
	  {
 
	    fprintf(STDERR, "Unable to understand course's time range format, cannot find dash: ``%s''.\n",
 
	    school_crawl_logf($school_crawl_log, 0, "Unable to understand course's time range format, cannot find dash: ``%s''.",
 
		    $time_range_text);
 
	    return 1;
 
	  }
 
@@ -219,14 +216,14 @@ function ccbcmd_crawl(array &$semesters,
 
	 */
 
	if (strpos($time_end_text, '-') !== FALSE)
 
	  {
 
	    fprintf(STDERR, "College seems to support multiple meeting times per semester which we don't know how to parse (even though slate_permutate itself can handle this situation): ``%s'' time_end_text: ``%s''.\n",
 
	    school_crawl_logf($school_crawl_log, 0, "College seems to support multiple meeting times per semester which we don't know how to parse (even though slate_permutate itself can handle this situation): ``%s'' time_end_text: ``%s''.",
 
		    $time_range_text, $time_end_text);
 
	    return 1;
 
	  }
 
	$time_end = strptime($time_end_text, '%I:%M %p');
 
	if ($time_end === FALSE || $time_start === FALSE)
 
	  {
 
	    fprintf(STDERR, "Error parsing start or end time: start: ``%s'' end: ``%s''.\n",
 
	    school_crawl_logf($school_crawl_log, 0, "Error parsing start or end time: start: ``%s'' end: ``%s''.",
 
		    $time_start_text, $time_end_text);
 
	    return 1;
 
	  }
school.d/cedarville.crawl.inc
Show inline comments
 
@@ -53,30 +53,28 @@ function table_parse($html)
 
}
 

	
 
/** Crawls Cedarville course listings. $season is "fa" or "sp", year is 4-digit year */
 
function cedarville_crawl(array &$semesters, $verbosity = 1)
 
function cedarville_crawl(array &$semesters, &$school_crawl_log)
 
{  
 
  $basepath = 'http://cedarville.edu/courses/schedule/';
 

	
 
  if ($verbosity)
 
    echo "cedarville_crawl(): Beginning crawl of Cedarville:\n";
 
  school_crawl_logf($school_crawl_log, 6, "Beginning crawl of Cedarville:");
 

	
 
  if ($verbosity > 1)
 
    echo "cedarville_crawl(): Determining list of departments.\n";
 
  school_crawl_logf($school_crawl_log, 7, "Determining list of departments.");
 

	
 
  if ($verbosity > 1)
 
    fprintf(STDERR, "cedarville_crawl(): Determining list of semesters.\n");
 
  school_crawl_logf($school_crawl_log, 8, "Determining list of semesters.");
 
  $semesters_dom = new DOMDocument();
 
  $semesters_dom->loadHTML(file_get_contents($basepath));
 

	
 
  $content_div_dom = $semesters_dom->getElementById('contenttext');
 
  if (!$content_div_dom)
 
    {
 
      fprintf(STDERR, "cedarville_crawl(): Error finding location of the list of departments.\n");
 
      school_crawl_logf($school_crawl_log, 6, "Error finding location of the list of departments.");
 
      if (count($semesters))
 
	{
 
	  fprintf(STDERR, "cedarville_crawl(): Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.\n");
 
	  school_crawl_logf($school_crawl_log, 6, "Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.");
 
	  return 0;
 
	}
 
      school_crawl_logf($school_crawl_log, 0, "Couldn't find any departments.");
 
      return 1;
 
    }
 
  $departments_xpath = new DOMXPath($semesters_dom);
 
@@ -97,8 +95,7 @@ function cedarville_crawl(array &$semest
 

	
 
      $semester = new Semester($semester_year, $semester_season);
 

	
 
      if ($verbosity > 1)
 
	fprintf(STDERR, "cedarville_crawl(): Crawling semester: %s.\n",
 
      school_crawl_logf($school_crawl_log, 6, "Crawling semester: %s.",
 
		$semester_name);
 

	
 
  /*
 
@@ -106,27 +103,27 @@ function cedarville_crawl(array &$semest
 
   * not accessible available in the first pageload.
 
   */
 
  $departments = array();
 
  if (cedarville_crawl_departments_get($basepath . $semester_href, $departments, $semester_href_parts[0]))
 
  if (cedarville_crawl_departments_get($basepath . $semester_href, $departments, $semester_href_parts[0], $school_crawl_log))
 
    return 1;
 
  if (!count($departments))
 
    {
 
      echo "cedarville_crawl(): Unable to get a listing of departments.\n";
 
      school_crawl_logf($school_crawl_log, 6, "Unable to get a listing of departments.");
 
      if (count($semesters))
 
	{
 
	  fprintf(STDERR, "cedarville_crawl(): Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.\n");
 
	  school_crawl_logf($school_crawl_log, 6, "Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.");
 
	  return 0;
 
	}
 
      school_crawl_logf($school_crawl_log, 0, "Unable to get listing of departments.");
 
      return 1;
 
    }
 
  /* find the first department whose name we don't yet know */
 
  if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0]))
 
  if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0], $school_crawl_log))
 
    return 1;
 

	
 
  $tables = array();
 
  foreach ($departments as $department => $dept_name)
 
    {
 
      if ($verbosity > 2)
 
	echo 'cedarville_crawl(): Crawling department ' . $department . ' (' . $dept_name . ")...\n";
 
      school_crawl_logf($school_crawl_log, 7, "Crawling department %s (%s).", $department, $dept_name);
 
      $html = file_get_contents($basepath . $semester_href_parts[0] . '_' . $department . '_' . 'all.htm');
 
      if (!$html)
 
	continue;
 
@@ -183,8 +180,8 @@ function cedarville_crawl(array &$semest
 
	  $section_parts = Section::parse($course_table[1]);
 
	  if (count($section_parts) < 3)
 
	    {
 
	      error_log('Error parsing section_id. Given `' . $course_table[1] . '\', interpreted as `'
 
			. implode('-', $section_parts) . '\'. Skipping.');
 
	      school_crawl_logf($school_crawl_log, 6, "Error parsing section_id. Given `%s'; interpreted as `%s'. Skipping.",
 
				$course_table[1], implode('-', $section_parts));
 
	      continue;
 
	    }
 

	
 
@@ -199,8 +196,7 @@ function cedarville_crawl(array &$semest
 
	  $meetings_str = $course_table[6];
 
	  if (strpos($meetings_str, 'TBA') !== FALSE)
 
	    {
 
	      if ($verbosity > 1)
 
		error_log('Skipping ' . implode('-', $section_parts) . ' because its meeting time info has `TBA\' in it.');
 
	      school_crawl_logf($school_crawl_log, 8, "Skipping %s because its meeting time info has `TBA' in it.", implode('-', $section_parts));
 
	      continue;
 
	    }
 
	  $meetings = array();
 
@@ -213,17 +209,18 @@ function cedarville_crawl(array &$semest
 
		  if (preg_match(';^Dates:[^0-9]+([/0-9]{8})-([/0-9]{8});',
 
				 $meetings_str, $meeting_matches))
 
		    {
 
		      if ($verbosity > 4)
 
			error_log('Skipping some meeting data for '
 
				  . implode('-', $section_parts) . ' because it is a date range: `'
 
				  . $meeting_matches[0] . '\'');
 
		      /**
 
		       * \todo
 
		       *   This is a perfect place to get Semester's
 
		       *   time_start and time_end values.
 
		       */
 
		      school_crawl_logf($school_crawl_log, 8, "Skipping some meeting data for %s because it is a date range: `%s'.",
 
					implode('-', $section_parts), $meeting_matches[0]);
 
		      $meetings_str = substr($meetings_str, strlen($meeting_matches[0]));
 
		      continue;
 
		    }
 

	
 
		  if ($verbosity > 0)
 
		    error_log('Error parsing meeting time. Given `' . $meetings_str . '\'. Skipping '
 
			      . implode('-', $section_parts));
 
		  school_crawl_logf($school_crawl_log, 6, "Error parsing meeting time. Given `%s'. Skipping %s.", $meetings_str, implode('-', $section_parts));
 
		  break;
 
		}
 
	      /* prepare for parsing the next meeting time */
 
@@ -263,13 +260,13 @@ function cedarville_crawl(array &$semest
 
 *   An associative array mapping department codes onto department
 
 *   friendly names.
 
 */
 
function cedarville_crawl_departments_get($dept_url, array &$departments, $season_string)
 
function cedarville_crawl_departments_get($dept_url, array &$departments, $season_string, $school_crawl_log)
 
{
 
  $html = file_get_contents($dept_url);
 
  $dept_dom = new DOMDocument();
 
  if (!$dept_dom->loadHTML(cedarville_html_fix($html)))
 
    {
 
      echo "cedarville_crawl(): Error determining list of available departments: Unable to parse HTML.\n";
 
      school_crawl_logf($school_crawl_log, 6, "Error determining list of available departments: Unable to parse HTML.");
 
      return 1;
 
    }
 
  $xpath = new DOMXPath($dept_dom);
 
@@ -280,7 +277,7 @@ function cedarville_crawl_departments_ge
 
      $href = $dept_node->getAttribute('href');
 
      if (!preg_match('/^' . preg_quote($season_string, '/') . '_([a-z]+)_[a-z]+\.htm$/', $href, $matches))
 
	{
 
	  echo 'cedarvillege_crawl(): Error determining list of available departments: Unable to parse the department string out of href="' . $href . "\".\n";
 
	  school_crawl_logf($school_crawl_log, 6, "cedarvillege_crawl(): Error determining list of available departments: Unable to parse the department string out of href=\"%s\".", $href);
 
	  return 1;
 
	}
 

	
0 comments (0 inline, 0 general)