Changeset - 1838a6f6fa20
[Not reviewed]
default
0 2 0
Nathan Brink (binki) - 13 years ago 2012-11-22 15:53:37
ohnobinki@ohnopublishing.net
Update Hope College crawler to be stream/chunk based, lowering its memory usage.
2 files changed with 172 insertions and 49 deletions:
0 comments (0 inline, 0 general)
inc/school.crawl.inc
Show inline comments
 
@@ -253,243 +253,267 @@ function school_crawl_meeting_type($meet
 
  if (empty($meeting_type))
 
    $meeting_type = 'lecture';
 

	
 
  $meeting_type = strtolower(trim($meeting_type));
 
  if (!empty($meeting_type_maps[$meeting_type]))
 
    $meeting_type = $meeting_type_maps[$meeting_type];
 
  elseif (!empty($meeting_type_maps[substr($meeting_type, 0, 3)]))
 
    $meeting_type = $meeting_type_maps[substr($meeting_type, 0, 3)];
 

	
 
  return $meeting_type;
 
}
 

	
 
/**
 * \brief
 *   Simulate some aspects of a web browser while retrieving a
 *   document.
 *
 * This allows us to view our cookies in an associative array and to
 * have the server's response automatically update our cookies.
 *
 * If $post is specified as an associative array, an HTTP POST is
 * performed and the data is encoded properly as if we were performing
 * a form submission.
 *
 * Follows redirects. If there is a redirect, the page from which you
 * are redirected is lost... but few people put any information on
 * those pages anyways ;-).
 *
 * \param $uri
 *   The URL to fetch. If a redirect occurs, this is updated.
 * \param $cookies
 *   An associative array of cookies and where to save new cookies.
 * \param $school_crawl_log
 *   The school_crawl_log handle to use.
 * \param $post
 *   If not NULL, causes an HTTP POST. In that case, should be an
 *   associative array of form keys/values.
 * \param $follow_meta_refresh
 *   Parse the resultant HTML with http://docs.php.net/dom and if it
 *   contains a line that looks like ``<meta http-equiv="Refresh" content="0; url=https://simon.ccbcmd.edu/pls/PROD/bwckschd.p_disp_dyn_sched">'',
 *   follow that URL.
 * \param $curlsetup_hook
 *   A function which is passed a curl handle which allows the caller
 *   to do silly things like setting CURLOPT_SSLVERSION for silly
 *   sites like ccbcmd's registration site.
 * \param $options
 *   Extra optional arguments with keys as follows:
 *   - 'writefunc': A curl-compatible write function of the form
 *       function($state, $data) and returns the number of eaten bytes
 *       which must be equal to the number of bytes received unless if
 *       the transfer should be aborted. Setting this and using
 *       $follow_meta_refresh are mutually exclusive and will cause
 *       undefined behavior.
 *   - 'writestate': The value which should be passed to writefunc as
 *       the $state parameter.
 * \param $loopspin
 *   An internal variable to prevent us from following perpetual
 *   redirects.
 * \return
 *   The body of the document returned by the server (normally
 *   malformed HTML, especially with Calvin's WebAdvisor installation)
 *   or, if 'writestate' and 'writefunc' are set, the value stored in
 *   'writestate'.
 */
function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $options = array(), $loopspin = 0)
{
  global $school_crawl_geturi_writefunc,
    $school_crawl_geturi_writestate,
    $school_crawl_geturi_headers_buf;

  school_crawl_logf($school_crawl_log, 7, "school_crawl_geturi('%s').", $uri);

  /*
   * Without a caller-supplied writefunc, the body is accumulated into
   * a plain string held in 'writestate'.
   */
  $options += array(
    'writefunc' => 'school_crawl_geturi_writefunc_cb',
    'writestate' => '',
  );
  $school_crawl_geturi_writefunc = $options['writefunc'];
  /*
   * Bind the global by reference so that school_crawl_geturi_write_cb()
   * updates $options['writestate'] (and the caller's variable if it
   * was handed to us by reference).
   */
  $GLOBALS['school_crawl_geturi_writestate'] = &$options['writestate'];

  $curl = curl_init();
  curl_setopt($curl, CURLOPT_USERAGENT, SP_PACKAGE_NAME . '/' . SP_PACKAGE_VERSION);

  if ($curlsetup_hook !== NULL)
    $curlsetup_hook($curl);

  $school_crawl_geturi_headers_buf = '';
  curl_setopt($curl, CURLOPT_URL, $uri);

  $cookies_str = '';
  foreach ($cookies as $key => $val)
    {
      if (strlen($cookies_str))
        $cookies_str .= ';';
      $cookies_str .= $key . '=' . $val;
    }

  school_crawl_logf($school_crawl_log, 10, "cookies sent: %s", $cookies_str);
  curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
  curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'school_crawl_geturi_header_cb');
  curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'school_crawl_geturi_write_cb');

  if ($post != NULL && is_array($post))
    {
      $posttxt = '';
      foreach ($post as $postkey => $postvals)
        {
          /*
           * This not escaping MEMBER thing is Calvin-specific
           * too. Maybe we need a way to ask for some particular char
           * not to be encoded?
           */

          /*
           * Apparently, browsers like seamonkey will send multiple
           * versions of <input type="hidden" name="field" value="1"
           * /> if another input exists with name="field", like:
           * field=1&field=blah. It seems like the webserver for
           * ccbcmd cares about having these multiple values too...
           *
           * Yes, sending subj_sel=dummy&subj_sel=%25 made _all_ of
           * the difference. Wow.
           */
          if (!is_array($postvals))
            $postvals = array($postvals);
          foreach ($postvals as $postval)
            $posttxt .= (strlen($posttxt) ? '&' : '')
              . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
        }
      school_crawl_logf($school_crawl_log, 10, "Setting POST to %s", $posttxt);

      /* Setting CURLOPT_POSTFIELDS is what switches the request to POST. */
      curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
    }

  curl_exec($curl);
  curl_close($curl);

  $location = NULL;
  foreach (explode("\r\n", $school_crawl_geturi_headers_buf) as $header)
    {
      /*
       * yes, we don't want the line if the first char is a ':' or if it has no ':'
       */
      if (!strpos($header, ':'))
        continue;
      /* Pad so that a header lacking the ``: '' separator yields an empty value instead of a notice. */
      list($header_name, $header_val) = array_pad(explode(': ', $header, 2), 2, '');

      school_crawl_logf($school_crawl_log, 9, "%s: %s", $header_name, $header_val);

      switch($header_name)
        {
        case 'Set-Cookie':
          list($cookie_name, $cookie_val) = explode('=', $header_val, 2);
          if (isset($cookies[$cookie_name]))
            school_crawl_logf($school_crawl_log, 10, "Replacing cookie %s=%s with...", $cookie_name, $cookies[$cookie_name]);
          school_crawl_logf($school_crawl_log, 10, "...new cookie %s=%s.", $cookie_name, $cookie_val);
          $cookies[$cookie_name] = $cookie_val;
          break;

        case 'Location':
          $location = $header_val;
          /* yes, a calvin-specific replacement :-/ */
          /* NOTE(review): a "\n" is appended to the redirect target here; kept as-is, confirm it is intended. */
          $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $location) . "\n";
          /* A redirect is followed with a plain GET. */
          $post = NULL;
          break;
        }
    }

  if ($follow_meta_refresh)
    {
      $dom = new DOMDocument();
      $dom->loadHTML($options['writestate']);
      foreach ($dom->getElementsByTagName('meta') as $meta_node)
        if ($meta_node->hasAttribute('http-equiv')
            && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv')))
          {
            $meta_content = $meta_node->getAttribute('content');
            school_crawl_logf($school_crawl_log, 7, "Following http-equiv Refresh: %s", $meta_content);
            if (!(preg_match('/^[0-9]+; *url=(.*)$/', $meta_content, $meta_matches)))
              {
                school_crawl_logf($school_crawl_log, 0, "Error following http-equiv Refresh: %s", $meta_content);
              }
            else
              {
                $location = $meta_matches[1];
                $post = NULL;
              }
          }
    }

  school_crawl_logf($school_crawl_log, 10, "%s", $options['writestate']);
  if ($location && $loopspin < 6)
    {
      $uri = $location;
      /*
       * The page we are being redirected away from is documented as
       * lost. With the default string buffer it must be dropped
       * explicitly: otherwise its body would be prepended to the
       * final document and, with $follow_meta_refresh, its <meta>
       * refresh would be found again on every hop.
       */
      if ($options['writefunc'] === 'school_crawl_geturi_writefunc_cb')
        $options['writestate'] = '';
      return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $options, $loopspin + 1);
    }
  return $options['writestate'];
}
 

	
 
/**
 * \brief
 *   curl CURLOPT_HEADERFUNCTION callback which collects raw response
 *   header data.
 *
 * \param $curl
 *   The curl handle (unused).
 * \param $header_buf
 *   The chunk of header data handed over by curl.
 * \return
 *   The number of bytes in $header_buf, i.e., all of it was consumed.
 */
function school_crawl_geturi_header_cb($curl, $header_buf)
{
  global $school_crawl_geturi_headers_buf;

  $consumed = strlen($header_buf);
  /* Accumulate into the global buffer which school_crawl_geturi() parses afterwards. */
  $school_crawl_geturi_headers_buf = $school_crawl_geturi_headers_buf . $header_buf;

  return $consumed;
}
 

	
 
/**
 * \brief
 *   Default 'writefunc' for school_crawl_geturi(): append received
 *   data to a string buffer.
 *
 * \param $writebuf
 *   The string buffer (the 'writestate') to append to.
 * \param $data
 *   The chunk of body data which was received.
 * \return
 *   The number of bytes in $data.
 */
function school_crawl_geturi_writefunc_cb(&$writebuf, $data)
{
  $eaten = strlen($data);
  $writebuf = $writebuf . $data;
  return $eaten;
}
 

	
 
/**
 * \brief
 *   curl CURLOPT_WRITEFUNCTION callback which forwards body data to
 *   the write function selected by school_crawl_geturi().
 *
 * The function stored in the global $school_crawl_geturi_writefunc is
 * called with the global $school_crawl_geturi_writestate (by
 * reference, if the function declares it so) and the received data.
 *
 * \param $curl
 *   The curl handle (unused).
 * \param $write_buf
 *   The chunk of body data handed over by curl.
 * \return
 *   Whatever the configured write function returns; curl expects the
 *   number of bytes consumed.
 */
function school_crawl_geturi_write_cb($curl, $write_buf)
{
  global $school_crawl_geturi_writefunc, $school_crawl_geturi_writestate;

  /*
   * Invoke as a variable function (not call_user_func()) so that the
   * state is still passed by reference.
   */
  return $school_crawl_geturi_writefunc($school_crawl_geturi_writestate, $write_buf);
}
 

	
 
/**
 * \brief
 *   Finds the closest parent of a DOM element with a certain tag
 *   name.
 *
 * Useful for finding the <form /> element associated with a given
 * <select /> or set of <input />s so that the form's action=""
 * parameter may be found.
 *
 * The node itself passed in will be considered for whether or not it
 * matches the $element_name.
 *
 * \param $node
 *   The dom node whose ancestor should be found.
 * \param $element_name
 *   The name of the ancestor element which is requested.
 * \return
 *   The DOMElement sought or NULL if not found.
 */
function school_crawl_element_ancestor(DOMElement $node, $element_name)
{
  /*
   * Walk upwards only while we are still looking at elements: the
   * parent of the root element is the DOMDocument, which is not a
   * DOMElement and has no tagName, so the search ends there.
   */
  for ($candidate = $node; $candidate instanceof DOMElement; $candidate = $candidate->parentNode)
    if (!strcmp($candidate->tagName, $element_name))
      return $candidate;

  return NULL;
}
 

	
 
/**
 
 * \brief
 
 *   Create an array based on an HTML form for submitting the form.
 
 *
 
 * Currently, this will only support the <input /> and <select />
 
 * elements.
 
 *
 
 * \param $form_node
 
 *   The dom node of the form.
 
 * \return
 
 *   An array suitable for passing to school_crawl_geturi().
 
 */
 
function school_crawl_form(DOMElement $form_node)
 
{
 
  $form = array();
 

	
 
  $xpath = new DOMXPath($form_node->ownerDocument);
 
  foreach ($xpath->query('.//input', $form_node) as $input_node)
 
@@ -697,179 +721,196 @@ function school_crawl_table_rownodes(DOM
 
 */
 
function school_crawl_table_rownode_index(DOMNodeList $rownodes, $i)
{
  /*
   * Walk the cells from left to right, subtracting the number of
   * columns each one covers from $i; the cell which makes $i drop
   * below zero is the one covering the requested column.
   */
  foreach ($rownodes as $data_node)
    {
      $colspan = 1;
      if ($data_node->hasAttribute('colspan'))
        {
          /*
           * Only accept a positive decimal integer. Anything else
           * (empty, non-numeric, zero) counts as a single column so
           * that a bogus attribute can neither break the arithmetic
           * nor make a cell unreachable.
           */
          $colspan_attr = trim($data_node->getAttribute('colspan'));
          if (preg_match('/^[0-9]+$/', $colspan_attr) && (int)$colspan_attr > 0)
            $colspan = (int)$colspan_attr;
        }

      $i -= $colspan;
      if ($i < 0)
        return $data_node;
    }

  /* $i lies beyond the last column of the row. */
  return NULL;
}
 

	
 
/**
 * \brief
 *   Detect if a point in a buffer is at a newline.
 *
 * \internal
 *   Used by school_crawl_csv_parse().
 * \param $data
 *   The buffer
 * \param $len
 *   Number of bytes in buffer.
 * \param $i
 *   The location within the buffer.
 * \param $eof
 *   The end of the buffer is the end of the file.
 * \return
 *   The location of the next character after the EOL sequence or
 *   FALSE if there is no EOL.
 */
function _school_crawl_csv_parse_eol($data, $len, $i = 0, $eof = FALSE)
{
  /* Past the end of the buffer: only an EOL if the buffer ends the file. */
  if ($len <= $i)
    return $eof ? $i : FALSE;
  if ($data[$i] == "\n")
    return $i + 1;
  /* A lone "\r" is not treated as EOL; it must be followed by "\n". */
  if ($data[$i] == "\r" && $len > $i + 1 && $data[$i + 1] == "\n")
    return $i + 2;
  return FALSE;
}

/**
 * \brief
 *   Read a string of CSV and return it as an array of row arrays.
 *
 * \param $data
 *   CSV data to parse. Parsed data shall be deleted.
 * \param $options
 *   An array with any number of the following optional arguments
 *   which have the documented defaults:
 *   - delimiter (','): The character which delimits fields.
 *   - eof (FALSE): Whether there will be no more data coming.
 *     Normally, if the $data ends without a newline this function
 *     will assume that it cannot assume that there is an implicit
 *     newline. Some improper files don't have the extra newline at
 *     their end and thus this is needed to support them.
 *   - stream (unset): If set to an array containing the keys
 *     'callback' and 'state', will call the 'callback' which is a
 *     function($state, $row) with $state set to the value in 'state'
 *     instead of storing all rows and returning them all.
 * \return
 *   An array with an entry for each line in the CSV file where each
 *   line's entry is an array of the items in that row. An empty array
 *   will be returned in the case that there is insufficient data to
 *   read a line (or insufficient data to tell if the line is
 *   complete, see $options['eof']). If the 'stream' option is set in
 *   $options, then the return value shall be the number of rows
 *   parsed.
 */
function school_crawl_csv_parse(&$data, array $options = array())
{
  $options += array(
    'delimiter' => ',',
    'eof' => FALSE,
  );

  $i = 0;
  /* Offset just past the last completely parsed line; everything before it is dropped from $data. */
  $last_line_i = $i;
  $strlen_data = strlen($data);
  $streammode = !empty($options['stream']);
  /* In stream mode the result is a row counter, otherwise the list of rows. */
  if ($streammode)
    $ret = 0;
  else
    $ret = array();

  while ($i < $strlen_data)
    {
      $row = array();

      $quote = FALSE;
      $entry = '';
      while ($quote
             || (_school_crawl_csv_parse_eol($data, $strlen_data, $i, $options['eof']) === FALSE))
        {
          /*
           * There are two ways to read data. One within the
           * doublequoted entry and the other outside of that.
           */
          if ($quote)
            {
              switch ($data[$i])
                {
                case '"':
                  /*
                   * This is either a lone quote or the terminating
                   * quote. It is a terminating quote if the next
                   * character is EOL or non-quote.
                   */
                  if ($strlen_data > $i + 1 && $data[$i + 1] != '"'
                      || _school_crawl_csv_parse_eol($data, $strlen_data, $i + 1, $options['eof']) !== FALSE)
                    {
                      $quote = FALSE;
                      /*
                       * can't fall-through for this case, eat the
                       * doublequote.
                       */
                      break;
                    }
                  else
                    {
                      /*
                       * We got `""' inside of a doublequoted string,
                       * which is CSV's way of escaping a
                       * doublequote. Thus, eat one of the two
                       * doublequotes.
                       */
                      $i ++;
                      /*
                       * or we don't yet have enough data... The outer
                       * loop also will break on its own in this case...
                       */
                      if ($strlen_data <= $i)
                        break;
                      /* fall-through to append doublequote */
                    }

                default:
                  $entry .= $data[$i];
                }
            }
          else /* if ($quote) */
            {
              switch ($data[$i])
                {
                case '"':
                  /**
                   * \todo
                   *   Decide if we want to parse ``a,b"c",d'' as
                   *   ["a", "b\"c\"", "d"] or (current) ["a", "bc",
                   *   "d"].
                   */
                  $quote = TRUE;
                  break;

                case $options['delimiter']:
                  $row[] = $entry;
                  $entry = '';
                  break;

                default:
                  $entry .= $data[$i];
                }
            }

          $i ++;
          if ($i >= $strlen_data)
            break;
        }

      /* Ignore the row just read if we encountered the end of the buffer before its EOL. */
      if (($next_i = _school_crawl_csv_parse_eol($data, $strlen_data, $i, $options['eof'])) === FALSE)
        break;

      $i = $next_i;
      $last_line_i = $i;
      $row[] = $entry;
      /* Deliver the row exactly once: to the callback or to the result list. */
      if ($streammode)
        {
          $options['stream']['callback']($options['stream']['state'], $row);
          $ret ++;
        }
      else
        $ret[] = $row;
    }

  /* Remove the consumed lines from the caller's buffer, keeping any partial trailing line. */
  if (!empty($last_line_i))
    {
      $data = substr($data, $last_line_i);
      if ($data === FALSE)
        $data = '';
    }

  return $ret;
}
school.d/hope.crawl.inc
Show inline comments
 
<?php /* -*- mode: php; -*- */
 
/*
 
 * Copyright 2012 Nathan Phillip Brink <ohnobinki@ohnopublishing.net>
 
 *
 
 * This file is a part of slate_permutate.
 
 *
 
 * slate_permutate is free software: you can redistribute it and/or modify
 
 * it under the terms of the GNU Affero General Public License as published by
 
 * the Free Software Foundation, either version 3 of the License, or
 
 * (at your option) any later version.
 
 *
 
 * slate_permutate is distributed in the hope that it will be useful,
 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
 * GNU Affero General Public License for more details.
 
 *
 
 * You should have received a copy of the GNU Affero General Public License
 
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 
 */
 

	
 
/* Row-parser state: still skipping introductory lines, looking for the CSV header row. */
define('SP_HOPE_CRAWL_STATE_PREHEADER', 1);
 
/* Row-parser state: the header row has been parsed; rows are treated as section rows. */
define('SP_HOPE_CRAWL_STATE_SECTIONS', 2);
 

	
 
/**
 * \brief
 *   Start a Hope crawling session.
 *
 * Resets the cookies, points $uri at the schedule entry page, fetches
 * that page and prepares a DOM/XPath pair for it.
 *
 * \param $school
 *   The school handle; its 'id' is used in the error message.
 * \param $uri
 *   Set to the URI of the entry page (updated by any redirects).
 * \param $cookies
 *   Reset to an empty array, then filled in by the request.
 * \param $dom
 *   Set to a DOMDocument into which the fetched page is loaded.
 * \param $xpath
 *   Set to a DOMXPath for $dom on success.
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \return
 *   0 on success, 1 if the page could not be fetched or parsed.
 */
function _hope_crawl_start(array $school, &$uri, array &$cookies, &$dom, &$xpath, &$school_crawl_log)
{
  $uri = 'http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule';
  $cookies = array();
  $dom = new DOMDocument();

  $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
  $page_loaded = !empty($html) && $dom->loadHTML($html);
  if ($page_loaded)
    {
      $xpath = new DOMXPath($dom);
      return 0;
    }

  school_crawl_logf($school_crawl_log, 2, "Unable to load the HTML document necessary to enumerate %s's list of semesters.",
                    $school['id']);
  return 1;
}
 

	
 
/**
 
 * \brief
 
 *   Crawl the list of available semesters from Hope.
 
 *
 
 * Crawling starts at
 
 * http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule . This is linked
 
 * to from http://hope.edu/registrar/nav/schedules.html and from
 
 * http://plus.hope.edu/ (which redirects to a PROD page which has
 
 * `Release 8.4.2'. The HTTP server claims to be ``Server:
 
 * Oracle-Application-Server-10g/10.1.2.0.2 Oracle-HTTP-Server''.
 
 *
 
 * \param $school
 
 *   The school handle for Hope College.
 
 * \param $semesters
 
 *   The array to which Semester objects shall be appended.
 
 * \param $school_crawl_log
 
 *   The school_crawl_log handle.
 
 */
 
function hope_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
 
{
 
  $uri = NULL;
 
  $cookies = array();
 
  $dom = NULL;
 
  $xpath = NULL;
 
@@ -130,201 +133,280 @@ function hope_crawl_semester(array $scho
 
  $semester_form_action = $semester_form_node->getAttribute('action');
 
  $semester_form['term'] = $dom_select_term->getAttribute('value');
 

	
 
  foreach ($xpath->query('.//select[@name="sel_subj"]') as $dom_select_term)
 
    break;
 
  if (empty($dom_select_term))
 
    {
 
      school_crawl_logf($school_crawl_log, 4, "Unable to find Subject-selecting form input");
 
      return 1;
 
    }
 

	
 
  /*
 
   * Manually select all of the different sorts of subject materials
 
   * since selecting no subjects doesn't result in listing them all.
 
   */
 
  $semester_form['sel_subj'] = array();
 
  foreach (school_crawl_form_select_array($dom_select_term, FALSE) as $subject_name => $junk)
 
    $semester_form['sel_subj'][] = $subject_name;
 

	
 
  if (!empty($semester_form_action))
 
    $uri = school_crawl_url($uri, $semester_form_action);
 
  $sections_html = school_crawl_geturi($uri, $cookies, $school_crawl_log, $semester_form);
 

	
 
  /*
 
   * Get an HTML-based results page. We only get this page because it
 
   * has a <form /> which we can submit to get CSV.
 
   */
 
  $sections_dom = new DOMDocument();
 
  if (empty($sections_html)
 
      || !$sections_dom->loadHTML($sections_html))
 
    {
 
      school_crawl_logf($school_crawl_log, 2, "Unable to load section listings page.");
 
      return 1;
 
    }
 
  $sections_xpath = new DOMXPath($sections_dom);
 

	
 
  /* Look for the "Export to Excel" submit button */
 
  $sections_form = $sections_xpath->query('.//form[.//input[@type = "submit" and contains(@value, "xport")]]')->item(0);
 
  if (empty($sections_form))
 
    {
 
      school_crawl_logf($school_crawl_log, 2, "Unable to find CSV link for schedule.");
 
      return 1;
 
    }
 

	
 
  /* Get the CSV */
 
  $sections_form_action = $sections_form->getAttribute('action');
 
  if (!empty($sections_form_action))
 
    $uri = school_crawl_url($uri, $sections_form_action);
 
  $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form));
 

	
 
  /*
 
   * Oracle likes to put random `"' into the middle of a quoted string
 
   * instead of properly escaping it like ``"This is a string with a
 
   * "" in it"''. This regex blasts away such doublequotes which are
 
   * not adjacent to delimiters (hopefully).
 
   */
 
  $sections_csv = preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $sections_csv);
 
  $sections_csv = school_crawl_csv_parse($sections_csv, array('eof' => TRUE));
 
  /* Skip the introductory lines, seeking for the field headers */
 
  for ($i = 0; $i < count($sections_csv) && count($sections_csv[$i]) < 2; $i ++)
 
    ;
 

	
 
  $fields = array(
 
    'Status' => FALSE /*< OPEN, RESTRICTED, IN PROGRESS, or empty */,
 
    'Title' => FALSE /*< course name */,
 
    'Subject' => FALSE /*< subject id */,
 
    'Course Number' => FALSE,
 
    'Section Number' => FALSE,
 
    'CRN' => FALSE /*< section synonym */,
 
    'Cred' => FALSE /*< Number of credits, can be a range which would be formatted like "  1-4" */,
 
    /*
 
     * ex. "FA1", "FA2", "CH2" (online course?), "CD4", "SRS"
 
     * (seniors). If a course has multiple attributes, it will have
 
     * multiple lines following it with the attributes but no other
 
     * fields filled?
 
     */
 
    'Attr' => FALSE,
 
    /*
 
     * The first of 8 columns being Day + times. "M" (or "TBA"), "T",
 
     * "W", "R", "F", <saturday>?, <sunday>?, "1600-1800" or "TBA".
 
     */
 
    'Meeting Days/Times' => FALSE,
 
    'Location' => FALSE /*< The room or TBA */,
 
    'Capacity' => FALSE /*< Probably the maximum number of students */,
 
    'Actual' => FALSE /*< Possibly the current number of students? */,
 
    'Remainder' => FALSE  /*< Number of spots to be filled... */,
 
    'Instructor' => FALSE /*< The prof/instructor */,
 
    /*
 
     * The start/end dates in form of 07/02-07/27. This would be
 
     * particularly important for supporting half-semester
 
     * courses. Bug #122.
 
     */
 
    'Date' => FALSE,
 
    'Weeks' => FALSE /*< The total number of weeks the course meets */,
 
  );
 
  $state = array(
 
    'semester' => $semester,
 
    'fields' => $fields,
 
    'data' => '',
 
    'data_unfiltered' => '', /*< Data not yet passed through _hope_crawl_semester_csv_filter() */
 
    'expected_columns' => 0, /*< The number of columns expected to be in a section row, calculated when parsing the header row. */
 
    'rollover_values' => array(), /*< The values of columns which may be used multiple times, such as for sections with multiple meetings. */
 
    'school_crawl_log' => &$school_crawl_log,
 
    'state' => SP_HOPE_CRAWL_STATE_PREHEADER,
 
  );
 
  $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form),
 
				      FALSE, NULL, array(
 
					'writefunc' => '_hope_crawl_semester_csv',
 
					'writestate' => &$state,
 
				      ));
 
  /* Deliver the EOF */
 
  $state['data'] .= _hope_crawl_semester_csv_filter($state['data_unfiltered']);
 
  school_crawl_csv_parse($state['data'], array('eof' => TRUE, 'stream' => array('callback' => '_hope_crawl_semester_csv_row', 'state' => &$state)));
 
}
 

	
 
  foreach ($sections_csv[$i] as $column => $name)
 
    if (!empty($name))
 
      $fields[$name] = $column;
 
  $expected_columns = max($fields);
 
  foreach ($fields as $name => $location)
 
    if ($location === FALSE)
 
      {
 
	school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.",
 
			  $name, implode(',', $sections_csv[$i]));
 
	return 1;
 
      }
 
/**
 * \brief
 *   Filter the CSV so that doublequotes are properly escaped.
 *
 * \param $lines
 *   One or more complete lines of CSV. Partial lines should be
 *   withheld for later filtering.
 * \return
 *   $lines with each doublequote which is not adjacent to a
 *   delimiter or line break doubled.
 */
function _hope_crawl_semester_csv_filter($lines)
{
  /*
   * Oracle likes to put random `"' into the middle of a quoted string
   * instead of properly escaping it like ``"This is a string with a
   * "" in it"''. This regex doubles such doublequotes which are
   * not adjacent to delimiters (hopefully).
   *
   * The neighbouring characters are checked with lookbehind/lookahead
   * rather than being consumed by the match. A consuming pattern
   * cannot match twice around a single character, so of the two
   * quotes in ``"The "A" Team"'' only the first would get escaped.
   */
  return preg_replace('/(?<=[^,\\n\\r])"(?=[^,\\n\\r])/', '""', $lines);
}
 

	
 
/**
 * \brief
 *   libcurl WRITEFUNC callback for parsing CSV.
 *
 * Received data is buffered until at least one complete line is
 * available. Complete lines are passed through
 * _hope_crawl_semester_csv_filter() and then handed to
 * school_crawl_csv_parse() in stream mode, which calls
 * _hope_crawl_semester_csv_row() for every row.
 *
 * \param $state
 *   The state. Its 'data_unfiltered' entry buffers data for which no
 *   newline has been seen yet, its 'data' entry holds filtered data
 *   which has not yet been consumed by the CSV parser.
 * \param $data
 *   The chunk of data which was just received.
 *
 * \return
 *   The number of bytes in $data or a different number to indicate
 *   error.
 */
function _hope_crawl_semester_csv(&$state, $data)
{
  $state['data_unfiltered'] .= $data;
  $last_newline_pos = strrpos($state['data_unfiltered'], "\n");
  if ($last_newline_pos === FALSE)
    /* Not enough new data */
    return strlen($data);

  /* Only complete lines may be filtered; the trailing partial line stays buffered. */
  $state['data'] .= _hope_crawl_semester_csv_filter(substr($state['data_unfiltered'], 0, $last_newline_pos + 1));
  /* Cast because substr() yields FALSE instead of '' on older PHP when nothing is left. */
  $state['data_unfiltered'] = (string)substr($state['data_unfiltered'], $last_newline_pos + 1);

  school_crawl_csv_parse($state['data'], array('stream' => array('callback' => '_hope_crawl_semester_csv_row', 'state' => &$state)));

  return strlen($data);
}
 

	
 
/**
 * \brief
 *   Handle one parsed row of Hope's CSV export.
 *
 * Used as the 'stream' callback of school_crawl_csv_parse(). While in
 * SP_HOPE_CRAWL_STATE_PREHEADER, rows with fewer than two entries are
 * skipped and the first longer row is taken as the header which maps
 * column names to positions. In SP_HOPE_CRAWL_STATE_SECTIONS each row
 * is turned into a section meeting which is added to the semester.
 *
 * \param $state
 *   The crawl state: 'semester', 'fields', 'expected_columns',
 *   'rollover_values', 'school_crawl_log' and 'state' are used.
 * \param $row
 *   The array of entries of one CSV row.
 * \return
 *   1 if the header row lacks an expected column, 0 otherwise.
 */
function _hope_crawl_semester_csv_row(&$state, $row)
{
  $expected_columns =& $state['expected_columns'];
  $fields =& $state['fields'];
  $rollover_values =& $state['rollover_values'];
  $school_crawl_log =& $state['school_crawl_log'];
  $semester = $state['semester'];

  switch ($state['state'])
    {
    case SP_HOPE_CRAWL_STATE_PREHEADER:
      if (count($row) < 2)
        /*
         * Skip the introductory lines, seeking for the field headers.
         */
        break;

      /*
       * Came upon the header line… parse the header and switch to
       * sections mode.
       */
      foreach ($row as $column => $name)
        if (!empty($name))
          $fields[$name] = $column;
      $expected_columns = max($fields);
      foreach ($fields as $name => $location)
        if ($location === FALSE)
          {
            school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.",
                              $name, implode(',', $row));
            return 1;
          }

      /* Label the days of the week and Times column */
      foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name)
        $fields[$name] = $fields['Meeting Days/Times'] + $offset;

      $state['state'] = SP_HOPE_CRAWL_STATE_SECTIONS;
      break;

    case SP_HOPE_CRAWL_STATE_SECTIONS:
      $section_csv = $row;

      if (count($section_csv) < $expected_columns)
        {
          school_crawl_logf($school_crawl_log, 8, "Skipping row which has fewer entries than expected (%d): %s",
                            $expected_columns, implode(', ', $section_csv));
          /* There is no enclosing loop in this callback: leave the switch to skip the row. */
          break;
        }

      /*
       * If a section has multiple meetings, each extra meeting is
       * placed on a row following the first section's entry. However,
       * the course/synonym/section/subject are all blank on that
       * line. Therefore, we must propagate these values.
       */
      foreach (array(
        'subject_id' => 'Subject',
        'course_id' => 'Course Number',
        'title' => 'Title',
        'section_id' => 'Section Number',
        'synonym' => 'CRN',
        'instructor' => 'Instructor',
        'location' => 'Location',
      ) as $var => $field)
        {
          $rollover_values += array($var => ''); /*< (Inefficient) */
          if (strlen(trim($section_csv[$fields[$field]])))
            $rollover_values[$var] = trim($section_csv[$fields[$field]]);
        }
      $subject_id = $rollover_values['subject_id'];
      $course_id = $rollover_values['course_id'];
      $title = $rollover_values['title'];
      $section_id = $rollover_values['section_id'];
      $synonym = $rollover_values['synonym'];
      $instructor = $rollover_values['instructor'];
      $location = $rollover_values['location'];

      if ($section_csv[$fields['M']] == 'TBA'
          || $section_csv[$fields['Times']] == 'TBA')
        {
          $semester->class_add(new Course($subject_id . '-' . $course_id,
                                          $section_csv[$fields['Title']]));
          school_crawl_logf($school_crawl_log, 8, "Course %s-%s-%s has a section meeting with a TBA time, adding dummy course.",
                            $subject_id, $course_id, $section_id);
          break;
        }

      $date_start = $date_end = NULL;
      if (preg_match(',(\\d\\d)/(\\d\\d)-(\\d\\d)/(\\d\\d),', $section_csv[$fields['Date']], $matches))
        {
          list(, $m_start, $d_start, $m_end, $d_end) = $matches;
          if ($m_start && $d_start && $m_end && $d_end)
            {
              $y_start = $y_end = $semester->year_get();
              /* An end month before the start month means the range wraps into the next year. */
              if ($m_end < $m_start)
                $y_end ++;
              $date_start = gmmktime(0, 0, 0, $m_start, $d_start, $y_start);
              $date_end = gmmktime(0, 0, 0, $m_end, $d_end, $y_end);
            }
        }

      if (trim($section_csv[$fields['U']]))
        school_crawl_logf($school_crawl_log, 0, "Section %d has sunday.", $synonym);
      $days = school_crawl_days_format($school_crawl_log, array_filter(array_slice($section_csv, $fields['M'], 7), '_hope_crawl_days_filter'));
      /* Pad so that a Times entry without a `-' fails the length check below instead of raising a notice. */
      list($time_start, $time_end) = array_pad(explode('-', $section_csv[$fields['Times']]), 2, '');
      if (strlen($time_start) != 4 || strlen($time_end) != 4)
        {
          school_crawl_logf($school_crawl_log, 4, "Section meeting (synonym=%s) has invalidly-formatted start time (%s) or end time (%s). Skipping.",
                            $synonym, $time_start, $time_end);
          break;
        }

      /*
       * Guessing the type of section_meeting: `attribute' of NSL
       * seems to be associated with labs. Matches `lab', `lab.', `
       * lab', ` labo'..., etc.
       */
      $type = 'lecture';
      if (preg_match('/(^|[^a-z])lab($|o|[^a-z])/i', $title))
        $type = 'lab';

      $section_meeting = new SectionMeeting($days, $time_start, $time_end,
                                            $location,
                                            $type,
                                            $instructor,
                                            $date_start, $date_end);
      $semester->section_meeting_add($subject_id,
                                     $course_id,
                                     $title,
                                     $section_id,
                                     $synonym,
                                     $section_meeting,
                                     $type,
                                     $section_csv[$fields['Cred']]);
      break;
    }
  return 0;
}
0 comments (0 inline, 0 general)