# HG changeset patch
# User Nathan Phillip Brink <ohnobinki@ohnopublishing.net>
# Date 2012-11-22 15:53:37
# Node ID 1838a6f6fa2089f44f078a5c39d8ec9f46f244fa
# Parent  37146f2e0683bc183acf325b612a1630df2a877d

Update Hope College crawler to be stream/chunk based, lowering its memory usage.

diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc
--- a/inc/school.crawl.inc
+++ b/inc/school.crawl.inc
@@ -298,24 +298,43 @@ function school_crawl_meeting_type($meet
  * \param $loopspin
  *   An internal variable to prevent us from following perpetual
  *   redirects.
+ * \param $options
+ *   Extra optional arguments with keys as follows:
+ *   - 'writefunc': A curl-compatible write function of the form
+ *       function($state, $data) which returns the number of bytes
+ *       consumed. This must equal the number of bytes received
+ *       unless the transfer should be aborted. Setting this and using
+ *       $follow_meta_refresh are mutually exclusive; combining them
+ *       causes undefined behavior.
+ *   - 'writestate': The value which should be passed to writefunc as
+ *       the $state parameter.
  * \return
  *   The body of the document returned by the server (normally
- *   malformed HTML, especially with Calvin's WebAdvisor
- *   installation).
+ *   malformed HTML, especially with Calvin's WebAdvisor installation)
+ *   or, if 'writestate' and 'writefunc' are set, the value stored in
+ *   'writestate'.
  */
-function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $loopspin = 0)
+function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $options = array(), $loopspin = 0)
 {
-  global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf;
+  global $school_crawl_geturi_writefunc,
+    $school_crawl_geturi_writestate,
+    $school_crawl_geturi_headers_buf;
 
   school_crawl_logf($school_crawl_log, 7, "school_crawl_geturi('%s').", $uri);
 
+  $options += array(
+    'writefunc' => 'school_crawl_geturi_writefunc_cb',
+    'writestate' => '',
+  );
+  $school_crawl_geturi_writefunc = $options['writefunc'];
+  $GLOBALS['school_crawl_geturi_writestate'] = &$options['writestate'];
+
   $curl = curl_init();
   curl_setopt($curl, CURLOPT_USERAGENT, SP_PACKAGE_NAME . '/' . SP_PACKAGE_VERSION);
 
   if ($curlsetup_hook !== NULL)
     $curlsetup_hook($curl);
 
-  $school_crawl_geturi_write_buf = '';
   $school_crawl_geturi_headers_buf = '';
   curl_setopt($curl, CURLOPT_URL, $uri);
 
@@ -405,7 +424,7 @@ function school_crawl_geturi(&$uri, &$co
   if ($follow_meta_refresh)
     {
       $dom = new DOMDocument();
-      $dom->loadHTML($school_crawl_geturi_write_buf);
+      $dom->loadHTML($options['writestate']);
       foreach ($dom->getElementsByTagName('meta') as $meta_node)
 	if ($meta_node->hasAttribute('http-equiv')
 	    && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv')))
@@ -424,13 +443,13 @@ function school_crawl_geturi(&$uri, &$co
 	  }
     }
 
-  school_crawl_logf($school_crawl_log, 10, "%s", $school_crawl_geturi_write_buf);
+  school_crawl_logf($school_crawl_log, 10, "%s", $options['writestate']);
   if ($location && $loopspin < 6)
     {
       $uri = $location;
-      return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $loopspin + 1);
+      return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $options, $loopspin + 1);
     }
-  return $school_crawl_geturi_write_buf;
+  return $options['writestate'];
 }
 
 function school_crawl_geturi_header_cb($curl, $header_buf)
@@ -440,11 +459,16 @@ function school_crawl_geturi_header_cb($
   return strlen($header_buf);
 }
 
+function school_crawl_geturi_writefunc_cb(&$writebuf, $data)
+{
+  $writebuf .= $data;
+  return strlen($data);
+}
+
 function school_crawl_geturi_write_cb($curl, $write_buf)
 {
-  global $school_crawl_geturi_write_buf;
-  $school_crawl_geturi_write_buf .= $write_buf;
-  return strlen($write_buf);
+  global $school_crawl_geturi_writefunc, $school_crawl_geturi_writestate;
+  return $school_crawl_geturi_writefunc($school_crawl_geturi_writestate, $write_buf);
 }
 
 /**
@@ -742,7 +766,7 @@ function _school_crawl_csv_parse_eol($da
 
 /**
  * \brief
- *   Read a line of CSV and return it as an array.
+ *   Read a string of CSV and return it as an array of row arrays.
  *
  * \param $data
  *   CSV data to parse. Parsed data shall be deleted.
@@ -755,12 +779,19 @@ function _school_crawl_csv_parse_eol($da
  *     will assume that it cannot assume that there is an implicit
  *     newline. Some improper files don't have the extra newline at
  *     their end and thus this is needed to support them.
+ *   - stream (unset): If set to an array containing the keys
+ *     'callback' and 'state', each parsed row is passed to
+ *     'callback', a function($state, $row) called with $state set to
+ *     the value in 'state', instead of being stored and returned.
  * \return
  *   An array with an entry for each line in the CSV file where each
  *   line's entry is an array of the items in that row. An empty array
  *   will be returned in the case that there is insufficient data to
  *   read a line (or insufficient data to tell if the line is
- *   complete, see $options['eof']).
+ *   complete, see $options['eof']). If the 'stream' option is set in
+ *   $options, then each row is handed to the callback and the return
+ *   value shall be the number of rows parsed instead of the array of
+ *   rows.
  */
 function school_crawl_csv_parse(&$data, array $options = array())
 {
@@ -769,10 +800,14 @@ function school_crawl_csv_parse(&$data, 
     'eof' => FALSE,
   );
 
-  $ret = array();
   $i = 0;
   $last_line_i = $i;
   $strlen_data = strlen($data);
+  $streammode = !empty($options['stream']);
+  if ($streammode)
+    $ret = 0;
+  else
+    $ret = array();
 
   while ($i < $strlen_data)
     {
@@ -861,7 +896,13 @@ function school_crawl_csv_parse(&$data, 
 	$i = $next_i;
 	$last_line_i = $i;
 	$row[] = $entry;
-	$ret[] = $row;
+	if ($streammode)
+	  {
+	    $options['stream']['callback']($options['stream']['state'], $row);
+	    $ret ++;
+	  }
+	else
+	  $ret[] = $row;
     }
 
   if (!empty($last_line_i))
diff --git a/school.d/hope.crawl.inc b/school.d/hope.crawl.inc
--- a/school.d/hope.crawl.inc
+++ b/school.d/hope.crawl.inc
@@ -18,6 +18,9 @@
  * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+define('SP_HOPE_CRAWL_STATE_PREHEADER', 1);
+define('SP_HOPE_CRAWL_STATE_SECTIONS', 2);
+
 /**
  * \brief
  *   Start a Hope crawling session.
@@ -175,19 +178,6 @@ function hope_crawl_semester(array $scho
   $sections_form_action = $sections_form->getAttribute('action');
   if (!empty($sections_form_action))
     $uri = school_crawl_url($uri, $sections_form_action);
-  $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form));
-
-  /*
-   * Oracle likes to put random `"' into the middle of a quoted string
-   * instead of properly escaping it like ``"This is a string with a
-   * "" in it"''. This regex blasts away such doublequotes which are
-   * not adjacent to delimiters (hopefully).
-   */
-  $sections_csv = preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $sections_csv);
-  $sections_csv = school_crawl_csv_parse($sections_csv, array('eof' => TRUE));
-  /* Skip the introductory lines, seeking for the field headers */
-  for ($i = 0; $i < count($sections_csv) && count($sections_csv[$i]) < 2; $i ++)
-    ;
 
   $fields = array(
     'Status' => FALSE /*< OPEN, RESTRICTED, IN PROGRESS, or empty */,
@@ -222,26 +212,114 @@ function hope_crawl_semester(array $scho
     'Date' => FALSE,
     'Weeks' => FALSE /*< The total number of weeks the course meets */,
   );
+  $state = array(
+    'semester' => $semester,
+    'fields' => $fields,
+    'data' => '',
+    'data_unfiltered' => '', /*< Data not yet passed through _hope_crawl_semester_csv_filter() */
+    'expected_columns' => 0, /*< The number of columns expected to be in a section row, calculated when parsing the header row. */
+    'rollover_values' => array(), /*< The values of columns which may be used multiple times, such as for sections with multiple meetings. */
+    'school_crawl_log' => &$school_crawl_log,
+    'state' => SP_HOPE_CRAWL_STATE_PREHEADER,
+  );
+  $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form),
+				      FALSE, NULL, array(
+					'writefunc' => '_hope_crawl_semester_csv',
+					'writestate' => &$state,
+				      ));
+  /* Deliver the EOF */
+  $state['data'] .= _hope_crawl_semester_csv_filter($state['data_unfiltered']);
+  school_crawl_csv_parse($state['data'], array('eof' => TRUE, 'stream' => array('callback' => '_hope_crawl_semester_csv_row', 'state' => &$state)));
+}
 
-  foreach ($sections_csv[$i] as $column => $name)
-    if (!empty($name))
-      $fields[$name] = $column;
-  $expected_columns = max($fields);
-  foreach ($fields as $name => $location)
-    if ($location === FALSE)
-      {
-	school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.",
-			  $name, implode(',', $sections_csv[$i]));
-	return 1;
-      }
+/**
+ * \brief
+ *   Filter the CSV so that doublequotes are properly escaped.
+ *
+ * \param $lines
+ *   One or more complete lines of CSV. Partial lines should be
+ *   withheld for later filtering.
+ */
+function _hope_crawl_semester_csv_filter($lines)
+{
+  /*
+   * Oracle likes to put random `"' into the middle of a quoted string
+   * instead of properly escaping it like ``"This is a string with a
+   * "" in it"''. This regex escapes (by doubling) such doublequotes
+   * which are not adjacent to delimiters (hopefully).
+   */
+  return preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $lines);
+}
+
+/**
+ * \brief
+ *   libcurl WRITEFUNC callback for parsing CSV.
+ *
+ * \param $state
+ *   The crawl state array; unparsed input is buffered in it.
+ * \param $data
+ *   The newly received chunk of data.
+ *
+ * \return
+ *   The number of bytes in $data or a different number to indicate
+ *   error.
+ */
+function _hope_crawl_semester_csv(&$state, $data)
+{
+  $state['data_unfiltered'] .= $data;
+  $last_newline_pos = strrpos($state['data_unfiltered'], "\n");
+  if ($last_newline_pos === FALSE)
+    /* Not enough new data */
+    return strlen($data);
+  $state['data'] .= _hope_crawl_semester_csv_filter(substr($state['data_unfiltered'], 0, $last_newline_pos + 1));
+  $state['data_unfiltered'] = substr($state['data_unfiltered'], $last_newline_pos + 1);
+
+  school_crawl_csv_parse($state['data'], array('stream' => array('callback' => '_hope_crawl_semester_csv_row', 'state' => &$state)));
 
-  /* Label the days of the week and Times column */
-  foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name)
-    $fields[$name] = $fields['Meeting Days/Times'] + $offset;
+  return strlen($data);
+}
+
+function _hope_crawl_semester_csv_row(&$state, $row)
+{
+  $expected_columns =& $state['expected_columns'];
+  $fields =& $state['fields'];
+  $rollover_values =& $state['rollover_values'];
+  $school_crawl_log =& $state['school_crawl_log'];
+  $semester = $state['semester'];
+
+  switch ($state['state'])
+    {
+    case SP_HOPE_CRAWL_STATE_PREHEADER:
+      if (count($row) < 2)
+	/*
+	 * Skip the introductory lines, seeking the field headers.
+	 */
+	break;
 
-  for ($i ++; $i < count($sections_csv); $i ++)
-    {
-      $section_csv = $sections_csv[$i];
+      /*
+       * Came upon the header line… parse the header and switch to
+       * sections mode.
+       */
+      foreach ($row as $column => $name)
+	if (!empty($name))
+	  $fields[$name] = $column;
+      $expected_columns = max($fields);
+      foreach ($fields as $name => $location)
+	if ($location === FALSE)
+	  {
+	    school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.",
+			      $name, implode(',', $row));
+	    return 1;
+	  }
+
+      /* Label the days of the week and Times column */
+      foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name)
+	$fields[$name] = $fields['Meeting Days/Times'] + $offset;
+
+      $state['state'] = SP_HOPE_CRAWL_STATE_SECTIONS;
+      break;
+    case SP_HOPE_CRAWL_STATE_SECTIONS:
+      $section_csv = $row;
 
       if (count($section_csv) < $expected_columns)
 	{
@@ -265,8 +343,12 @@ function hope_crawl_semester(array $scho
 	'instructor' => 'Instructor',
 	'location' => 'Location',
       ) as $var => $field)
-	if (strlen(trim($section_csv[$fields[$field]])))
-	  ${$var} = trim($section_csv[$fields[$field]]);
+	{
+	  $rollover_values += array($var => ''); /*< (Inefficient) */
+	  ${$var} =& $rollover_values[$var];
+	  if (strlen(trim($section_csv[$fields[$field]])))
+	    ${$var} = trim($section_csv[$fields[$field]]);
+	}
 
       if ($section_csv[$fields['M']] == 'TBA'
 	  || $section_csv[$fields['Times']] == 'TBA')
@@ -325,6 +407,6 @@ function hope_crawl_semester(array $scho
 				     $section_meeting,
 				     $type,
 				     $section_csv[$fields['Cred']]);
+      break;
     }
-  return 0;
 }