SlatePermutate Changeset - 1838a6f6fa20 · protofusion repositories

Changeset - 1838a6f6fa20

Parent rev.

Child rev.

[Not reviewed]

default

0 2 0

Nathan Brink (binki) - 13 years ago 2012-11-22 15:53:37
ohnobinki@ohnopublishing.net

Update Hope College crawler to be stream/chunk based, lowering its memory usage.

2 files changed with 157 insertions and 34 deletions:

inc/school.crawl.inc

school.d/hope.crawl.inc

101

0 comments (0 inline, 0 general)

inc/school.crawl.inc

➞

Show inline comments

@@ @@ -289,42 +289,61 @@ function school_crawl_meeting_type($meet @@
  *   associative array of form keys/values.
  * \param $follow_meta_refresh
  *   Parse the resultant HTML with http://docs.php.net/dom and if it
  *   contains a line that looks like ``<meta http-equiv="Refresh" content="0; url=https://simon.ccbcmd.edu/pls/PROD/bwckschd.p_disp_dyn_sched">'',
  *   follow that URL.
  * \param $curlsetup_hook
  *   A function which is passed a curl handle which allows the caller
  *   to do silly things like setting CURLOPT_SSLVERSION for silly
  *   sites like ccbcmd's registration site.
  * \param $loopspin
  *   An internal variable to prevent us from following perpetual
  *   redirects.
  * \param $options
  *   Extra optional arguments with keys as follows:
  *   - 'writefunc': A curl-compatible write function of the form
  *       function($state, $data) and returns the number of eaten bytes
  *       which must be equal to the number of bytes received unless
  *       the transfer should be aborted. Setting this and using
  *       $follow_meta_refresh are mutually exclusive and will cause
  *       undefined behavior.
  *   - 'writestate': The value which should be passed to writefunc as
  *       the $state parameter.
  * \return
  *   The body of the document returned by the server (normally
  *   malformed HTML, especially with Calvin's WebAdvisor
  *   installation).
  *   malformed HTML, especially with Calvin's WebAdvisor installation)
  *   or, if 'writestate' and 'writefunc' are set, the value stored in
  *   'writestate'.
  */
 function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $loopspin = 0)
+function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $options = array(), $loopspin = 0)
+{
   global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf;
   global $school_crawl_geturi_writefunc,
     $school_crawl_geturi_writestate,
     $school_crawl_geturi_headers_buf;
   school_crawl_logf($school_crawl_log, 7, "school_crawl_geturi('%s').", $uri);
   $options += array(
     'writefunc' => 'school_crawl_geturi_writefunc_cb',
     'writestate' => '',
   );
   $school_crawl_geturi_writefunc = $options['writefunc'];
   $GLOBALS['school_crawl_geturi_writestate'] = &$options['writestate'];
   $curl = curl_init();
   curl_setopt($curl, CURLOPT_USERAGENT, SP_PACKAGE_NAME . '/' . SP_PACKAGE_VERSION);
   if ($curlsetup_hook !== NULL)
     $curlsetup_hook($curl);
   $school_crawl_geturi_write_buf = '';
   $school_crawl_geturi_headers_buf = '';
   curl_setopt($curl, CURLOPT_URL, $uri);
   $cookies_str = '';
   foreach ($cookies as $key => $val)
+    {
       if (strlen($cookies_str))
 	$cookies_str .= ';';
       $cookies_str .= $key . '=' . $val;
+    }
   school_crawl_logf($school_crawl_log, 10, "cookies sent: %s", $cookies_str);
@@ @@ -396,64 +415,69 @@ function school_crawl_geturi(&$uri, &$co @@
 	case 'Location':
 	  $location = $header_val;
 	  /* yes, a calvin-specific replacement :-/ */
 	  $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $location) . "\n";
 	  $post = NULL;
 	  break;
+	}
+    }
   if ($follow_meta_refresh)
+    {
       $dom = new DOMDocument();
-      $dom->loadHTML($school_crawl_geturi_write_buf);
+      $dom->loadHTML($options['writestate']);
       foreach ($dom->getElementsByTagName('meta') as $meta_node)
 	if ($meta_node->hasAttribute('http-equiv')
 	    && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv')))
+	  {
 	    $meta_content = $meta_node->getAttribute('content');
 	    school_crawl_logf($school_crawl_log, 7, "Following http-equiv Refresh: %s", $meta_content);
 	    if (!(preg_match('/^[0-9]+; *url=(.*)$/', $meta_content, $meta_matches)))
+	      {
 		school_crawl_logf($school_crawl_log, 0, "Error following http-equiv Refresh: %s", $meta_content);
+	      }
 	    else
+	      {
 		$location = $meta_matches[1];
 		$post = NULL;
+	      }
+	  }
+    }
-  school_crawl_logf($school_crawl_log, 10, "%s", $school_crawl_geturi_write_buf);
+  school_crawl_logf($school_crawl_log, 10, "%s", $options['writestate']);
   if ($location && $loopspin < 6)
+    {
       $uri = $location;
       return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $loopspin + 1);
+      return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $options, $loopspin + 1);
+    }
-  return $school_crawl_geturi_write_buf;
+  return $options['writestate'];
+}
 /**
  * \brief
  *   Accumulate a chunk of response header data.
  *
  * Appends $header_buf to the global $school_crawl_geturi_headers_buf.
  * NOTE(review): presumably registered as curl's header callback by
  * school_crawl_geturi(); the registration is not visible in this
  * hunk -- confirm.
  *
  * \param $curl
  *   Not used by this function.
  * \param $header_buf
  *   The data to append to the global headers buffer.
  * \return
  *   strlen($header_buf).
  */
 function school_crawl_geturi_header_cb($curl, $header_buf)
+{
   global $school_crawl_geturi_headers_buf;
   $school_crawl_geturi_headers_buf .= $header_buf;
   return strlen($header_buf);
+}
 /**
  * \brief
  *   Write function which buffers received data into a string.
  *
  * school_crawl_geturi() uses this as the default value of its
  * 'writefunc' option.
  *
  * \param $writebuf
  *   The buffer, taken by reference, to which $data is appended.
  * \param $data
  *   The data to append.
  * \return
  *   strlen($data).
  */
 function school_crawl_geturi_writefunc_cb(&$writebuf, $data)
+{
   $writebuf .= $data;
   return strlen($data);
+}
 /**
  * \brief
  *   Write callback taking a curl handle and a chunk of data.
  *
  * NOTE(review): as displayed, this body contains two global/return
  * sequences, so everything after the first return is unreachable.
  * This looks like the removed and the added lines of the changeset
  * shown together without their diff markers; presumably the first
  * three statements are the old body and the last two are the new
  * body -- confirm against the repository.
  *
  * \param $curl
  *   Not used by this function.
  * \param $write_buf
  *   The chunk of data received.
  * \return
  *   As displayed, strlen($write_buf).
  */
 function school_crawl_geturi_write_cb($curl, $write_buf)
+{
   /* Append the chunk to the global string buffer. */
   global $school_crawl_geturi_write_buf;
   $school_crawl_geturi_write_buf .= $write_buf;
   return strlen($write_buf);
   /*
    * Forward the chunk to the function named by the global
    * $school_crawl_geturi_writefunc, passing the global
    * $school_crawl_geturi_writestate as its first argument, and
    * return whatever it returns.
    */
   global $school_crawl_geturi_writefunc, $school_crawl_geturi_writestate;
   return $school_crawl_geturi_writefunc($school_crawl_geturi_writestate, $write_buf);
+}
 /**
  * \brief
  *   Finds the closest parent of a DOM element with a certain tag
  *   name.
+ *
  * Useful for finding the <form /> element associated with a given
  * <select /> or set of <input />s so that the form's action=""
  * parameter may be found.
+ *
  * The node itself passed in will be considered for whether or not it
@@ @@ -733,55 +757,66 @@ function _school_crawl_csv_parse_eol($da @@
+{
   if ($len <= $i)
     return $eof ? $i : FALSE;
   if ($data[$i] == "\n")
     return $i + 1;
   if ($data[$i] == "\r" && $len > $i + 1 && $data[$i + 1] == "\n")
       return $i + 2;
   return FALSE;
+}
 /**
  * \brief
- *   Read a line of CSV and return it as an array.
+ *   Read a string of CSV and return it as an array of row arrays.
+ *
  * \param $data
  *   CSV data to parse. Parsed data shall be deleted.
  * \param $options
  *   An array with any number of the following optional arguments
  *   which have the documented defaults:
  *   - delimiter (','): The character which delimits fields.
  *   - eof (FALSE): Whether there will be no more data coming.
  *     Normally, if $data ends without a newline, this function
  *     treats the last line as incomplete instead of assuming an
  *     implicit trailing newline. Some improper files lack the final
  *     newline, so this option is needed to support them.
  *   - stream (unset): If set to an array containing the keys
  *     'callback' and 'state', will call the 'callback' which is a
  *     function($state, $row) with $state set to the value in 'state'
  *     instead of storing all rows and returning them all.
  * \return
  *   An array with an entry for each line in the CSV file where each
  *   line's entry is an array of the items in that row. An empty array
  *   will be returned in the case that there is insufficient data to
  *   read a line (or insufficient data to tell if the line is
  *   complete, see $options['eof']).
  *   complete, see $options['eof']). If the 'stream' option is set in
  *   $options, then the return value shall be the number of rows
  *   parsed.
  */
 function school_crawl_csv_parse(&$data, array $options = array())
+{
   $options += array(
     'delimiter' => ',',
     'eof' => FALSE,
   );
   $ret = array();
   $i = 0;
   $last_line_i = $i;
   $strlen_data = strlen($data);
   $streammode = !empty($options['stream']);
   if ($streammode)
     $ret = 0;
   else
     $ret = array();
   while ($i < $strlen_data)
+    {
 	$row = array();
 	$quote = FALSE;
 	$entry = '';
 	while ($quote
 	       || (_school_crawl_csv_parse_eol($data, $strlen_data, $i, $options['eof']) === FALSE))
+	  {
 	    /*
 	     * There are two ways to read data. One within the
@@ @@ -852,24 +887,30 @@ function school_crawl_csv_parse(&$data, @@
 	    $i ++;
 	    if ($i >= $strlen_data)
 	      break;
+	  }
 	/* Discard the row just read if we hit the end of the buffer before its newline */
 	if (($next_i = _school_crawl_csv_parse_eol($data, $strlen_data, $i, $options['eof'])) === FALSE)
 	  break;
 	$i = $next_i;
 	$last_line_i = $i;
 	$row[] = $entry;
 	if ($streammode)
+	  {
 	    $options['stream']['callback']($options['stream']['state'], $row);
 	    $ret ++;
+	  }
 	else
 	$ret[] = $row;
+    }
   if (!empty($last_line_i))
+    {
       $data = substr($data, $last_line_i);
       if ($data === FALSE)
 	$data = '';
+    }
   return $ret;
+}

school.d/hope.crawl.inc

➞

Show inline comments

@@ @@ -9,24 +9,27 @@ @@
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
+ *
  * slate_permutate is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU Affero General Public License for more details.
+ *
  * You should have received a copy of the GNU Affero General Public License
  * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
  */
 define('SP_HOPE_CRAWL_STATE_PREHEADER', 1);
 define('SP_HOPE_CRAWL_STATE_SECTIONS', 2);
 /**
  * \brief
  *   Start a Hope crawling session.
  */
 function _hope_crawl_start(array $school, &$uri, array &$cookies, &$dom, &$xpath, &$school_crawl_log)
+{
   $cookies = array();
   $uri = 'http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule';
   $dom = new DOMDocument();
   $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
   if (empty($html)
@@ @@ -166,37 +169,24 @@ function hope_crawl_semester(array $scho @@
   /* Look for the "Export to Excel" submit button */
   $sections_form = $sections_xpath->query('.//form[.//input[@type = "submit" and contains(@value, "xport")]]')->item(0);
   if (empty($sections_form))
+    {
       school_crawl_logf($school_crawl_log, 2, "Unable to find CSV link for schedule.");
       return 1;
+    }
   /* Get the CSV */
   $sections_form_action = $sections_form->getAttribute('action');
   if (!empty($sections_form_action))
     $uri = school_crawl_url($uri, $sections_form_action);
   $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form));
   /*
    * Oracle likes to put random `"' into the middle of a quoted string
    * instead of properly escaping it like ``"This is a string with a
    * "" in it"''. This regex blasts away such doublequotes which are
    * not adjacent to delimiters (hopefully).
    */
   $sections_csv = preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $sections_csv);
   $sections_csv = school_crawl_csv_parse($sections_csv, array('eof' => TRUE));
   /* Skip the introductory lines, seeking for the field headers */
   for ($i = 0; $i < count($sections_csv) && count($sections_csv[$i]) < 2; $i ++)
+    ;
   $fields = array(
     'Status' => FALSE /*< OPEN, RESTRICTED, IN PROGRESS, or empty */,
     'Title' => FALSE /*< course name */,
     'Subject' => FALSE /*< subject id */,
     'Course Number' => FALSE,
     'Section Number' => FALSE,
     'CRN' => FALSE /*< section synonym */,
     'Cred' => FALSE /*< Number of credits, can be a range which would be formatted like "  1-4" */,
     /*
      * ex. "FA1", "FA2", "CH2" (online course?), "CD4", "SRS"
      * (seniors). If a course has multiple attributes, it will have
@@ @@ -213,69 +203,161 @@ function hope_crawl_semester(array $scho @@
     'Capacity' => FALSE /*< Probably the maximum number of students */,
     'Actual' => FALSE /*< Possibly the current number of students? */,
     'Remainder' => FALSE  /*< Number of spots to be filled... */,
     'Instructor' => FALSE /*< The prof/instructor */,
     /*
      * The start/end dates in form of 07/02-07/27. This would be
      * particularly important for supporting half-semester
      * courses. Bug #122.
      */
     'Date' => FALSE,
     'Weeks' => FALSE /*< The total number of weeks the course meets */,
   );
   $state = array(
     'semester' => $semester,
     'fields' => $fields,
     'data' => '',
     'data_unfiltered' => '', /*< Data not yet passed through _hope_crawl_semester_csv_filter() */
     'expected_columns' => 0, /*< The number of columns expected to be in a section row, calculated when parsing the header row. */
     'rollover_values' => array(), /*< The values of columns which may be used multiple times, such as for sections with multiple meetings. */
     'school_crawl_log' => &$school_crawl_log,
     'state' => SP_HOPE_CRAWL_STATE_PREHEADER,
   );
   $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form),
 				      FALSE, NULL, array(
 					'writefunc' => '_hope_crawl_semester_csv',
 					'writestate' => &$state,
 				      ));
   /* Deliver the EOF */
   $state['data'] .= _hope_crawl_semester_csv_filter($state['data_unfiltered']);
   school_crawl_csv_parse($state['data'], array('eof' => TRUE, 'stream' => array('callback' => '_hope_crawl_semester_csv_row', 'state' => &$state)));
+}
   foreach ($sections_csv[$i] as $column => $name)
 /**
  * \brief
  *   Filter the CSV so that doublequotes are properly escaped.
+ *
  * \param $lines
  *   One or more complete lines of CSV. Partial lines should be
  *   withheld for later filtering.
  * \return
  *   The result of preg_replace(): $lines with each matched
  *   doublequote replaced by two doublequotes.
  */
 function _hope_crawl_semester_csv_filter($lines)
+{
   /*
    * Oracle likes to put random `"' into the middle of a quoted string
    * instead of properly escaping it like ``"This is a string with a
    * "" in it"''. This regex blasts away such doublequotes which are
    * not adjacent to delimiters (hopefully).
    *
    * A doublequote matches only when the characters on both sides of
    * it are something other than a comma, a newline, or a carriage
    * return.
    *
    * NOTE(review): each match consumes the character on either side
    * of the doublequote and matches cannot overlap, so in input like
    * a"b"c only the first doublequote would be doubled -- confirm
    * whether such input occurs.
    */
   return preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $lines);
+}
 /**
  * \brief
  *   libcurl WRITEFUNC callback for parsing CSV.
+ *
  * Buffers incoming data, and each time at least one newline is
  * buffered, filters the complete lines and feeds them to
  * school_crawl_csv_parse() in stream mode with
  * _hope_crawl_semester_csv_row() as the row callback.
  *
  * \param $state
  *   The state array, taken by reference. This function reads and
  *   writes its 'data_unfiltered' key (received data not yet
  *   filtered) and its 'data' key (filtered data handed to the
  *   parser); the whole array is also passed by reference as the row
  *   callback's state.
  * \param $data
  *   The newly received chunk of data; it is appended to
  *   $state['data_unfiltered'].
+ *
  * \return
  *   The number of bytes in $data or a different number to indicate
  *   error. As written here, strlen($data) is always returned.
  */
 function _hope_crawl_semester_csv(&$state, $data)
+{
   $state['data_unfiltered'] .= $data;
   /* Only text up to and including the last newline is a run of complete lines. */
   $last_newline_pos = strrpos($state['data_unfiltered'], "\n");
   if ($last_newline_pos === FALSE)
     /* Not enough new data: no newline buffered yet, so keep buffering. */
     return strlen($data);
   /* Move the complete lines, filtered, into 'data' and keep the partial trailing line for later. */
   $state['data'] .= _hope_crawl_semester_csv_filter(substr($state['data_unfiltered'], 0, $last_newline_pos + 1));
   $state['data_unfiltered'] = substr($state['data_unfiltered'], $last_newline_pos + 1);
   /* Parse in stream mode; the 'eof' option is not set here. */
   school_crawl_csv_parse($state['data'], array('stream' => array('callback' => '_hope_crawl_semester_csv_row', 'state' => &$state)));
   return strlen($data);
+}
 function _hope_crawl_semester_csv_row(&$state, $row)
+{
   $expected_columns =& $state['expected_columns'];
   $fields =& $state['fields'];
   $rollover_values =& $state['rollover_values'];
   $school_crawl_log =& $state['school_crawl_log'];
   $semester = $state['semester'];
   switch ($state['state'])
+    {
     case SP_HOPE_CRAWL_STATE_PREHEADER:
       if (count($row) < 2)
 	/*
 	 * Skip the introductory lines, seeking for the field headers.
 	 */
 	break;
       /*
        * Came upon the header line… parse the header and switch to
        * sections mode.
        */
       foreach ($row as $column => $name)
     if (!empty($name))
       $fields[$name] = $column;
   $expected_columns = max($fields);
   foreach ($fields as $name => $location)
     if ($location === FALSE)
+      {
 	school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.",
-			  $name, implode(',', $sections_csv[$i]));
+			      $name, implode(',', $row));
 	return 1;
+      }
   /* Label the days of the week and Times column */
   foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name)
     $fields[$name] = $fields['Meeting Days/Times'] + $offset;
   for ($i ++; $i < count($sections_csv); $i ++)
+    {
       $section_csv = $sections_csv[$i];
       $state['state'] = SP_HOPE_CRAWL_STATE_SECTIONS;
       break;
     case SP_HOPE_CRAWL_STATE_SECTIONS:
       $section_csv = $row;
       if (count($section_csv) < $expected_columns)
+	{
 	  school_crawl_logf($school_crawl_log, 8, "Skipping row which has fewer entries than expected (%d): %s",
 			    $expected_columns, implode(', ', $section_csv));
 	  continue;
+	}
       /*
        * If a section has multiple meetings, each extra meeting is
        * placed on a row following the first section's entry. However,
        * the course/synonym/section/subject are all blank on that
        * line. Therefore, we must propagate these values.
        */
       foreach (array(
 	'subject_id' => 'Subject',
 	'course_id' => 'Course Number',
 	'title' => 'Title',
 	'section_id' => 'Section Number',
 	'synonym' => 'CRN',
 	'instructor' => 'Instructor',
 	'location' => 'Location',
       ) as $var => $field)
+	{
 	  $rollover_values += array($var => ''); /*< (Inefficient) */
 	  ${$var} =& $rollover_values[$var];
 	if (strlen(trim($section_csv[$fields[$field]])))
 	  ${$var} = trim($section_csv[$fields[$field]]);
+	}
       if ($section_csv[$fields['M']] == 'TBA'
 	  || $section_csv[$fields['Times']] == 'TBA')
+	{
 	  $semester->class_add(new Course($subject_id . '-' . $course_id,
 					  $section_csv[$fields['Title']]));
 	  school_crawl_logf($school_crawl_log, 8, "Course %s-%s-%s has a section meeting with a TBA time, adding dummy course.",
 			    $subject_id, $course_id, $section_id);
 	  continue;
+	}
       $date_start = $date_end = NULL;
@@ @@ -316,15 +398,15 @@ function hope_crawl_semester(array $scho @@
 					    $location,
 					    $type,
 					    $instructor,
 					    $date_start, $date_end);
       $semester->section_meeting_add($subject_id,
 				     $course_id,
 				     $title,
 				     $section_id,
 				     $synonym,
 				     $section_meeting,
 				     $type,
 				     $section_csv[$fields['Cred']]);
       break;
+    }
   return 0;
+}

0 comments (0 inline, 0 general)