# HG changeset patch # User Nathan Phillip Brink # Date 2012-11-22 15:53:37 # Node ID 1838a6f6fa2089f44f078a5c39d8ec9f46f244fa # Parent 37146f2e0683bc183acf325b612a1630df2a877d Update Hope College crawler to be stream/chunk based, lowering its memory usage. diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc --- a/inc/school.crawl.inc +++ b/inc/school.crawl.inc @@ -298,24 +298,43 @@ function school_crawl_meeting_type($meet * \param $loopspin * An internal variable to prevent us from following perpetual * redirects. + * \param $options + * Extra optional arguments with keys as follows: + * - 'writefunc': A curl-compatible write function of the form + * function($state, $data) which returns the number of eaten bytes + * which must be equal to the number of bytes received unless + * the transfer should be aborted. Setting this and using + * $follow_meta_refresh are mutually exclusive; combining them causes + * undefined behavior. + * - 'writestate': The value which should be passed to writefunc as + * the $state parameter. * \return * The body of the document returned by the server (normally - * malformed HTML, especially with Calvin's WebAdvisor - * installation). + * malformed HTML, especially with Calvin's WebAdvisor installation) + * or, if 'writestate' and 'writefunc' are set, the value stored in + * 'writestate'. 
*/ -function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $loopspin = 0) +function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $options = array(), $loopspin = 0) { - global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf; + global $school_crawl_geturi_writefunc, + $school_crawl_geturi_writestate, + $school_crawl_geturi_headers_buf; school_crawl_logf($school_crawl_log, 7, "school_crawl_geturi('%s').", $uri); + $options += array( + 'writefunc' => 'school_crawl_geturi_writefunc_cb', + 'writestate' => '', + ); + $school_crawl_geturi_writefunc = $options['writefunc']; + $GLOBALS['school_crawl_geturi_writestate'] = &$options['writestate']; + $curl = curl_init(); curl_setopt($curl, CURLOPT_USERAGENT, SP_PACKAGE_NAME . '/' . SP_PACKAGE_VERSION); if ($curlsetup_hook !== NULL) $curlsetup_hook($curl); - $school_crawl_geturi_write_buf = ''; $school_crawl_geturi_headers_buf = ''; curl_setopt($curl, CURLOPT_URL, $uri); @@ -405,7 +424,7 @@ function school_crawl_geturi(&$uri, &$co if ($follow_meta_refresh) { $dom = new DOMDocument(); - $dom->loadHTML($school_crawl_geturi_write_buf); + $dom->loadHTML($options['writestate']); foreach ($dom->getElementsByTagName('meta') as $meta_node) if ($meta_node->hasAttribute('http-equiv') && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv'))) @@ -424,13 +443,13 @@ function school_crawl_geturi(&$uri, &$co } } - school_crawl_logf($school_crawl_log, 10, "%s", $school_crawl_geturi_write_buf); + school_crawl_logf($school_crawl_log, 10, "%s", $options['writestate']); if ($location && $loopspin < 6) { $uri = $location; - return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $loopspin + 1); + return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, 
$options, $loopspin + 1); } - return $school_crawl_geturi_write_buf; + return $options['writestate']; } function school_crawl_geturi_header_cb($curl, $header_buf) @@ -440,11 +459,16 @@ function school_crawl_geturi_header_cb($ return strlen($header_buf); } +function school_crawl_geturi_writefunc_cb(&$writebuf, $data) +{ + $writebuf .= $data; + return strlen($data); +} + function school_crawl_geturi_write_cb($curl, $write_buf) { - global $school_crawl_geturi_write_buf; - $school_crawl_geturi_write_buf .= $write_buf; - return strlen($write_buf); + global $school_crawl_geturi_writefunc, $school_crawl_geturi_writestate; + return $school_crawl_geturi_writefunc($school_crawl_geturi_writestate, $write_buf); } /** @@ -742,7 +766,7 @@ function _school_crawl_csv_parse_eol($da /** * \brief - * Read a line of CSV and return it as an array. + * Read a string of CSV and return it as an array of row arrays. * * \param $data * CSV data to parse. Parsed data shall be deleted. @@ -755,12 +779,19 @@ function _school_crawl_csv_parse_eol($da * will assume that it cannot assume that there is an implicit * newline. Some improper files don't have the extra newline at * their end and thus this is needed to support them. + * - stream (unset): If set to an array containing the keys + * 'callback' and 'state', will call the 'callback' which is a + * function($state, $row) with $state set to the value in 'state' + * instead of storing all rows and returning them all. * \return * An array with an entry for each line in the CSV file where each * line's entry is an array of the items in that row. An empty array * will be returned in the case that there is insufficient data to * read a line (or insufficient data to tell if the line is - * complete, see $options['eof']). + + * complete, see $options['eof']). If the 'stream' option is set in + * $options, then the return value shall be the number of rows + * parsed. 
*/ function school_crawl_csv_parse(&$data, array $options = array()) { @@ -769,10 +800,14 @@ function school_crawl_csv_parse(&$data, 'eof' => FALSE, ); - $ret = array(); $i = 0; $last_line_i = $i; $strlen_data = strlen($data); + $streammode = !empty($options['stream']); + if ($streammode) + $ret = 0; + else + $ret = array(); while ($i < $strlen_data) { @@ -861,7 +896,13 @@ function school_crawl_csv_parse(&$data, $i = $next_i; $last_line_i = $i; $row[] = $entry; - $ret[] = $row; + if ($streammode) + { + $options['stream']['callback']($options['stream']['state'], $row); + $ret ++; + } + else + $ret[] = $row; } if (!empty($last_line_i)) diff --git a/school.d/hope.crawl.inc b/school.d/hope.crawl.inc --- a/school.d/hope.crawl.inc +++ b/school.d/hope.crawl.inc @@ -18,6 +18,9 @@ * along with slate_permutate. If not, see . */ +define('SP_HOPE_CRAWL_STATE_PREHEADER', 1); +define('SP_HOPE_CRAWL_STATE_SECTIONS', 2); + /** * \brief * Start a Hope crawling session. @@ -175,19 +178,6 @@ function hope_crawl_semester(array $scho $sections_form_action = $sections_form->getAttribute('action'); if (!empty($sections_form_action)) $uri = school_crawl_url($uri, $sections_form_action); - $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form)); - - /* - * Oracle likes to put random `"' into the middle of a quoted string - * instead of properly escaping it like ``"This is a string with a - * "" in it"''. This regex blasts away such doublequotes which are - * not adjacent to delimiters (hopefully). 
- */ - $sections_csv = preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $sections_csv); - $sections_csv = school_crawl_csv_parse($sections_csv, array('eof' => TRUE)); - /* Skip the introductory lines, seeking for the field headers */ - for ($i = 0; $i < count($sections_csv) && count($sections_csv[$i]) < 2; $i ++) - ; $fields = array( 'Status' => FALSE /*< OPEN, RESTRICTED, IN PROGRESS, or empty */, @@ -222,26 +212,114 @@ function hope_crawl_semester(array $scho 'Date' => FALSE, 'Weeks' => FALSE /*< The total number of weeks the course meets */, ); + $state = array( + 'semester' => $semester, + 'fields' => $fields, + 'data' => '', + 'data_unfiltered' => '', /*< Data not yet passed through _hope_crawl_semester_csv_filter() */ + 'expected_columns' => 0, /*< The number of columns expected to be in a section row, calculated when parsing the header row. */ + 'rollover_values' => array(), /*< The values of columns which may be used multiple times, such as for sections with multiple meetings. */ + 'school_crawl_log' => &$school_crawl_log, + 'state' => SP_HOPE_CRAWL_STATE_PREHEADER, + ); + $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form), + FALSE, NULL, array( + 'writefunc' => '_hope_crawl_semester_csv', + 'writestate' => &$state, + )); + /* Deliver the EOF */ + $state['data'] .= _hope_crawl_semester_csv_filter($state['data_unfiltered']); + school_crawl_csv_parse($state['data'], array('eof' => TRUE, 'stream' => array('callback' => '_hope_crawl_semester_csv_row', 'state' => &$state))); +} - foreach ($sections_csv[$i] as $column => $name) - if (!empty($name)) - $fields[$name] = $column; - $expected_columns = max($fields); - foreach ($fields as $name => $location) - if ($location === FALSE) - { - school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. 
The column headings line looks like ``%s''.", - $name, implode(',', $sections_csv[$i])); - return 1; - } +/** + * \brief + * Filter the CSV so that doublequotes are properly escaped. + * + * \param $lines + * One or more complete lines of CSV. Partial lines should be + * withheld for later filtering. + */ +function _hope_crawl_semester_csv_filter($lines) +{ + /* + * Oracle likes to put random `"' into the middle of a quoted string + * instead of properly escaping it like ``"This is a string with a + * "" in it"''. This regex blasts away such doublequotes which are + * not adjacent to delimiters (hopefully). + */ + return preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $lines); +} + +/** + * \brief + * libcurl WRITEFUNC callback for parsing CSV. + * + * \param $state + * The state. + * \param $data + * The data read so far. + * + * \return + * The number of bytes in $data or a different number to indicate + * error. + */ +function _hope_crawl_semester_csv(&$state, $data) +{ + $state['data_unfiltered'] .= $data; + $last_newline_pos = strrpos($state['data_unfiltered'], "\n"); + if ($last_newline_pos === FALSE) + /* Not enough new data */ + return strlen($data); + $state['data'] .= _hope_crawl_semester_csv_filter(substr($state['data_unfiltered'], 0, $last_newline_pos + 1)); + $state['data_unfiltered'] = substr($state['data_unfiltered'], $last_newline_pos + 1); + + school_crawl_csv_parse($state['data'], array('stream' => array('callback' => '_hope_crawl_semester_csv_row', 'state' => &$state))); - /* Label the days of the week and Times column */ - foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name) - $fields[$name] = $fields['Meeting Days/Times'] + $offset; + return strlen($data); +} + +function _hope_crawl_semester_csv_row(&$state, $row) +{ + $expected_columns =& $state['expected_columns']; + $fields =& $state['fields']; + $rollover_values =& $state['rollover_values']; + $school_crawl_log =& $state['school_crawl_log']; + $semester = 
$state['semester']; + + switch ($state['state']) + { + case SP_HOPE_CRAWL_STATE_PREHEADER: + if (count($row) < 2) + /* + * Skip the introductory lines, seeking for the field headers. + */ + break; - for ($i ++; $i < count($sections_csv); $i ++) - { - $section_csv = $sections_csv[$i]; + /* + * Came upon the header line… parse the header and switch to + * sections mode. + */ + foreach ($row as $column => $name) + if (!empty($name)) + $fields[$name] = $column; + $expected_columns = max($fields); + foreach ($fields as $name => $location) + if ($location === FALSE) + { + school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.", + $name, implode(',', $row)); + return 1; + } + + /* Label the days of the week and Times column */ + foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name) + $fields[$name] = $fields['Meeting Days/Times'] + $offset; + + $state['state'] = SP_HOPE_CRAWL_STATE_SECTIONS; + break; + case SP_HOPE_CRAWL_STATE_SECTIONS: + $section_csv = $row; if (count($section_csv) < $expected_columns) { @@ -265,8 +343,12 @@ function hope_crawl_semester(array $scho 'instructor' => 'Instructor', 'location' => 'Location', ) as $var => $field) - if (strlen(trim($section_csv[$fields[$field]]))) - ${$var} = trim($section_csv[$fields[$field]]); + { + $rollover_values += array($var => ''); /*< (Inefficient) */ + ${$var} =& $rollover_values[$var]; + if (strlen(trim($section_csv[$fields[$field]]))) + ${$var} = trim($section_csv[$fields[$field]]); + } if ($section_csv[$fields['M']] == 'TBA' || $section_csv[$fields['Times']] == 'TBA') @@ -325,6 +407,6 @@ function hope_crawl_semester(array $scho $section_meeting, $type, $section_csv[$fields['Cred']]); + break; } - return 0; }