*
 * This file is a part of slate_permutate.
 *
 * slate_permutate is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * slate_permutate is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * \file
 *   Routines that are only useful when crawling schools' websites for
 *   autofill section data.
 */

/**
 * \brief
 *   Build a fresh school_crawl_log handle.
 *
 * The handle is a plain array holding the school, two empty output
 * buffers ('html' and 'plain') and whatever options the caller
 * supplied.
 *
 * \param $school
 *   The school for which this crawl handle is.
 * \param $opts
 *   An array optionally containing any of the following keys:
 *   - stream: an fopen()-compatible stream to fwrite()/fprintf() output to.
 *   - page: a Page object used to help format HTML output.
 *   - verbosity: A number from 0 through 10 describing the desired
 *     verbosity. Defaults to 5 when not given.
 *   Keys named 'school' or 'out' in $opts are ignored; the handle's
 *   own values take precedence.
 * \return
 *   The new handle array.
 */
function school_crawl_log_init(array $school, $opts = array())
{
    $defaults = array('verbosity' => 5);
    $handle = array(
        'school' => $school,
        'out' => array(
            'html' => array(),
            'plain' => array(),
        ),
    );

    /* Array union keeps the left-hand value when a key exists on both sides. */
    return $handle + ($opts + $defaults);
}

/**
 * \brief
 *   Log progress of a crawler.
 *
 * This function's arguments take the same style as fprintf() does.
 *
 * \param $school_crawl_log
 *   The logging resource.
 * \param $verboseness
 *   The verbosity level at which to log the message. Should be a
 *   value from 0 to 10, where 0 is unconditionally printed and 5 is
 *   the default.
 * \param $format
 *   The printf()-style format string.
*/
function school_crawl_logf(array &$school_crawl_log, $verboseness, $format)
{
    /*
     * The handle is taken by reference: the lines appended to
     * $school_crawl_log['out'] below have to land in the caller's
     * handle, otherwise school_crawl_log_fetch() could never return
     * them.
     */
    if ($verboseness > $school_crawl_log['verbosity']) {
        /*
         * The given message gives us more detail than we want. Therefore,
         * discard it.
         */
        return;
    }

    /* Everything after $format is an argument for the format string. */
    $log_line = vsprintf($format, array_slice(func_get_args(), 3));
    $school_id = $school_crawl_log['school']['id'];
    $plain_line = sprintf("%s_crawl(): %s\n", $school_id, $log_line);

    /* store output in a place where it's retrievable */
    $school_crawl_log['out']['plain'][] = $plain_line;

    /* store the HTML flavour too when a Page object is available */
    if (isset($school_crawl_log['page'])) {
        /*
         * NOTE(review): this format string consumes only two of the
         * three arguments passed; presumably markup that used the
         * element_self_close() result has been lost -- confirm against
         * the upstream copy of this file.
         */
        $school_crawl_log['out']['html'][] = sprintf("\n%s_crawl(): %s\n\n\n",
            $school_id,
            htmlentities($log_line),
            $school_crawl_log['page']->element_self_close());
    }

    /* print to a stream potentially */
    if (isset($school_crawl_log['stream']))
        fwrite($school_crawl_log['stream'], $plain_line);

    return 0;
}

/**
 * \brief
 *   Recover stored crawling log stuffage.
 *
 * \param $school_crawl_log
 *   The school_crawl_log handle whose stored output is wanted.
 * \param $html
 *   Whether to retrieve formatted HTML output if it's available. If
 *   the handle has no 'page', the plain lines are HTML-escaped and
 *   have their newlines converted with nl2br() instead.
 * \return
 *   An array of output lines.
 */
function school_crawl_log_fetch(array $school_crawl_log, $html = FALSE)
{
    if (!$html)
        return $school_crawl_log['out']['plain'];

    if (isset($school_crawl_log['page']))
        return $school_crawl_log['out']['html'];

    /* htmlentities() and nl2br() work on strings, so escape line by line. */
    $escaped_lines = array();
    foreach ($school_crawl_log['out']['plain'] as $plain_line)
        $escaped_lines[] = nl2br(htmlentities($plain_line));
    return $escaped_lines;
}

/**
 * \brief
 *   Parse a simple time string into slate_permutate's time
 *   representation.
 *
 * \param $time
 *   An array compatible with the return value of strptime(). The only
 *   fields we use are 'tm_hour', which is from 0 through 23, and
 *   'tm_min', which may be from 0 through 59.
 * \return
 *   A four-character string: the zero-padded hour followed by the
 *   zero-padded minute, e.g. '0905'.
 */
function school_crawl_time_format($time)
{
    return sprintf('%02d%02d', $time['tm_hour'], $time['tm_min']);
}

/**
 * \brief
 *   Equivalent of mktime() except that it accepts strptime()'s output
 *   format as an input.
 *
 * \param $tm
 *   An array formatted as the output of strptime(). 'tm_mon' is
 *   shifted by one and 'tm_year' by 1900 before being handed to
 *   mktime().
 * \return
 *   A unix timestamp.
 */
function school_crawl_mktime(array $tm)
{
    return mktime(
        $tm['tm_hour'],
        $tm['tm_min'],
        $tm['tm_sec'],
        $tm['tm_mon'] + 1,
        $tm['tm_mday'],
        $tm['tm_year'] + 1900);
}

/**
 * \brief
 *   Take an array of day names and assemble them into
 *   slate_permutate's internal (weird) representation of a set of
 *   weekdays.
 *
 * This function is intended to make it easy for one to take the
 * output of an explode() call. For example, to decode $days_str =
 * 'Monday, Tuesday, Friday', one would do
 * school_crawl_days_format($school_crawl_log, explode(', ', $days_str));
 *
 * \param $school_crawl_log
 *   A school_crawl_log handle to report errors to.
* \param $days
 *   An array of day names. Case does not matter. Each entry may be a
 *   full name, a common abbreviation or a truncation; only the first
 *   two characters are looked at. One-character entries are accepted
 *   as well: 'm', 't', 'w', 'h', 'f', where 'h' (or 'r') means
 *   Thursday so that it can be told apart from Tuesday. 's' is for
 *   Saturday, based on CCBCMD.
 * \return
 *   slate_permutate's strange internal days representation: one
 *   letter per distinct day, in the order the days were first seen.
 */
function school_crawl_days_format(array $school_crawl_log, $days)
{
    static $one_char_map = array('m' => 'm', 't' => 't', 'w' => 'w', 'h' => 'h', 'r' => 'h', 'f' => 'f', 's' => 's');
    static $two_char_map = array('th' => 'h');

    $seen = array();
    foreach ($days as $day_name) {
        $abbrev = strtolower(substr(trim($day_name), 0, 2));

        /* Reduce a two-char abbreviation to its one-char form. */
        if (strlen($abbrev) > 1)
            $abbrev = isset($two_char_map[$abbrev]) ? $two_char_map[$abbrev] : substr($abbrev, 0, 1);

        if (!isset($one_char_map[$abbrev])) {
            school_crawl_logf($school_crawl_log, 5, "school_crawl_days_format() got invalid day specifier: `%s' => `%s'.",
                $day_name, $abbrev);
            continue;
        }

        /* Keyed by day letter so that repeated days collapse into one. */
        $seen[$one_char_map[$abbrev]] = TRUE;
    }

    return implode('', array_keys($seen));
}

/**
 * \brief
 *   Format a string made up of single-letter day initials.
 *
 * The string is split into its individual characters, which are then
 * handed to school_crawl_days_format().
 *
 * \param $school_crawl_log
 *   The school_crawl_log handle to write errors out to.
 * \param $days_str
 *   Example input: 'mwf', 'TR'.
 * \return
 *   Same as school_crawl_days_format()
 */
function school_crawl_days_str_format(array $school_crawl_log, $days_str)
{
    $initials = array();
    $length = strlen($days_str);
    $pos = 0;
    while ($pos < $length) {
        $initials[] = $days_str[$pos];
        $pos++;
    }

    return school_crawl_days_format($school_crawl_log, $initials);
}

/**
 * \brief
 *   Try to guess a more standardized section_meeting type.
 *
 * \param $meeting_type
 *   The upstream's meeting_type, such as 'LEC', 'lec', 'LAB',
 *   etc. New mappings should be added to this function as long as
 *   they are general enough.
*/
function school_crawl_meeting_type($meeting_type = 'lecture')
{
    static $known_types = array(
        'lec' => 'lecture',
        'lab' => 'lab',
        'dis' => 'discussion',
    );

    /* An empty upstream value is treated as a lecture. */
    $normalized = empty($meeting_type) ? 'lecture' : $meeting_type;
    $normalized = strtolower(trim($normalized));

    /* Try the whole value first, then just its first three characters. */
    foreach (array($normalized, substr($normalized, 0, 3)) as $candidate)
        if (!empty($known_types[$candidate]))
            return $known_types[$candidate];

    /* Unknown types are passed through, lowercased and trimmed. */
    return $normalized;
}

/**
 * \brief
 *   Simulate some aspects of a web browser while retrieving a
 *   document.
 *
 * This allows us to view our cookies in an associative array and to
 * have the server's response automatically update our cookies.
 *
 * If $post is specified as an associative array, an HTTP POST is
 * performed and the data is encoded properly as if we were performing
 * a form submission.
 *
 * Follows redirects. If there is a redirect, the page from which you
 * are redirected is lost... but few people put any information on
 * those pages anyways ;-).
 *
 * \param $uri
 *   The URL to fetch. If a redirect occurs, this is updated.
 * \param $cookies
 *   An associative array of cookies and where to save new cookies.
 * \param $school_crawl_log
 *   The school_crawl_log handle to use.
 * \param $post
 *   If not NULL, causes an HTTP POST. In that case, should be an
 *   associative array of form keys/values.
 * \param $follow_meta_refresh
 *   Parse the resultant HTML with http://docs.php.net/dom and if it
 *   contains an element that looks like
 *   ``<meta http-equiv="Refresh" content="0; url=http://example.org/"/>'',
 *   follow that URL.
 * \param $curlsetup_hook
 *   A function which is passed a curl handle which allows the caller
 *   to do silly things like setting CURLOPT_SSLVERSION for silly
 *   sites like ccbcmd's registration site.
 * \param $loopspin
 *   An internal variable to prevent us from following perpetual
 *   redirects.
 * \return
 *   The body of the document returned by the server (normally
 *   malformed HTML, especially with Calvin's WebAdvisor
 *   installation).
*/
function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $loopspin = 0)
{
    /*
     * The response body and the raw response headers are collected in
     * these two globals by school_crawl_geturi_write_cb() and
     * school_crawl_geturi_header_cb().
     */
    global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf;

    school_crawl_logf($school_crawl_log, 7, "school_crawl_geturi('%s').", $uri);

    $curl = curl_init();

    /* Give the caller a chance to configure the handle before we do. */
    if ($curlsetup_hook !== NULL)
        $curlsetup_hook($curl);

    /* Start from empty buffers; the callbacks only ever append. */
    $school_crawl_geturi_write_buf = '';
    $school_crawl_geturi_headers_buf = '';
    curl_setopt($curl, CURLOPT_URL, $uri);

    /* Flatten the cookie array into a ``key=val;key=val'' string. */
    $cookies_str = '';
    foreach ($cookies as $key => $val) {
        if (strlen($cookies_str))
            $cookies_str .= ';';
        $cookies_str .= $key . '=' . $val;
    }

    school_crawl_logf($school_crawl_log, 10, "cookies sent: %s", $cookies_str);

    curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
    curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'school_crawl_geturi_header_cb');
    curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'school_crawl_geturi_write_cb');

    /*
     * The comparison with NULL is loose, so an empty array is skipped
     * here just like NULL is.
     */
    if ($post != NULL && is_array($post)) {
        /* var_dump($post); */
        $posttxt = '';
        foreach ($post as $postkey => $postvals) {
            /*
             * This not escaping MEMBER thing is Calvin-specific
             * too. Maybe we need a way to ask for some particular char
             * not to be encoded?
             */

            /*
             * Apparently, browsers like seamonkey will send multiple
             * values for a field if another input exists with
             * name="field", like: field=1&field=blah. It seems like the
             * webserver for ccbcmd cares about having these multiple
             * values too...
             *
             * Yes, sending subj_sel=dummy&subj_sel=%25 made _all_ of
             * the difference. Wow.
             */
            if (!is_array($postvals))
                $postvals = array($postvals);

            /*
             * One key=value pair per value, joined with '&'. Values
             * whose key contains 'MEMBER' are appended without
             * urlencode().
             */
            foreach ($postvals as $postval)
                $posttxt .= (strlen($posttxt) ? '&' : '') . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
        }
        school_crawl_logf($school_crawl_log, 10, "Setting POST to %s", $posttxt);

        /* curl_setopt($curl, CURLOPT_POST, TRUE); */
        /* NOTE(review): presumably setting CURLOPT_POSTFIELDS alone is what turns this into a POST -- confirm. */
        curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
    }

    /* NOTE(review): the result of curl_exec() is not checked, so a failed transfer is indistinguishable from an empty document. */
    curl_exec($curl);
    curl_close($curl);

    /* Walk the collected response headers looking for cookies and redirects. */
    $location = NULL;
    foreach (explode("\r\n", $school_crawl_geturi_headers_buf) as $header) {
        /*
         * yes, we don't want the line if the first char is a ':' or if it has no ':'
         */
        if (!strpos($header, ':'))
            continue;

        list($header_name, $header_val) = explode(': ', $header, 2);
        school_crawl_logf($school_crawl_log, 9, "%s: %s", $header_name, $header_val);

        /*
         * NOTE(review): header names are matched case-sensitively, so a
         * server sending e.g. 'set-cookie' or 'location' would be
         * ignored -- confirm whether that matters for the crawled sites.
         */
        switch($header_name) {
        case 'Set-Cookie':
            /*
             * Everything after the first '=' becomes the cookie's value,
             * including whatever attributes follow it in the header.
             */
            list($cookie_name, $cookie_val) = explode('=', $header_val, 2);
            if (isset($cookies[$cookie_name]))
                school_crawl_logf($school_crawl_log, 10, "Replacing cookie %s=%s with...", $cookie_name, $cookies[$cookie_name]);
            school_crawl_logf($school_crawl_log, 10, "...new cookie %s=%s.", $cookie_name, $cookie_val);
            $cookies[$cookie_name] = $cookie_val;
            break;

        case 'Location':
            $location = $header_val;
            /* yes, a calvin-specific replacement :-/ */
            /* NOTE(review): a "\n" is appended to the redirect target here and ends up in $uri -- looks unintentional, confirm. */
            $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $location) . "\n";
            /* The request that follows a redirect carries no POST data. */
            $post = NULL;
            break;
        }
    }

    if ($follow_meta_refresh) {
        /* Look through the body for <meta http-equiv="refresh"> elements. */
        $dom = new DOMDocument();
        $dom->loadHTML($school_crawl_geturi_write_buf);
        /* There is no break in this loop, so the last usable refresh element decides $location. */
        foreach ($dom->getElementsByTagName('meta') as $meta_node)
            if ($meta_node->hasAttribute('http-equiv')
                && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv'))) {
                $meta_content = $meta_node->getAttribute('content');
                school_crawl_logf($school_crawl_log, 7, "Following http-equiv Refresh: %s", $meta_content);
                /* Expect ``<seconds>; url=<target>''; the 'url=' part is matched case-sensitively. */
                if (!(preg_match('/^[0-9]+; *url=(.*)$/', $meta_content, $meta_matches))) {
                    school_crawl_logf($school_crawl_log, 0, "Error following http-equiv Refresh: %s", $meta_content);
                } else {
                    $location = $meta_matches[1];
                    $post = NULL;
                }
            }
    }

    school_crawl_logf($school_crawl_log, 10, "%s", $school_crawl_geturi_write_buf);

    /*
     * Follow the redirect by recursing with $loopspin incremented. Once
     * $loopspin reaches 6 the redirect is not followed and the body of
     * the current response is returned instead.
     */
    if ($location && $loopspin < 6) {
        $uri = $location;
        return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $loopspin + 1);
    }
    return $school_crawl_geturi_write_buf;
}

/**
 * \brief
 *   Header callback installed by school_crawl_geturi() through
 *   CURLOPT_HEADERFUNCTION.
 *
 * Appends the given header data to the global
 * $school_crawl_geturi_headers_buf.
 *
 * \param $curl
 *   The curl handle (unused).
 * \param $header_buf
 *   The header data handed over by curl.
 * \return
 *   The length of $header_buf.
 */
function school_crawl_geturi_header_cb($curl, $header_buf)
{
    global $school_crawl_geturi_headers_buf;
    $school_crawl_geturi_headers_buf .= $header_buf;
    return strlen($header_buf);
}

/**
 * \brief
 *   Body callback installed by school_crawl_geturi() through
 *   CURLOPT_WRITEFUNCTION.
 *
 * Appends the given body data to the global
 * $school_crawl_geturi_write_buf.
 *
 * \param $curl
 *   The curl handle (unused).
 * \param $write_buf
 *   The chunk of body data handed over by curl.
 * \return
 *   The length of $write_buf.
 */
function school_crawl_geturi_write_cb($curl, $write_buf)
{
    global $school_crawl_geturi_write_buf;
    $school_crawl_geturi_write_buf .= $write_buf;
    return strlen($write_buf);
}

/**
 * \brief
 *   Finds the closest parent of a DOM element with a certain tag
 *   name.
 *
 * Useful for finding the
<form> element associated with a given
 * <input>s so that the form's action=""
 * parameter may be found.
 *
 * The node itself passed in will be considered for whether or not it
 * matches the $element_name.
 *
 * \param $node
 *   The dom node whose ancestor should be found.
 * \param $element_name
 *   The name of the ancestor element which is requested. It is
 *   compared against DOMElement::tagName with strcmp(), i.e.
 *   case-sensitively.
 * \return
 *   The DOMElement sought or NULL if not found.
 */
function school_crawl_element_ancestor(DOMElement $node, $element_name)
{
    /*
     * Walk upwards only for as long as we are looking at elements. The
     * parent of the root element is the DOMDocument, which is not a
     * DOMElement and has no tagName, so the search stops there and
     * reports ``not found'' instead of passing it on as if it were an
     * element.
     */
    for ($candidate = $node; $candidate instanceof DOMElement; $candidate = $candidate->parentNode) {
        if (!strcmp($candidate->tagName, $element_name))
            return $candidate;
    }

    return NULL;
}

/**
 * \brief
 *   Create an array based on an HTML form for submitting the form.
 *
 * Currently, this will only support the and 's