# HG changeset patch # User Nathan Phillip Brink # Date 2011-02-05 22:44:18 # Node ID f19ed2cbeb20c0e04e59df869755a5d98073f753 # Parent 23515b1d9eeef27e4be5195ba761d258c08fce15 Add school ccbcmd (The Community College of Baltimore County) with crawler. Also, add additional crawler utility functions and add Saturday support to the crawling utility functions. diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc --- a/inc/school.crawl.inc +++ b/inc/school.crawl.inc @@ -56,13 +56,13 @@ function school_crawl_time_format($time) * simplicity. One-char representations are supported, however, but * use 'm', 't', 'w', 'h', 'f' to distinguish Thursday and * Tuesday. 'r' may also be used for Thursday.). Case does not - * matter. + * matter. 's' is for Saturday, based on CCBCMD. * \return * slate_permutate's strange internal days representation. */ function school_crawl_days_format($days) { - static $daymap_1 = array('m' => 'm', 't' => 't', 'w' => 'w', 'h' => 'h', 'r' => 'h', 'f' => 'f'); + static $daymap_1 = array('m' => 'm', 't' => 't', 'w' => 'w', 'h' => 'h', 'r' => 'h', 'f' => 'f', 's' => 's'); static $daymap_2 = array('th' => 'h'); $my_days = array(); @@ -137,6 +137,14 @@ function school_crawl_days_str_format($d * \param $post * If not NULL, causes an HTTP POST. In that case, should be an * associative array of form keys/values. + * \param $follow_meta_refresh + * Parse the resultant HTML with http://docs.php.net/dom and if it + * contains a line that looks like ``<meta http-equiv="Refresh" content="0; url=http://example.com/" />'', + * follow that URL. + * \param $curlsetup_hook + * A function which is passed a curl handle which allows the caller + * to do silly things like setting CURLOPT_SSLVERSION for silly + * sites like ccbcmd's registration site. * \param $verbosity * How verbose to be. * \param $loopspin @@ -147,7 +155,7 @@ function school_crawl_days_str_format($d * malformed HTML, especially with Calvin's WebAdvisor * installation). 
*/ -function school_crawl_geturi(&$uri, &$cookies, $post = NULL, $verbosity = 0, $loopspin = 0) +function school_crawl_geturi(&$uri, &$cookies, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $verbosity = 0, $loopspin = 0) { global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf, $school_crawl_geturi_verbosity; @@ -160,6 +168,9 @@ function school_crawl_geturi(&$uri, &$co $curl = curl_init(); + if ($curlsetup_hook !== NULL) + $curlsetup_hook($curl); + $school_crawl_geturi_verbosity = $verbosity; $school_crawl_geturi_write_buf = ''; $school_crawl_geturi_headers_buf = ''; @@ -185,9 +196,28 @@ function school_crawl_geturi(&$uri, &$co /* var_dump($post); */ $posttxt = ''; - foreach ($post as $postkey => $postval) + foreach ($post as $postkey => $postvals) { - $posttxt .= (strlen($posttxt) ? '&' : '') + /* + * This not escaping MEMBER thing is Calvin-specific + * too. Maybe we need a way to ask for some particular char + * not to be encoded? + */ + + /* + * Apparently, browsers like seamonkey will send multiple + * versions of <input name="field" /> if another input exists with name="field", like: + * field=1&field=blah. It seems like the webserver for + * ccbcmd cares about having these multiple values too... + * + * Yes, sending subj_sel=dummy&subj_sel=%25 made _all_ of + * the difference. Wow. + */ + if (!is_array($postvals)) + $postvals = array($postvals); + foreach ($postvals as $postval) + $posttxt .= (strlen($posttxt) ? '&' : '') . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval); } if ($verbosity > 8) @@ -229,18 +259,42 @@ function school_crawl_geturi(&$uri, &$co case 'Location': $location = $header_val; + /* yes, a calvin-specific replacement :-/ */ $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $location) . 
"\n"; $post = NULL; break; } } + if ($follow_meta_refresh) + { + $dom = new DOMDocument(); + $dom->loadHTML($school_crawl_geturi_write_buf); + foreach ($dom->getElementsByTagName('meta') as $meta_node) + if ($meta_node->hasAttribute('http-equiv') + && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv'))) + { + $meta_content = $meta_node->getAttribute('content'); + if ($verbosity > 2) + echo 'Following http-equiv Refresh: ' . $meta_content . PHP_EOL; + if (!(preg_match('/^[0-9]+; *url=(.*)$/', $meta_content, $meta_matches))) + { + echo 'Error following http-equiv Refresh: ' . $meta_content . PHP_EOL; + } + else + { + $location = $meta_matches[1]; + $post = NULL; + } + } + } + if ($verbosity > 9) echo $school_crawl_geturi_write_buf; if ($location && $loopspin < 6) { $uri = $location; - return school_crawl_geturi($uri, $cookies, $post, $loopspin + 1); + return school_crawl_geturi($uri, $cookies, $post, $follow_meta_refresh, $curlsetup_hook, $verbosity, $loopspin + 1); } return $school_crawl_geturi_write_buf; } @@ -258,3 +312,165 @@ function school_crawl_geturi_write_cb($c $school_crawl_geturi_write_buf .= $write_buf; return strlen($write_buf); } + +/** + * \brief + * Finds the closest parent of a DOM element with a certain tag + * name. + * + * Useful for finding the
<form> element associated with a given + * <input />s so that the form's action="" + * parameter may be found. + * + * The node itself passed in will be considered for whether or not it + * matches the $element_name. + * + * \param $node + * The dom node whose ancestor should be found. + * \param $element_name + * The name of the ancestor element which is requested. + * \return + * The DOMElement sought or NULL if not found. + */ +function school_crawl_element_ancestor(DOMElement $node, $element_name) +{ + if (!strcmp($node->tagName, $element_name)) + return $node; + if ($node->parentNode) + return school_crawl_element_ancestor($node->parentNode, $element_name); + return NULL; +} + +/** + * \brief + * Create an array based on an HTML form for submitting the form. + * + * Currently, this will only support the and