diff --git a/school.d/calvin.crawl.inc b/school.d/calvin.crawl.inc --- a/school.d/calvin.crawl.inc +++ b/school.d/calvin.crawl.inc @@ -18,6 +18,8 @@ * along with slate_permutate. If not, see . */ +require_once dirname(dirname(__FILE__)) . DIRECTORY_SEPARATOR . 'inc' . DIRECTORY_SEPARATOR . 'school.crawl.webadvisor.inc'; + /** * \brief * Retrieve a list of crawlable semesters from Calvin College. @@ -32,73 +34,7 @@ */ function calvin_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log) { - $season_map = array( - 'FA' => Semester::SEASON_FALL, - 'IN' => 'interim', - 'SP' => Semester::SEASON_SPRING, - 'MA' => 'may', - /* I don't know if SU is a valid Calvin Semester ID or not */ - 'SU' => Semester::SEASON_SUMMER); - - /** - * The first link we start at is the one from KV into WebAdvisor. - * - * 1. https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL - * - * - * Calls javascript:getWindowHTML(). This merely adds - * TOKENIDX=NULL to the query string, so we can skip this step - * and just have TOKENIDX=NULL. - * - * 2. https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL - * - * - * In the above, the second argument to setWindowHTML() is - * random. Thus, we have to capture this value. - */ - - $cookies = array(); - $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL'; - $semesters_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log); - - $semesters_dom = new DOMDocument(); - $semesters_dom->loadHTML($semesters_html); - - /* - * Discover the available semesters - */ - $semesters_var1 = $semesters_dom->getElementById('VAR1'); - if (empty($semesters_var1)) - { - school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters."); - return 1; - } - $semesters_select_nodes = $semesters_var1->childNodes; - foreach ($semesters_select_nodes as $semester_node) - { - if ($semester_node->tagName != 'option' - || !$semester_node->hasAttribute('value') - || !strlen($semester_node->getAttribute('value'))) - continue; - - $semester_str = $semester_node->getAttribute('value'); - - if (empty($season_map[substr($semester_str, 3)])) - { - school_crawl_logf($school_crawl_log, 6, "Warning: Unknown semester identification chars: %s. Skipping this semester.", - $semester_str); - continue; - } - $season = $season_map[substr($semester_str, 3)]; - $year_timespec = strptime(substr($semester_str, 0, 2), '%y'); - $year = $year_timespec['tm_year'] + 1900; - - $semester = new Semester($year, $season); - $semesters[$semester_str] = $semester; - } - $semester = array_reverse($semesters, TRUE); - - return 0; + return school_crawl_webadvisor_semester_list($school, $semesters, $school_crawl_log); } /** @@ -114,470 +50,7 @@ function calvin_crawl_semester_list(arra */ function calvin_crawl_semester(array $school, Semester $semester, &$school_crawl_log) { - $cookies = array(); - $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL'; - $html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log); - $form_uri = $uri; - $seed_dom = new DOMDocument(); - $seed_dom->loadHTML($html); - $return_url = dom_input_value($seed_dom, 'RETURN.URL'); - - /* - * First, read all of the friendly subject/department names. They're - * not in the output, but they're in the ``Subjects'' dropdown of - * the input form. The element and return its value attribute. - * - * \param $domdocument - * The DOMDocument to search. - * \param $name - * The name attribute of the element. - * \return - * The value attribute of the input element or NULL if not found. - */ -function dom_input_value($domdocument, $name) -{ - $xpath = new DOMXPath($domdocument); - $input_node_list = $xpath->query('/descendant::input[attribute::name="' . $name . '"]'); - - if (!$input_node_list->length) - return NULL; - $input_node = $input_node_list->item(0); - if (!$input_node->hasAttribute('value')) - return NULL; - return $input_node->getAttribute('value'); -} - -/** - * \brief - * Returns the content of an element with the given ID. - * - * A convenience function. - * - * \param $domdocument - * A DOMDocument to search. - * \param $id - * The id attribute of the element whose content are requested. - * \return - * A UTF-8 string of the contents of the given element or NULL if - * the element isn't found. - */ -function dom_id_content($domdocument, $id) -{ - $node = $domdocument->getElementById($id); - if ($node) - { - return $node->nodeValue; - } - return NULL; -} - -/** - * \brief - * Searches for and removes a

in a docs <head />, which - * is quite bad invalid HTML so that DOM can't handle it. - * - * \param $html - * The input HTML to filter. - * \return - * The fixed HTML. - */ -function calvin_crawl_noscript_filter($html) -{ - return preg_replace(';\<(noscript)\>.*?\</\1\>;s', '', $html); -} - -/** - * \brief - * Follows a URL with support for WebAdvisor's silly TOKENIDX= - * thing. - * - * Automatically filters with calvin_crawl_noscript_filter(). - * - * \param $uri - * The URL. - * \param $cookies - * The cookies (yum!). - * \param $school_crawl_log - * The school_crawl_log. - */ -function calvin_crawl_geturi(&$uri, array &$cookies, &$school_crawl_log) -{ - if (strpos($uri, 'TOKENIDX') === FALSE) - { - if (strpos($uri, '?') === FALSE) - $uri .= '?'; - else - $uri .= '&'; - - /* Starting value. */ - $uri .= 'TOKENIDX=NULL'; - } - - $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log)); - - if (!preg_match('/setWindowHTML$\'\', \'([0-9]+)\'$;/', $token_html, $matches)) - return $token_html; -$token = $matches[1]; - - school_crawl_logf($school_crawl_log, 7, "Using WebAdvisor token: %s.", $token); - school_crawl_logf($school_crawl_log, 7, ""); - - /* - * setWindowHTML() will first remove the query string parameters - * 'CLONE' and 'FORCEIDX'. Then it appends TOKENIDX=<token> to the - * query parameters. - * - * Example, where TOKENIDX does not start out as NULL but where a - * CLONE=Y command is being sent: - * - * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=1507971558 - * - * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=2281086932 - */ - $uri = preg_replace('/([?&])TOKENIDX=[^&]+/', '$1TOKENIDX=' . $token, - preg_replace('/([?&])(CLONE|FORCEIDX)=[^&]+&?/', '$1', $uri)); - - return calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log)); + return school_crawl_webadvisor_semester($school, $semester, $school_crawl_log); } /** @@ -593,7 +66,7 @@ function calvin_crawl_geturi(&$uri, arra * The course_id which, with the department string, forms a * fully-qualified course_id. */ -function calvin_crawl_course_add(Semester $semester, $department, $course_id, $title) +function calvin_crawl_course_add_(Semester $semester, $department, $course_id, $title) { if ($semester->class_get($department, $course_id) == NULL) $semester->class_add(new Course($department . '-' . $course_id, $title));