/*
 * This file is a part of slate_permutate.
 *
 * slate_permutate is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * slate_permutate is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 */


/**
 * \file
 *
 * A crawler for the WebAdvisor webapp.
 */

$incdir = dirname(__FILE__) . DIRECTORY_SEPARATOR;
require_once $incdir . 'class.semester.inc';
require_once $incdir . 'class.course.inc';
require_once $incdir . 'class.section.php';
require_once $incdir . 'class.section_meeting.inc';

/**
 * The query string, appended to a school's WebAdvisor URL, which
 * starts the ST-WESTS12A sequence. The page it leads to is the one
 * from which school_crawl_webadvisor_semester_list() reads the
 * available terms.
 */
define('_SCHOOL_CRAWL_WEBADVISOR_START_FORM', '?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL');

/**
 * \brief
 *   Fill in the defaults shared by the WebAdvisor crawling functions.
 *
 * \param $school
 *   The school's info array/handle. If it has no 'webadvisor_url'
 *   key, one is derived by appending 'WebAdvisor' to $school['url'].
 * \param $options
 *   The caller's options. The keys 'season_mapper' and
 *   'curlsetup_hook' are set to their defaults if absent; keys which
 *   are already present are left untouched.
 */
function _school_crawl_webadvisor_common_prep(array &$school, array &$options)
{
  /*
   * Only derive the default when it is actually needed, so that a
   * school which supplies 'webadvisor_url' is not also required to
   * have a 'url' (reading a missing key raises a notice).
   */
  if (!array_key_exists('webadvisor_url', $school))
    $school['webadvisor_url'] = $school['url'] . 'WebAdvisor';

  $options += array(
    'season_mapper' => 'school_crawl_webadvisor_season_mapper',
    'curlsetup_hook' => NULL,
  );
}

/**
 * \brief
 *   Crawl the list of semesters available from a
 *   WebAdvisor-compatible school.
 *
 * \param $school
 *   The school's info array/handle.
 * \param $semesters
 *   The array to populate with various semesters available at this
 *   college. It is keyed by each semester's id().
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \param $options
 *   An optional array of settings:
 *   - 'season_mapper': A function($term, $term_value,
 *     $school_crawl_log) which maps term names onto semester/season
 *     names. See school_crawl_webadvisor_season_mapper() for the
 *     default implementation.
 *   - 'curlsetup_hook': Handed on to school_crawl_geturi(); defaults
 *     to NULL.
 * \return
 *   0 on success, 1 if the list of semesters could not be loaded.
 */
function school_crawl_webadvisor_semester_list(array $school, array &$semesters, &$school_crawl_log, array $options = array())
{
  _school_crawl_webadvisor_common_prep($school, $options);

  $cookies = array();
  $uri = $school['webadvisor_url'] . _SCHOOL_CRAWL_WEBADVISOR_START_FORM;
  $semesters_html = school_crawl_webadvisor_geturi($uri, $cookies, $school_crawl_log, $options);

  /*
   * Discover the available semesters. DOMDocument::loadHTML() must
   * not be handed an empty document (it warns on older PHP and
   * throws on PHP 8), so a failed fetch is sent straight to the
   * error path below.
   */
  $semesters_var1 = NULL;
  if (is_string($semesters_html) && strlen($semesters_html))
    {
      $semesters_dom = new DOMDocument();
      $semesters_dom->loadHTML($semesters_html);
      $semesters_var1 = $semesters_dom->getElementById('VAR1');
    }
  if (empty($semesters_var1))
    {
      school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters.");
      return 1;
    }

  foreach ($semesters_var1->childNodes as $semester_node)
    {
      /*
       * childNodes also contains the text (whitespace) and comment
       * nodes between the options. Those have no tagName, so they
       * must be skipped before any element-only member is touched.
       */
      if (!($semester_node instanceof DOMElement))
        continue;

      if ($semester_node->tagName != 'option'
          || !$semester_node->hasAttribute('value')
          || !strlen($semester_node->getAttribute('value')))
        continue;

      $term = $semester_node->textContent;
      $term_value = $semester_node->getAttribute('value');
      $semester = $options['season_mapper']($term, $term_value, $school_crawl_log);

      /* The mapper returns an empty value for terms to be ignored. */
      if (empty($semester))
        continue;

      /*
       * We need a way to map a semester back to a list of
       * term_values. We can tack an extra member variable onto any
       * object in PHP, so we use that method.
       *
       * NOTE(review): creating dynamic properties is deprecated as
       * of PHP 8.2 unless the class opts in; confirm against the
       * Semester class.
       */
      if (empty($semesters[$semester->id()]))
        {
          $semester->_school_crawl_webadvisor_term_values = array($term_value);
          $semesters[$semester->id()] = $semester;
        }
      else
        {
          /*
           * A semester associated with this year/season already
           * exists. Append an additional term value to be associated
           * with this Semester so that they can be aggregated when
           * crawled later.
           */
          $semesters[$semester->id()]->_school_crawl_webadvisor_term_values[] = $term_value;
        }
    }

  return 0;
}

/**
 * \brief
 *   Fetch a WebAdvisor page, acquiring a TOKENIDX if necessary.
 *
 * \param $uri
 *   The URI to fetch. It is updated in place: TOKENIDX=NULL is
 *   appended if the URI has no TOKENIDX parameter and, if WebAdvisor
 *   hands out a token, the CLONE and FORCEIDX parameters are removed
 *   and TOKENIDX is set to the new token.
 * \param $cookies
 *   The cookie jar which is passed on to school_crawl_geturi().
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \param $options
 *   An array of settings. Only 'curlsetup_hook' is read here; it is
 *   passed on to school_crawl_geturi() and treated as NULL if absent.
 * \return
 *   The fetched page after it has been passed through
 *   school_crawl_webadvisor_noscript_filter().
 */
function school_crawl_webadvisor_geturi(&$uri, array &$cookies, &$school_crawl_log, array $options)
{
  /*
   * This function may be called directly, without
   * _school_crawl_webadvisor_common_prep() having filled in the
   * defaults, so do not assume that the key exists.
   */
  $curlsetup_hook = isset($options['curlsetup_hook']) ? $options['curlsetup_hook'] : NULL;

  /*
   * We have to handle the case where the user is first browsing to
   * WebAdvisor. For example, with the ST-WESTS12A sequence:
   *
   * Start the ST-WESTS12A sequence.
   *
   * 1. WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL
   *
   *    Calls javascript:getWindowHTML(). This merely adds
   *    TOKENIDX=NULL to the query string, so we can skip this step
   *    and just have TOKENIDX=NULL.
   *
   * 2. WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL
   *
   *    The response contains a call of the form
   *    setWindowHTML('', '7699844013');
   *
   * 3. WebAdvisor?type=P&pid=ST-WESTS12A&TOKENIDX=7699844013
   *
   *    In #2, the second argument to setWindowHTML() is random.
   *    Thus, we have to capture this value and set it as the GET
   *    parameter named "TOKENIDX".
   */
  if (strpos($uri, 'TOKENIDX') === FALSE)
    {
      if (strpos($uri, '?') === FALSE)
        $uri .= '?';
      else
        $uri .= '&';

      /* Starting value. */
      $uri .= 'TOKENIDX=NULL';
    }

  $html = school_crawl_webadvisor_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, FALSE, $curlsetup_hook));

  if (!preg_match('/setWindowHTML\\(\'\', \'([0-9]+)\'\\);/', $html, $matches))
    /*
     * The user already had a valid TOKENIDX, so we're good to go.
     */
    return $html;

  $token = $matches[1];
  school_crawl_logf($school_crawl_log, 7, "Using WebAdvisor TOKENIDX=%s.", $token);

  /*
   * setWindowHTML() will first remove the query string parameters
   * 'CLONE' and 'FORCEIDX'. Then it appends TOKENIDX=<token> to the
   * query parameters.
   *
   * Example, where TOKENIDX does not start out as NULL but where a
   * CLONE=Y command is being sent:
   *
   * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=1507971558
   *
   * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=2281086932
   */
  $uri = preg_replace('/([?&])TOKENIDX=[^&]+/', '$1TOKENIDX=' . $token,
                      preg_replace('/([?&])(CLONE|FORCEIDX)=[^&]+&?/', '$1', $uri));

  /* Fetch again, this time with the token WebAdvisor handed out. */
  return school_crawl_webadvisor_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, FALSE, $curlsetup_hook));
}