* * This file is a part of slate_permutate.
 *
 * slate_permutate is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * slate_permutate is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * \file
 *
 * A crawler for the WebAdvisor webapp.
 */

$incdir = dirname(__FILE__) . DIRECTORY_SEPARATOR;
require_once $incdir . 'class.semester.inc';
require_once $incdir . 'class.course.inc';
require_once $incdir . 'class.section.php';
require_once $incdir . 'class.section_meeting.inc';

/**
 * Query string appended to a school's WebAdvisor URL to start the
 * ST-WESTS12A sequence.
 */
define('_SCHOOL_CRAWL_WEBADVISOR_START_FORM', '?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL');

/**
 * \brief
 *   Fill in the default settings shared by the WebAdvisor crawling
 *   functions.
 *
 * Values already present in either array are left untouched.
 *
 * \param $school
 *   The school's info array. If it has no 'webadvisor_url' key, one
 *   is added by appending 'WebAdvisor' to $school['url'].
 * \param $options
 *   The crawler options. Missing keys are defaulted:
 *   'season_mapper' to 'school_crawl_webadvisor_season_mapper' and
 *   'curlsetup_hook' to NULL.
 */
function _school_crawl_webadvisor_common_prep(array &$school, array &$options)
{
  /*
   * Only touch $school['url'] when the default is actually needed, so
   * that a school which supplies 'webadvisor_url' but no 'url' does
   * not trigger an undefined-index warning. array_key_exists() (not
   * isset()) keeps the same "key present wins" semantics as the
   * array union operator.
   */
  if (!array_key_exists('webadvisor_url', $school))
    {
      $school['webadvisor_url'] = $school['url'] . 'WebAdvisor';
    }

  $options += array(
    'season_mapper' => 'school_crawl_webadvisor_season_mapper',
    'curlsetup_hook' => NULL,
  );
}

/**
 * \brief
 *   Crawl the list of semesters available from a
 *   WebAdvisor-compatible school.
 *
 * \param $school
 *   The school’s info array/handle.
 * \param $semesters
 *   The array to populate with various semesters available at this
 *   college.
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \param $options
 *   Optional settings array. The 'season_mapper' entry is a
 *   function($term, $term_value, $school_crawl_log) which maps term
 *   names onto semester/season names. See
 *   school_crawl_webadvisor_season_mapper() for the default
 *   implementation.
 * \return
 *   0 on success, 1 if the list of semesters could not be loaded.
*/
function school_crawl_webadvisor_semester_list(array $school, array &$semesters, &$school_crawl_log, array $options = array())
{
  _school_crawl_webadvisor_common_prep($school, $options);

  $cookies = array();
  $uri = $school['webadvisor_url'] . _SCHOOL_CRAWL_WEBADVISOR_START_FORM;
  $semesters_html = school_crawl_webadvisor_geturi($uri, $cookies, $school_crawl_log, $options);

  /*
   * DOMDocument::loadHTML() rejects an empty document (it throws a
   * ValueError on PHP 8), so treat a failed/empty fetch the same way
   * as a page that lacks the semester selector.
   */
  if (!is_string($semesters_html) || $semesters_html === '')
    {
      school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters.");
      return 1;
    }

  $semesters_dom = new DOMDocument();
  $semesters_dom->loadHTML($semesters_html);

  /*
   * Discover the available semesters
   */
  $semesters_var1 = $semesters_dom->getElementById('VAR1');
  if (empty($semesters_var1))
    {
      school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters.");
      return 1;
    }

  foreach ($semesters_var1->childNodes as $semester_node)
    {
      /*
       * childNodes also contains text/comment nodes (e.g. the
       * whitespace between <option> tags) which have neither a
       * tagName nor attributes; only inspect real elements which
       * carry a non-empty value.
       */
      if (!($semester_node instanceof DOMElement)
          || $semester_node->tagName != 'option'
          || !$semester_node->hasAttribute('value')
          || !strlen($semester_node->getAttribute('value')))
        {
          continue;
        }

      $term = $semester_node->textContent;
      $term_value = $semester_node->getAttribute('value');
      $semester = $options['season_mapper']($term, $term_value, $school_crawl_log);

      /* The mapper yields an empty value for terms it cannot map. */
      if (empty($semester))
        {
          continue;
        }

      /*
       * We need a way to map a semester back to a list of
       * term_values. We can tack an extra member variable onto any
       * object in PHP, so we use that method.
       */
      $semester_id = $semester->id();
      if (empty($semesters[$semester_id]))
        {
          $semester->_school_crawl_webadvisor_term_values = array($term_value);
          $semesters[$semester_id] = $semester;
        }
      else
        {
          /*
           * A semester associated with this year/season already
           * exists. Append an additional term value to be associated
           * with this Semester so that they can be aggregated when
           * crawled later.
           */
          $semesters[$semester_id]->_school_crawl_webadvisor_term_values[] = $term_value;
        }
    }

  return 0;
}

/**
 * \brief
 *   Fetch a WebAdvisor page, acquiring a TOKENIDX first if needed.
 *
 * \param $uri
 *   The URI to fetch. Updated in place: TOKENIDX=NULL is appended if
 *   the URI has no TOKENIDX, and if WebAdvisor hands out a new token
 *   the CLONE/FORCEIDX parameters are stripped and TOKENIDX is
 *   replaced with that token.
 * \param $cookies
 *   The cookie array passed through to school_crawl_geturi().
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \param $options
 *   Crawler options; the optional 'curlsetup_hook' entry is passed
 *   through to school_crawl_geturi().
 * \return
 *   The fetched page after being passed through
 *   school_crawl_webadvisor_noscript_filter().
 */
function school_crawl_webadvisor_geturi(&$uri, array &$cookies, &$school_crawl_log, array $options)
{
  /*
   * Callers which did not run their options through
   * _school_crawl_webadvisor_common_prep() may lack this key; fall
   * back to "no hook" instead of raising an undefined-index warning.
   */
  $curlsetup_hook = isset($options['curlsetup_hook']) ? $options['curlsetup_hook'] : NULL;

  /**
   * We have to handle the case where the user is first browsing to
   * WebAdvisor. For example, with the ST-WESTS12A sequence:
   *
   * Start the ST-WESTS12A sequence.
   *
   * 1. WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL
   *
   *    Calls javascript:getWindowHTML(). This merely adds
   *    TOKENIDX=NULL to the query string, so we can skip this step
   *    and just have TOKENIDX=NULL.
   *
   * 2. WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL
   *
   * 3. WebAdvisor?type=P&pid=ST-WESTS12A&TOKENIDX=7699844013
   *
   *    In #2, the second argument to setWindowHTML() is random. Thus,
   *    we have to capture this value and set it as GET parameter
   *    named “TOKENIDX”.
   */
  if (strpos($uri, 'TOKENIDX') === FALSE)
    {
      $separator = (strpos($uri, '?') === FALSE) ? '?' : '&';
      /* Starting value. */
      $uri .= $separator . 'TOKENIDX=NULL';
    }

  $html = school_crawl_webadvisor_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, FALSE, $curlsetup_hook));

  if (!preg_match('/setWindowHTML\\(\'\', \'([0-9]+)\'\\);/', $html, $matches))
    {
      /*
       * The user already had a valid TOKENIDX, so we’re good to go.
       */
      return $html;
    }

  $token = $matches[1];
  school_crawl_logf($school_crawl_log, 7, "Using WebAdvisor TOKENIDX=%s.", $token);

  /*
   * setWindowHTML() will first remove the query string parameters
   * 'CLONE' and 'FORCEIDX'. Then it appends TOKENIDX=<token> to the
   * query parameters.
   *
   * Example, where TOKENIDX does not start out as NULL but where a
   * CLONE=Y command is being sent:
   *
   * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=1507971558
   *
   * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=2281086932
   */
  $uri = preg_replace('/([?&])TOKENIDX=[^&]+/', '$1TOKENIDX=' . $token,
                      preg_replace('/([?&])(CLONE|FORCEIDX)=[^&]+&?/', '$1', $uri));

  return school_crawl_webadvisor_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, FALSE, $curlsetup_hook));
}

/**
 * \brief
 *   Searches for and removes a