<?php
/*
 * This file is a part of slate_permutate.
 *
 * slate_permutate is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * slate_permutate is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * \brief
 *   Crawl Calvin's registration course listing pages.
 *
 * Fetches a WebAdvisor session token, submits the section search form
 * with a blank department and course level for the requested semester
 * and then walks every page of results, adding each parseable section
 * to $semester.
 *
 * \param $semester
 *   The Semester object which I should populate.
 * \param $verbosity
 *   How verbose I should be. Sensible range is from 0 through 10.
 * \return
 *   0 on success; 1 if the session token or the search form could not
 *   be retrieved.
 */
function calvin_crawl(Semester $semester, $verbosity = 1)
{
    /* collect a few basic stats */
    $skipped_sections = array(
        'incomplete meeting info' => 0,
        'invalid meeting info format' => 0,
    );

    /*
     * The first link we start at is the one from KV into WebAdvisor.
     *
     * 1. $baseuri (below).
     *
     *    That page calls javascript:getWindowHTML(). This merely adds
     *    TOKENIDX=NULL to the query string, so we can skip this step
     *    and just have TOKENIDX=NULL.
     *
     * 2. $baseuri . '&TOKENIDX=NULL'
     *
     *    That page calls setWindowHTML('', '<digits>'). The second
     *    argument to setWindowHTML() is random. Thus, we have to
     *    capture this value.
     */
    $cookies = array();
    $baseuri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';

    $token_uri = $baseuri . '&TOKENIDX=NULL';
    $token_html = calvin_crawl_noscript_filter(geturi($token_uri, $cookies));
    $token = calvin_crawl_token_extract($token_html);
    if ($token === NULL) {
        fprintf(STDERR, "Could not steal the token\n");
        return 1;
    }
    if ($verbosity > 5) {
        echo 'token: ' . $token . "\n";
        echo "\n";
    }

    /*
     * here we have arrived at the main webadvisor screen which lists the
     * search form. From here we read the list of semesters Calvin knows
     * about and the form's RETURN.URL. (The same form also carries
     * LIST_VAR1_1 and LIST_VAR1_2 selects; they are not read because
     * the query below leaves department and course level blank.)
     */
    $uri = $baseuri . '&TOKENIDX=' . $token;
    $departments_html = (string)calvin_crawl_noscript_filter(geturi($uri, $cookies));
    if (!strlen($departments_html)) {
        /* DOMDocument::loadHTML() refuses an empty document. */
        fprintf(STDERR, "Could not retrieve the WebAdvisor search form\n");
        return 1;
    }
    $departments_dom = new DOMDocument();
    $departments_dom->loadHTML($departments_html);

    /* Discover the available semesters */
    $semester_strs = calvin_crawl_select_options($departments_dom, 'VAR1');

    $return_url = dom_input_value($departments_dom, 'RETURN.URL');

    /* ARCT only has >=200 level courses */
    $dept = '';
    $course_level = '';

    /* e.g. '12/SP' or '12/FA': two-digit year, slash, season code. */
    $semester_str = substr($semester->year_get(), 2) . '/';
    switch ($semester->season_get()) {
        case Semester::SEASON_SPRING:
            $semester_str .= 'SP';
            break;
        case Semester::SEASON_FALL:
            $semester_str .= 'FA';
            break;
    }
    if (!isset($semester_strs[$semester_str])) {
        /* Only a warning: the query is still attempted. */
        error_log('Couldn\'t find a semester in Calvin\'s database for ' . $semester_str . ' (' . $semester->season_get() . ', ' . $semester->year_get() . ')');
    }

    /*
     * LIST.VARx_y: x is the column, y is the row. There are
     * apparently a max of 5 rows (see the LIST.VARx_MAX below).
     *
     * Columns:
     * LIST.VAR1: department
     * LIST.VAR2: course_level
     * LIST.VAR3: IIRC, a course identifier, such as 156 from MATH-156
     * LIST.VAR4: I forget
     */
    $form = array(
        'VAR1' => $semester_str,
        'LIST.VAR1_1' => $dept,
        'LIST.VAR2_1' => $course_level,
    );

    /*
     * other form items we're not querying but which need to be
     * sent blankly
     */
    $form += array(
        'RETURN.URL' => $return_url,
        'SUBMIT_OPTIONS' => '',
        /*
         * The submit button's own key ('SUBMIT2' => 'SUBMIT') is
         * apparently sent with the form by a browser, but it is
         * deliberately not sent here.
         */
        'DATE.VAR1' => '',
        'DATE.VAR2' => '',
        'LIST.VAR1_CONTROLLER' => 'LIST.VAR1',
        'LIST.VAR1_MEMBERS' => 'LIST.VAR1*LIST.VAR2*LIST.VAR3*LIST.VAR4',
    );
    foreach (array('1', '2', '3', '4') as $list_col) {
        $colname = 'LIST.VAR' . $list_col;
        if (!isset($form[$colname . '_MAX'])) {
            $form[$colname . '_MAX'] = '5';
        }

        foreach (array('1', '2', '3', '4', '5') as $list_row) {
            $rowname = $colname . '_' . $list_row;
            if (!isset($form[$rowname])) {
                $form[$rowname] = '';
            }
        }
    }

    /*
     * VAR7 and VAR8 are a constraint of times during which
     * courses meet
     */
    $form['VAR7'] = '';
    $form['VAR8'] = '';

    /* ``course title keywords'' */
    $form['VAR3'] = '';

    /* ? */
    $form['VAR6'] = '';
    $form['VAR21'] = '';

    /* instructor's last name */
    $form['VAR9'] = '';

    /*
     * VAR10 through VAR16 are Monday through Sunday checkboxes
     * for days of the week that classes meet. They are left out of
     * the form entirely so that they do not act as a constraint.
     */

    /*
     * $pages is overwritten by preg_match() after each page is
     * crawled: [1] is the current page, [2] the total page count.
     * The initial value just gets us into the loop.
     */
    $pages = array(1 => 0, 2 => 1);
    while ($pages[1] < $pages[2]) {
        $html = (string)calvin_crawl_noscript_filter(geturi($uri, $cookies, $form));
        if (!strlen($html)) {
            /* DOMDocument::loadHTML() refuses an empty document. */
            error_log('Calvin returned an empty result page; giving up on the remaining pages');
            break;
        }

        $results_dom = new DOMDocument();
        $results_dom->loadHTML($html);

        /* Rows are numbered from 1; stop at the first row with no data. */
        for ($list_row = 1; ; $list_row ++) {
            /* either 'Open' (or 'Closed'?) */
            $openness = (string)dom_input_value($results_dom, 'LIST.VAR1_' . $list_row);
            $sec_short_title = (string)dom_id_content($results_dom, 'SEC_SHORT_TITLE_' . $list_row);
            $sec_meeting_info = (string)dom_id_content($results_dom, 'SEC_MEETING_INFO_' . $list_row);

            /* check if we're done with this particular page */
            if (!strlen($openness) && !strlen($sec_short_title) && !strlen($sec_meeting_info)) {
                break;
            }

            /*
             * the same info below should be gettable with
             * dom_id_content($results_dom, 'SEC_FACULTY_INFO_' . $list_row);
             */
            $faculty_name = dom_input_value($results_dom, 'SEC.FACULTY.INFO_' . $list_row);
            $credits = dom_input_value($results_dom, 'SEC.MIN.CRED_' . $list_row);
            /* or name="SEC.COMMENTS_$list_row" */
            $comment = dom_id_content($results_dom, 'SEC_COMMENTS_' . $list_row);

            /* parse */
            $section_id = Section::parse($sec_short_title);
            /* The synonym is the parenthesized number in the short title. */
            $synonym = NULL;
            if (preg_match(';\(([0-9]+)\);', $sec_short_title, $matches)) {
                $synonym = $matches[1];
            }

            if ($verbosity > 6) {
                echo "\n";
                echo implode('-', $section_id) . ': ' . $sec_short_title . "\n";
                echo $openness . "\n";
                echo $sec_meeting_info . "\n";
                echo $faculty_name . "\n";
                echo $credits . "\n";
                echo $comment . "\n";
            }

            /*
             * Sections whose days or times have not been announced
             * cannot be scheduled, so we just ignore them.
             */
            if (strpos($sec_meeting_info, 'Times to be Announced') !== FALSE
                || strpos($sec_meeting_info, 'Days to be Announced') !== FALSE) {
                if ($verbosity > 2) {
                    error_log('Skipping class because of incomplete meeting time information: ' . implode('-', $section_id) . ' has meeting info of `' . $sec_meeting_info . '\'');
                }
                $skipped_sections['incomplete meeting info'] ++;
                continue;
            }

            $meeting = calvin_crawl_meeting_info_split($sec_meeting_info);
            if ($meeting === NULL) {
                error_log('Unable to parse calvin section meeting info string into start/end/days information for ' . implode('-', $section_id) . ': ``' . $sec_meeting_info . '\'\'');
                $skipped_sections['invalid meeting info format'] ++;
                continue;
            }

            $date_start = $meeting['date_start'];
            $date_end = $meeting['date_end'];
            /* e.g., 'Lecture', 'Practicum' */
            $meeting_type = $meeting['meeting_type'];
            $days = school_crawl_days_format($meeting['days']);
            /*
             * NOTE(review): strptime() is deprecated as of PHP 8.1. It is
             * kept because school_crawl_time_format() is defined elsewhere
             * and is handed strptime()'s result directly.
             */
            $time_start = school_crawl_time_format(strptime($meeting['time_start'], '%I:%M%p'));
            $time_end = school_crawl_time_format(strptime($meeting['time_end'], '%I:%M%p'));
            $meeting_place = $meeting['meeting_place'];

            if ($verbosity > 5) {
                $dump = array(
                    'date_start' => $date_start,
                    'date_end' => $date_end,
                    'meeting_type' => $meeting_type,
                    'days' => $days,
                    'time_start' => $time_start,
                    'time_end' => $time_end,
                    'meeting_place' => $meeting_place,
                );
                foreach ($dump as $label => $val) {
                    echo $label . ':' . (is_array($val) ? implode(',', $val) : $val) . "\n";
                }
            }

            $section = new Section(
                $section_id['section'],
                array(new SectionMeeting($days, $time_start, $time_end, $meeting_place)),
                $synonym,
                $faculty_name
            );
            $semester->section_add($section_id['department'], $section_id['course'], $section);
        }

        /*
         * The pager text reads ``Page N of M''. Nothing is required
         * after the second number so that surrounding markup cannot
         * defeat the match.
         */
        if (!preg_match(';Page ([0-9]+) of ([0-9]+);', $html, $pages)) {
            error_log('Unable to determine the number of pages in this Calvin resultset');
            break;
        }

        if ($verbosity > 0) {
            echo 'calvin_crawl(): finished page ' . $pages[1] . ' of ' . $pages[2] . ' with ' . ($list_row - 1) . " courses.\n";
        }

        /* Every request after the first just asks for the next page. */
        $form = array(
            'ACTION*Grp:WSS.COURSE.SECTIONS' => 'NEXT',
        );
    }

    /* Summarize skipped sections; the heading is logged at most once. */
    $has_stat = FALSE;
    if ($verbosity > 1) {
        foreach ($skipped_sections as $reason => $num) {
            if (!$num) {
                continue;
            }
            if (!$has_stat) {
                error_log('Skipped some sections for <reason>: <number skipped>:');
                $has_stat = TRUE;
            }
            error_log($reason . ': ' . $num);
        }
    }

    return 0;
}

/**
 * \brief
 *   Pull the WebAdvisor session token out of a page.
 *
 * Looks for a call of the form setWindowHTML('', '<digits>'); and
 * returns the digits.
 *
 * \param $html
 *   The HTML (or script) text to search.
 * \return
 *   The token as a string of digits or NULL if no such call is found.
 */
function calvin_crawl_token_extract($html)
{
    if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', (string)$html, $matches)) {
        return NULL;
    }
    return $matches[1];
}

/**
 * \brief
 *   Read the options of a select element into an array.
 *
 * Only direct child option elements which carry a value attribute are
 * considered.
 *
 * \param $domdocument
 *   The DOMDocument to search.
 * \param $id
 *   The id attribute of the select element.
 * \return
 *   An array mapping each option's value attribute to its text. Empty
 *   if there is no element with the given id.
 */
function calvin_crawl_select_options($domdocument, $id)
{
    $options = array();

    $select = $domdocument->getElementById($id);
    if (!$select) {
        return $options;
    }

    foreach ($select->childNodes as $node) {
        /* Skip whitespace text nodes and anything that is not a usable option. */
        if (!($node instanceof DOMElement)
            || $node->tagName != 'option'
            || !$node->hasAttribute('value')) {
            continue;
        }
        $options[$node->getAttribute('value')] = $node->nodeValue;
    }

    return $options;
}

/**
 * \brief
 *   Split one of Calvin's meeting info strings into its fields.
 *
 * The input format for this is, thankfully, pretty rigid :-D. Example
 * inputs:
 *
 * '01/31/2011-05/11/2011 Lecture Monday, Wednesday 01:00PM - 03:50PM, Spoelhof Center, Room 101'
 *
 * '01/31/2011-05/12/2011 Music Ensemble Monday, Wednesday, Thursday, Friday 03:30PM - 04:20PM, Covenant Fine Arts Center, Room 135'
 *
 * In the last case we have to be careful about not treating Ensemble
 * as a day. The tokens are only grouped here, not interpreted.
 *
 * \param $sec_meeting_info
 *   The meeting info string.
 * \return
 *   NULL if the string does not have the expected format. Otherwise
 *   an array with the keys date_start, date_end, meeting_type (e.g.
 *   'Lecture'), days (an array of day names), time_start, time_end
 *   and meeting_place, all as unparsed strings.
 */
function calvin_crawl_meeting_info_split($sec_meeting_info)
{
    $pattern = ';^([0-9]{2}/[0-9]{2}/[0-9]{4})-([0-9]{2}/[0-9]{2}/[0-9]{4}) (([^ ,]+ )+)([^0-9]+) ([^ ]+) - ([^ ]+), (.*)$;';
    if (!preg_match($pattern, (string)$sec_meeting_info, $matches)) {
        return NULL;
    }

    return array(
        'date_start' => $matches[1],
        'date_end' => $matches[2],
        /* The type group ends with the space that separates it from the days. */
        'meeting_type' => rtrim($matches[3]),
        'days' => explode(', ', $matches[5]),
        'time_start' => $matches[6],
        'time_end' => $matches[7],
        'meeting_place' => $matches[8],
    );
}

/**
 * \brief
 *   Simulate some aspects of a web browser while retrieving a
 *   document.
 *
 * This allows us to view our cookies in an associative array and to
 * have the server's response automatically update our cookies.
 *
 * If $post is specified as a non-empty associative array, the data is
 * encoded as if we were performing a form submission and sent as the
 * request body.
 *
 * Follows up to six redirects; a redirected request is always sent
 * without the POST data. If there is a redirect, the page from which
 * you are redirected is lost... but few people put any information on
 * those pages anyways ;-).
 *
 * \param $uri
 *   The URL to fetch. If a redirect occurs, this is updated.
 * \param $cookies
 *   An associative array of cookies and where to save new cookies.
 * \param $post
 *   If not NULL, causes an HTTP POST. In that case, should be an
 *   associative array of form keys/values.
 * \param $verbosity
 *   How verbose to be.
 * \param $loopspin
 *   An internal variable to prevent us from following perpetual
 *   redirects.
 * \return
 *   The body of the document returned by the server (normally
 *   malformed HTML, especially with Calvin's WebAdvisor
 *   installation). Empty if the transfer failed.
 */
function geturi(&$uri, &$cookies, $post = NULL, $verbosity = 0, $loopspin = 0)
{
    global $geturi_write_buf, $geturi_headers_buf, $geturi_verbosity;

    if ($verbosity > 5) {
        echo "\n";
        echo 'geturi(' . $uri . ")\n";
        echo "\n";
    }

    $curl = curl_init();

    /* The cURL callbacks append to these globals; start them empty. */
    $geturi_verbosity = $verbosity;
    $geturi_write_buf = '';
    $geturi_headers_buf = '';
    curl_setopt($curl, CURLOPT_URL, $uri);

    $cookie_pairs = array();
    foreach ($cookies as $key => $val) {
        $cookie_pairs[] = $key . '=' . $val;
    }
    $cookies_str = implode(';', $cookie_pairs);

    if ($verbosity > 8) {
        echo 'cookies sent: ' . $cookies_str . "\n";
    }
    curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
    curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'geturi_header_cb');
    curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'geturi_write_cb');

    if (is_array($post) && count($post)) {
        $post_pairs = array();
        foreach ($post as $postkey => $postval) {
            /* Values of keys containing 'MEMBER' are deliberately sent unencoded. */
            $post_pairs[] = urlencode($postkey) . '='
                . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
        }
        $posttxt = implode('&', $post_pairs);
        if ($verbosity > 8) {
            echo 'setting POST to ' . $posttxt . "\n";
        }

        curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
    }

    if (curl_exec($curl) === FALSE) {
        error_log('geturi(): unable to fetch ' . $uri . ': ' . curl_error($curl));
    }
    curl_close($curl);

    $location = NULL;
    foreach (explode("\r\n", $geturi_headers_buf) as $header) {
        /*
         * yes, we don't want the line if the first char is a ':' or
         * if it has no ':'
         */
        if (!strpos($header, ':')) {
            continue;
        }
        $header_parts = explode(':', $header, 2);
        $header_name = $header_parts[0];
        $header_val = trim($header_parts[1]);

        if ($verbosity > 8) {
            echo $header_name . ' : ' . $header_val . "\n";
        }

        /* Header names are case-insensitive. */
        switch (strtolower($header_name)) {
            case 'set-cookie':
                if (strpos($header_val, '=') === FALSE) {
                    break;
                }
                /*
                 * Everything after the first '=' is stored as the
                 * value, including any cookie attributes.
                 */
                list($cookie_name, $cookie_val) = explode('=', $header_val, 2);
                if ($verbosity > 9) {
                    if (isset($cookies[$cookie_name])) {
                        echo 'Replacing cookie ' . $cookie_name . '=' . $cookies[$cookie_name] . ' with ';
                    }
                    echo 'new cookie ' . $cookie_name . '=' . $cookie_val . "\n";
                }
                $cookies[$cookie_name] = $cookie_val;
                break;

            case 'location':
                /* Put the walive/ path component back in front of WebAdvisor. */
                $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $header_val);
                $post = NULL;
                break;
        }
    }

    if ($verbosity > 9) {
        echo $geturi_write_buf;
    }
    if ($location && $loopspin < 6) {
        $uri = $location;
        return geturi($uri, $cookies, $post, $verbosity, $loopspin + 1);
    }
    return $geturi_write_buf;
}

/**
 * \brief
 *   cURL header callback for geturi(): appends the received header
 *   data to the global $geturi_headers_buf.
 *
 * \return
 *   The number of bytes handled.
 */
function geturi_header_cb($curl, $header_buf)
{
    global $geturi_headers_buf;
    $geturi_headers_buf .= $header_buf;
    return strlen($header_buf);
}

/**
 * \brief
 *   cURL write callback for geturi(): appends the received body data
 *   to the global $geturi_write_buf.
 *
 * \return
 *   The number of bytes handled.
 */
function geturi_write_cb($curl, $write_buf)
{
    global $geturi_write_buf;
    $geturi_write_buf .= $write_buf;
    return strlen($write_buf);
}

/**
 * \brief
 *   Find an input element and return its value attribute.
 *
 * \param $domdocument
 *   The DOMDocument to search.
 * \param $name
 *   The name attribute of the input element. It is placed into an
 *   XPath expression inside double quotes, so it must not itself
 *   contain a double quote.
 * \return
 *   The value attribute of the first matching input element or NULL
 *   if there is no such element or it has no value attribute.
 */
function dom_input_value($domdocument, $name)
{
    $xpath = new DOMXPath($domdocument);
    $matching_inputs = $xpath->query('/descendant::input[attribute::name="' . $name . '"]');

    if (!$matching_inputs->length) {
        return NULL;
    }

    $first_input = $matching_inputs->item(0);
    return $first_input->hasAttribute('value')
        ? $first_input->getAttribute('value')
        : NULL;
}

/**
 * \brief
 *   Returns the content of an element with the given ID.
 *
 * A convenience function.
 *
 * \param $domdocument
 *   A DOMDocument to search.
 * \param $id
 *   The id attribute of the element whose content is requested.
 * \return
 *   The nodeValue of the given element or NULL if the element isn't
 *   found.
 */
function dom_id_content($domdocument, $id)
{
    $node = $domdocument->getElementById($id);
    if (!$node) {
        return NULL;
    }
    return $node->nodeValue;
}

/**
 * \brief
 *   Searches for and removes <noscript>...</noscript> elements.
 *
 * Calvin's pages are passed through this before being handed to DOM;
 * the original note says a noscript block in a doc's <head /> is
 * invalid enough that DOM can't handle it. Every plain <noscript>
 * element is removed, wherever it appears.
 *
 * \param $html
 *   The input HTML to filter.
 * \return
 *   The fixed HTML.
 */
function calvin_crawl_noscript_filter($html)
{
    return preg_replace(';\<(noscript)\>.*?\</\1\>;s', '', $html);
}