# HG changeset patch # User Nathan Phillip Brink # Date 2010-10-16 11:57:07 # Node ID 6cb196f112d9fe514cf6769f12d85b5099619a60 # Parent 18c6d2ea6fe7dc25c799c4ef50cb17e4514301d4 A school website-crawling infrastructure. Supports crawling Calvin's website and producing JSON for jqueryui's autocomplete functionality. Also creates a JSON description of the list sections for each course, awaiting JS-support for AJAX section autocreation. diff --git a/admin/rehash.php b/admin/rehash.php --- a/admin/rehash.php +++ b/admin/rehash.php @@ -26,11 +26,19 @@ * school listing used for the ``choose your school list''. */ -require_once(dirname(__FILE__) . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'inc' . DIRECTORY_SEPARATOR . 'school.inc'); +$inc_base = dirname(__FILE__) . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'inc' . DIRECTORY_SEPARATOR; +require_once($inc_base . 'school.inc'); +require_once($inc_base . 'school.crawl.inc'); +require_once($inc_base . 'class.semester.inc'); + return main($argc, $argv); function main($argc, $argv) { + $crawl = TRUE; + $crawl_semester_year = '2011'; + $crawl_semester_season = Semester::SEASON_SPRING; + $school_id_list = school_list(); if (!$school_id_list) return 1; @@ -38,13 +46,16 @@ function main($argc, $argv) $schools = array(); foreach ($school_id_list as $school_id) { - $school = school_load($school_id); + $school = school_load($school_id, TRUE); if (!$school) { fprintf(STDERR, "Error loading school with school_id=%s\n", $school_id); return 1; } + + school_crawl($school, $crawl_semester_year, $crawl_semester_season); + $schools[] = $school; } @@ -106,10 +117,6 @@ function school_cmp($school_a, $school_b * Write out the cache file which remembers the list of available * schools. * - * \todo - * If the list of displayed schools is to be sorted, this is the - * place to do it. - * * \param $schools * An array of school handles. 
*/ @@ -117,11 +124,17 @@ function school_cache($schools) { $list_cache = array(); $domain_cache = array(); + + $cache_dir_name = dirname(__FILE__) . DIRECTORY_SEPARATOR . '..' + . DIRECTORY_SEPARATOR . 'cache' . DIRECTORY_SEPARATOR; + $cache_auto_dir_name = $cache_dir_name . 'auto' . DIRECTORY_SEPARATOR; + foreach ($schools as $school) { $list_cache[$school['id']] = array( 'name' => $school['name'], 'url' => $school['url'], + 'crawled' => $school['crawled'], ); foreach ($school['domains'] as $school_domain) { @@ -143,14 +156,54 @@ function school_cache($schools) $domain_part = array_shift($domain_parts); $domain_cache_ptr[$domain_part] = $school['id']; } + + + /* autocomplete stuff -- per school */ + if ($school['crawled']) + { + $semester = $school['crawled_semester']; + + $cache_auto_school_dir_name = $cache_auto_dir_name . $school['id'] . DIRECTORY_SEPARATOR; + if (!is_dir($cache_auto_school_dir_name)) + { + if (!mkdir($cache_auto_school_dir_name, 0777, TRUE)) + error_log('Unable to create needed directory: `' . $cache_auto_dir_name . '\''); + } + + $departments = $semester->departments_get(); + sort($departments); + + $dept_file = fopen($cache_auto_school_dir_name . '-depts', 'wb'); + fwrite($dept_file, serialize($departments)); + fclose($dept_file); + + /* now per-department autocomplete */ + foreach ($departments as $department) + { + $classes = $semester->department_classes_get($department); + $classes_file = fopen($cache_auto_school_dir_name . $department . '.sects', 'wb'); + fwrite($classes_file, serialize($classes)); + fclose($classes_file); + + /* now individual section informations, pre-JSON-ized */ + foreach ($classes as $class) + { + if (!is_dir($cache_auto_school_dir_name . $department)) + mkdir($cache_auto_school_dir_name . $department); + $class_file = fopen($cache_auto_school_dir_name . $department . DIRECTORY_SEPARATOR . 
$class, 'wb'); + fwrite($class_file, json_encode($semester->class_get($department, $class)->to_json_array())); + fclose($class_file); + } + } + } + + } uasort($list_cache, 'school_cmp'); $cache = array('list' => $list_cache, 'domains' => $domain_cache); - - $cache_file_name = dirname(__FILE__) . DIRECTORY_SEPARATOR . '..' - . DIRECTORY_SEPARATOR . 'cache' . DIRECTORY_SEPARATOR . 'schools'; + $cache_file_name = $cache_dir_name . 'schools'; $cache_file = fopen($cache_file_name, 'wb'); if ($cache_file === FALSE) { @@ -163,3 +216,63 @@ function school_cache($schools) return 0; } + +/** + * \brief + * Invoke a school's registration data crawler. + * + * Each school may export registration data on publicly accessible + * websites. Thus, we populate some autocomplete information by + * crawling these pages and storing the information in a special set + * of caches. + * + * Because crawling code can be non-trivial, it should be separated + * from a school's main .inc file. Thus, if a school supports + * crawling, it will have a file called + * school.d/SCHOOL_ID.crawl.inc (SCHOOL_ID being the school's id). In this file, a function called + * SCHOOL_ID_crawl($semester) must be defined. It must accept one + * argument, the Semester object which defines the time of year for + * which courses should be retrieved. It must populate this empty + * Semester object with Classes objects and populate those classes with + * the sections with as much detail as possible. + * + * If the crawling is successful, the 'crawled' key of the + * $school handle is set to TRUE. school_cache() will use this to help indicate that + * a school _has_ autocomplete information, which might affect the + * appearance and JS stuff for the input.php page. + * + * \param $school + * The school which should be checked for crawl functionality and + * crawled. + * \param $semester_year + * The year of the semester for which we should grab data. + * \param $semester_season + * The season of the year of the semester for which we should grab + * data. 
+ */ +function school_crawl(&$school, $semester_year, $semester_season, $verbosity = 1) +{ + $school['crawled'] = FALSE; + + $school_crawl_func = $school['id'] . '_crawl'; + if (!function_exists($school_crawl_func)) + return; + + $semester = new Semester($semester_year, $semester_season); + + if ($verbosity > 0) + fprintf(STDERR, "%s()\n", $school_crawl_func); + $ret = $school_crawl_func($semester, $verbosity); + if ($ret) + { + fprintf(STDERR, "Crawling %s failed: %s() returned nonzero\n", + $school['id'], $school_crawl_func); + fwrite(STDERR, "\n"); + return; + } + $school['crawled'] = TRUE; + $school['crawled_semester'] = $semester; + + if ($verbosity > 0) + fwrite(STDERR, "\n"); +} diff --git a/auto.php b/auto.php new file mode 100644 --- /dev/null +++ b/auto.php @@ -0,0 +1,144 @@ + + * + * This file is a part of slate_permutate. + * + * slate_permutate is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * slate_permutate is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with slate_permutate. If not, see . + */ + +/** + * \file + * This file's purpose is to autocomplete class names for supporting + * the autocomplete JS based off of crawling schools' registration + * websites. This shall only perform the autocompletion of class + * names. + * + * Since we output JSON, no special Page classes and stuff + * :-p. Except we still call the Page class's session_start() + * function because we apparently need sessions.... oh yeah, for + * school profile supports ;-). 
+ */ + +require_once('inc/school.inc'); +require_once('inc/class.page.php'); +require_once('class.class.php'); + +Page::session_start(); + +if (isset($_REQUEST['txt'])) + header('Content-Type: text/plain; charset=utf-8'); +else + header('Content-Type: application/json; charset=utf-8'); + +if (!isset($_REQUEST['term'])) + clean_empty_exit(); + +$getsections = FALSE; +if (isset($_REQUEST['getsections'])) + $getsections = TRUE; + +$term = $_REQUEST['term']; +$term_parts = Classes::parse($term); +if (!count($term_parts)) + clean_empty_exit(); + +$school = school_load_guess(); +if (!$school['crawled']) + clean_empty_exit(); + +$cache_dir = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'cache' . DIRECTORY_SEPARATOR . 'auto' . DIRECTORY_SEPARATOR . $school['id'] . DIRECTORY_SEPARATOR; + +/* + * autocomplete the list of departments. If the user has already + * entered a valid department name _and_ delimited it, however, go on + * to the next autocompletion step. + */ +$term_strlen = strlen($term); +$dept_strlen = strlen($term_parts['department']); +$dept = $term_parts['department']; +if (!$getsections && count($term_parts) == 1 && $term_strlen == strlen($dept)) + { + $dept_file = $cache_dir . '-depts'; + if (!file_exists($dept_file)) + clean_empty_exit(); + $departments = unserialize(file_get_contents($dept_file)); + foreach ($departments as $key => $department) + { + if (!strncmp($department, $dept, $term_strlen)) + $departments[$key] = $department . '-'; + else + unset($departments[$key]); + } + echo json_encode(array_values($departments)); + exit(0); + } + +if ($getsections) + { + $section_file = $cache_dir . $dept . DIRECTORY_SEPARATOR . $term_parts['course']; + if (file_exists($section_file)) + { + readfile($section_file); + exit(0); + } + /* section not found! */ + header('HTTP/1.1 404 Not Found'); + header('Content-Type: text/plain; charset=utf-8'); + echo 'Could not find course ' . implode('-', $term_parts) .
"\n"; + exit(0); + } + +/* + * if a department is fully entered, life gets slightly more + * complicated. I suppose I only want to autocomplete the first digit + * of the course/class number. I.e., CS-2 for CS-262 for when the + * student has entered CS- or 'CS'. But for now we can just dump the entire department at the user ;-). + */ +$classes_file = $cache_dir . $dept . '.sects'; +if (file_exists($classes_file)) + { + $classes = unserialize(file_get_contents($classes_file)); + $class_start = ''; + if (count($term_parts) > 1) + $class_start = $term_parts['course']; + $class_start_strlen = strlen($class_start); + + /* reduce/create resultset */ + $json_classes = array(); + foreach ($classes as $class) + if (!strncmp($class, $class_start, $class_start_strlen)) + { + $json_classes[] = $dept . '-' . $class; + } + + echo json_encode($json_classes); + exit(0); + } + +/** + * Nothing caught.. + */ +echo '["Oops"]'; +exit(0); + +/** + * \brief + * Send an empty JSON array and exit. + */ +function clean_empty_exit() +{ + echo '[]'; + exit(0); +} diff --git a/class.class.php b/class.class.php --- a/class.class.php +++ b/class.class.php @@ -31,7 +31,17 @@ class Classes $this->sections[$this->nsections] = new Section($l, $p, $s, $e, $d); $this->nsections++; } - + + /** + * \brief + * Adds an already-instantiated section to this class. + */ + public function section_add(Section $section) + { + $this->sections[$this->nsections] = $section; + $this->nsections ++; + } + //-------------------------------------------------- // Returns the number of sections in the class. //-------------------------------------------------- @@ -84,4 +94,55 @@ class Classes return $out; } + + /** + * \brief + * Split up a user-friendly course specification into components. + * + * This will only return the 'department' and 'course' components of + * the given course identifier. Otherwise, it acts the same as + * Section::parse. 
+ * + * \see Section::parse() + * + * \param $course_spec + * A course specifier to parse, such as 'cs262' or 'MATH-156'. + * \return + * An array with normalized output having keys of 'department' and + * 'course'. If the user's input has less than these two keys of + * information, the returned array may have zero or one elements. + */ + public static function parse($course_spec) + { + $section_parts = Section::parse($course_spec); + if (isset($section_parts['section'])) + unset($section_parts['section']); + + return $section_parts; + } + + /** + * \brief + * Represent this class as a string. + */ + public function __toString() + { + return $this->getName(); + } + + /** + * \brief + * Represent this class as an array of sections ready to be JSONized. + */ + public function to_json_array() + { + $json_array = array('class' => $this->getName(), + 'sections' => array()); + foreach ($this->sections as $section) + { + $json_array['sections'][] = $section->to_json_array(); + } + + return $json_array; + } } diff --git a/class.section.php b/class.section.php --- a/class.section.php +++ b/class.section.php @@ -278,4 +278,82 @@ class Section return $out; } + + /** + * \brief + * Splits up a section specifier into dept, course number, and + * section. + * + * For example, will return array('CS', '262', 'A') for 'CS-262-A' + * or 'CS262A' or 'cs-262a'. This function is not for dealing with + * course synonyms. + * + * \param $section_spec + * A string starting with a section specifier. If only the + * department is found, an array of size one is returned. If the + * course number is also found, both department and course id are + * returned. If all three are found, the array has three elements. + * + * This array is keyed, so the found items may be referred to as + * 'department', 'course', and 'section'. + * + * \return + * An array with the department, course number, and section + * identifier. 
This array may be empty or have from one through + * three elements depending on the validity and precision of the + * $section_spec. + */ + public static function parse($section_spec) + { + $ret = array(); + + $section_spec = trim($section_spec); + if (!preg_match(';([a-zA-Z]+)[^0-9]*;', $section_spec, $dept_matches)) + return $ret; + + /* + * remove away the already-parsed stuff, including gunk between the + * dept and the course num. + */ + $section_spec = trim(substr($section_spec, strlen($dept_matches[0]))); + $ret['department'] = strtoupper($dept_matches[1]); + + if (!preg_match(';([0-9]+)[^a-zA-Z0-9]*;', $section_spec, $course_matches)) + return $ret; + + /* skip gunk */ + $section_spec = trim(substr($section_spec, strlen($course_matches[0]))); + $ret['course'] = $course_matches[1]; + + /* + * we accept _either_ alphabetic section _or_ numeric section (the + * latter is for cedarville, particulaly) + */ + if (!preg_match(';([0-9]+|[a-zA-Z]+);', $section_spec, $section_matches)) + return $ret; + + $ret['section'] = strtoupper($section_matches[1]); + + return $ret; + } + + /** + * \brief + * Get an array of information needed by the AJAX stuff. + */ + public function to_json_array() + { + static $daymap = array(0 => 'm', 1 => 't', 2 => 'w', 3 => 'u', 4 => 'f'); + + $json_array = array('section' => $this->letter, + 'prof' => $this->prof, + 'time_start' => $this->start, + 'time_end' => $this->tend, + 'days' => array(), + ); + for ($day = 0; $day < 5; $day ++) + $json_array['days'][$daymap[$day]] = $this->getDay($day); + + return $json_array; + } } diff --git a/inc/class.semester.inc b/inc/class.semester.inc new file mode 100644 --- /dev/null +++ b/inc/class.semester.inc @@ -0,0 +1,176 @@ + + * + * This file is a part of slate_permutate. 
+ * + * slate_permutate is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * slate_permutate is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with slate_permutate. If not, see . + */ + +$root_dir = dirname(__FILE__) . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR; +require_once($root_dir . 'class.class.php'); +require_once($root_dir . 'class.section.php'); + +/** + * \brief + * Identifies a school semester and acts as a container for courses + * offered in a semester. + */ +class Semester +{ + /** + * \brief + * The Fall season. + */ + const SEASON_FALL = 'fall'; + + /** + * \brief + * The Spring season. + */ + const SEASON_SPRING = 'spring'; + + /** + * \brief + * Instantiate an empty Semester. + * + * \param $year + * The year of this semester. Must be four digits. + * \param $season + * The season of this semester. Currently, only + * Semester::SEASON_SPRING and Semester::SEASON_FALL are valid. + */ + function __construct($year, $season) + { + if (!in_array($season, array(self::SEASON_SPRING, self::SEASON_FALL))) + throw new ErrorException('Attempt to construct a Semester with a $season which is neither Semester::SEASON_SPRING nor Semester::SEASON_FALL. `' . $season . '\' was given.'); + $this->season = $season; + + if (strlen($year) != 4) + throw new ErrorException('Attempt to construct a Semester with an invalid year. The given year is `' . $year . '\''); + $this->year = $year; + + $this->departments = array(); + } + + /** + * \brief + * Add a class to this Semester. 
+ * + * \param $class + * The class/course to add. + */ + public function class_add(Classes $class) + { + $class_parts = Classes::parse($class->getName()); + if (!isset($class_parts['course'])) + throw new ErrorException('I was given a class with an invalid name: `' . $class->getName() . '\''); + + if (!isset($this->departments[$class_parts['department']])) + $this->departments[$class_parts['department']] = array(); + $department =& $this->departments[$class_parts['department']]; + + $department[$class_parts['course']] = $class; + } + + /** + * \brief + * Retrieve a class. + * + * \param $dept + * The class's department. 'CS' for 'CS-262'. + * \param $class + * The course/class number. '262' for 'cs-262'. + * \return + * A Classes or NULL if not found. + */ + public function class_get($dept, $class) + { + if (!isset($this->departments[$dept][$class])) + return NULL; + + return $this->departments[$dept][$class]; + } + + /** + * \brief + * Gets a list of departments available in this semester. + */ + public function departments_get() + { + return array_keys($this->departments); + } + + /** + * \brief + * Gets a list of class/course numbers available for a particular + * department. + */ + public function department_classes_get($dept) + { + if (!isset($this->departments[$dept])) + throw new ErrorException('I was asked for a department I don\'t own: ' . $dept); + + return array_keys($this->departments[$dept]); + } + + /** + * \brief + * Utility function to add a section to the semester, + * automatically creating classes as necessary. + * + * \param $dept + * The department this section belongs to. + * \param $class + * The class this section belongs to. + * \param $section + * The section itself. + */ + public function section_add($dept, $class, Section $section) + { + $dept = strtoupper($dept); + $class = strtoupper($class); + + if (!isset($this->departments[$dept]) + || !isset($this->departments[$dept][$class])) + { + $classobj = new Classes($dept . '-' .
$class); + $this->class_add($classobj); + } + else + { + $classobj = $this->departments[$dept][$class]; + } + + $classobj->section_add($section); + } + + /** + * \brief + * Get a semester's year. + */ + public function year_get() + { + return $this->year; + } + + /** + * \brief + * Get a semester's season. + */ + public function season_get() + { + return $this->season; + } +} diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc new file mode 100644 --- /dev/null +++ b/inc/school.crawl.inc @@ -0,0 +1,96 @@ + + * + * This file is a part of slate_permutate. + * + * slate_permutate is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * slate_permutate is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with slate_permutate. If not, see . + */ + +/** + * \file + * Routines that are only useful when crawling schools' websites for + * autofill section data. + */ + +/** + * \brief + * Parse a simple time string into slate_permutate's time + * representation. + * + * \param $time + * An array compatible with the return value of strptime(). The only + * fields we use are 'tm_hour', which is from 0 through 23, and + * 'tm_min', which may be from 0 through 59. + */ +function school_crawl_time_format($time) +{ + return sprintf('%02d%02d', $time['tm_hour'], $time['tm_min']); +} + +/** + * \brief + * Take an array of day names and assemble them into + * slate_permutate's internal (weird) representation of a set of + * weekdays. 
+ * + * This function is intended to make it easy for one to take the + * output of an explode() call. For example, to decode $days_str = + * 'Monday, Tuesday, Friday', one would do + * school_crawl_days_format(explode(', ', $days_str)); + * + * \param $days + * An array of day names. These may be common abbreviations or + * truncations (any truncations must be two chars long for + * simplicity. One-char representations are supported, however, but + * use 'm', 't', 'w', 'h', 'f' to distinguish tuesday and + * thursday). Case does not matter. + * \return + * slate_permutate's strange internal days representation. + */ +function school_crawl_days_format($days) +{ + static $daymap_1 = array('m' => 1, 't' => 2, 'w' => 3, 'h' => 4, 'f' => 5); + static $daymap_2 = array('th' => 'h'); + + $my_days = array(); + foreach ($days as $day) + { + $day_orig = $day; + $day = strtolower(substr(trim($day), 0, 2)); + + /* + * convert from two-char representation to one-char + * representation. + */ + if (strlen($day) > 1) + { + if (isset($daymap_2[$day])) + $day = $daymap_2[$day]; + else + $day = substr($day, 0, 1); + } + if (isset($daymap_1[$day])) + $my_days[$daymap_1[$day]] = TRUE; + else + error_log('school_crawl_days_format() got invalid day specifier:' + . ' `' . $day_orig . '\' => `' . $day . '\''); + } + + $day_str = ''; + foreach ($my_days as $day_val => $junk) + $day_str .= $day_val; + + return $day_str; +} diff --git a/inc/school.inc b/inc/school.inc --- a/inc/school.inc +++ b/inc/school.inc @@ -46,27 +46,48 @@ * \param $school_id * The school's alphanumeric identifier (which determines the name * of the school's *.inc file). + * \param $load_all_inc + * Asks for a school's extraneous .inc files to be loaded + * too. Intended for use by rehash.php only. * \return * A school_profile handle or NULL on error. 
*/ -function school_load($school_id) +function school_load($school_id, $load_all_inc = FALSE) { $school = array('id' => $school_id); /* guard against cracking attempts (protects against '../' and friends) */ if (!preg_match('/^[0-9a-z]+$/', $school_id)) return NULL; - $school_file_name = dirname(__FILE__) . DIRECTORY_SEPARATOR - . '..' . DIRECTORY_SEPARATOR . 'school.d' . DIRECTORY_SEPARATOR . $school_id . '.inc'; + $school_file_name_base = dirname(__FILE__) . DIRECTORY_SEPARATOR + . '..' . DIRECTORY_SEPARATOR . 'school.d' . DIRECTORY_SEPARATOR; + $school_file_name = $school_file_name_base . $school_id . '.inc'; if (!file_exists($school_file_name)) return NULL; require_once($school_file_name); + if ($load_all_inc) + { + $school_crawl_file_name = $school_file_name_base . $school_id . '.crawl.inc'; + if (file_exists($school_crawl_file_name)) + require_once($school_crawl_file_name); + } $school_info = $school_id . '_info'; $school += $school_info(); + /* + * append small amount of info from the cache entry for this school: + * whether or not it was crawled. + * + * Perhaps this stuff should be just moved into the _info function + * for efficiency. + */ + $cache = _school_cache_load(); + if ($cache && count($cache['list'])) + $school['crawled'] = $cache['list'][$school['id']]['crawled']; + return $school; } diff --git a/school.d/calvin.crawl.inc b/school.d/calvin.crawl.inc new file mode 100644 --- /dev/null +++ b/school.d/calvin.crawl.inc @@ -0,0 +1,565 @@ + + * + * This file is a part of slate_permutate. + * + * slate_permutate is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * slate_permutate is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with slate_permutate. If not, see . + */ + +/** + * \brief + * Crawl's Calvin's registration course listing pages. + * + * \param $semester + * The Semester object which I should populate. + * \param $verbosity + * How verbose I should be. Sensicle range is from 0 through 10. + */ +function calvin_crawl(Semester $semester, $verbosity = 1) +{ + /** + * collect a few pbasic stats + */ + $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0); + + /** + * The first link we start at is the one from KV into WebAdvisor. + * + * 1. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL + * + * + * Calls javascript:getWindowHTML(). This merely adds + * TOKENIDX=NULL to the query string, so we can skip this step + * and just have TOKENIDX=NULL. + * + * 2. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL + * + * + * In the above, the second argument to setWindowHTML() is + * random. Thus, we have to capture this value. + */ + + $cookies = array(); + + $baseuri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL'; + + $token_uri = $baseuri . '&TOKENIDX=NULL'; + $token_html = calvin_crawl_noscript_filter(geturi($token_uri, $cookies)); + if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches)) + { + fprintf(STDERR, "Could not steal the token\n"); + return 1; + } + $token = $matches[1]; + + if ($verbosity > 5) + { + echo 'token: ' . $token . "\n"; + echo "\n"; + } + + /* + * here we have arrived at the main webadvisor screen which lists the + * search form. 
From here, we can get a list of all of the departments + * that Calvin College has and then know enough to query each + * individual department for courses. + */ + $uri = $baseuri . '&TOKENIDX=' . $token; + $departments_html = calvin_crawl_noscript_filter(geturi($uri, $cookies)); + + $departments_dom = new DOMDocument(); + $departments_dom->loadHTML($departments_html); + + /* + * Discover the available semesters + */ + $semesters_select_nodes = $departments_dom->getElementById('VAR1')->childNodes; + $semester_strs = array(); + foreach ($semesters_select_nodes as $semester_node) + { + if ($semester_node->tagName != 'option' + || !$semester_node->hasAttribute('value')) + continue; + $semester_strs[$semester_node->getAttribute('value')] = + $semester_node->nodeValue; + } + + $departments_select_nodes = $departments_dom->getElementById('LIST_VAR1_1')->childNodes; + $departments = array(); + foreach ($departments_select_nodes as $dept_node) + { + if ($dept_node->tagName != 'option' + || !$dept_node->hasAttribute('value')) + continue; + $departments[$dept_node->getAttribute('value')] = + $dept_node->nodeValue; + } + + + /* + * get all of the different possible course levels... dynamically + * rather than hardcodedly ;-). + */ + $departments_select_nodes = $departments_dom->getElementById('LIST_VAR1_2')->childNodes; + $course_levels = array(); + foreach ($departments_select_nodes as $courselevel_node) + { + if ($courselevel_node->tagName != 'option' + || !$courselevel_node->hasAttribute('value')) + continue; + $course_levels[] = $courselevel_node->getAttribute('value'); + } + + $return_url = dom_input_value($departments_dom, 'RETURN.URL'); + + + /* ARCT only has >=200 level courses */ + $dept = ''; + $course_level = ''; + $semester_str = substr($semester->year_get(), 2) . 
'/'; + switch ($semester->season_get()) + { + case Semester::SEASON_SPRING: + $semester_str .= 'SP'; + break; + + case Semester::SEASON_FALL: + $semester_str .= 'FA'; + break; + } + if (!isset($semester_strs[$semester_str])) + error_log('Couldn\'t find a semester in Calvin\'s database for ' . $semester_str . ' (' . $semester->season_get() . ', ' . $semester->year_get() . ')'); + + + /* + * LIST.VAR_: is the column, is the row. There + * are apparently a max of 5 rows (see the LIST.VAR_MAX + * below). + * + * Columns: + * LIST.VAR1: department + * LIST.VAR2: course_level + * LIST.VAR3: IIRC, a course identifier, such as 156 from MATH-156 + * LIST.VAR4: I forget + * + */ + $form = array('VAR1' => $semester_str, + 'LIST.VAR1_1' => $dept, + 'LIST.VAR2_1' => $course_level, + ); + + /* + * other form items we're not querying but which need to be + * sent blankly + */ + $form += array( + 'RETURN.URL' => $return_url, + 'SUBMIT_OPTIONS' => '', + /* + * The submit button... its value="" key is + * apparently sent with the form... makes a + * little bit of sense I guess ;-). + */ + /*'SUBMIT2' => 'SUBMIT',*/ + + 'DATE.VAR1' => '', + 'DATE.VAR2' => '', + + 'LIST.VAR1_CONTROLLER' => 'LIST.VAR1', + 'LIST.VAR1_MEMBERS' => 'LIST.VAR1*LIST.VAR2*LIST.VAR3*LIST.VAR4', + ); + foreach (array('1', '2', '3', '4') as $list_col) + { + $colname = 'LIST.VAR' . $list_col; + if (!isset($form[$colname . '_MAX'])) + $form[$colname . '_MAX'] = '5'; + + foreach (array('1', '2', '3', '4', '5') as $list_row) + { + $rowname = $colname . '_' . $list_row; + if (!isset($form[$rowname])) + $form[$rowname] = ''; + } + } + + /* + * VAR7 and VAR 8 is a constraint of times during which + * courses meet + */ + $form['VAR7'] = ''; + $form['VAR8'] = ''; + + /* ``course title keywords'' */ + $form['VAR3'] = ''; + + /* ? 
*/ + $form['VAR6'] = ''; + $form['VAR21'] = ''; + + /* instructor's last name */ + $form['VAR9'] = ''; + + /* + * VAR10 through VAR16 are Monday through Sunday checkboxes + * for days of the week that classes meet. + * + * But we specify no days of the week to avoid this being a + * constraint ;-). + */ + /* + for ($day = 10; $day <= 16; $day ++) + $form['VAR' . $day] = ''; + */ + + /* + * pages is populated by preg_match() below after the first looping. + */ + $pages = array(1 => 0, 2=> 1); + while ($pages[1] < $pages[2]) + { + $html = calvin_crawl_noscript_filter(geturi($uri, $cookies, $form)); + + $results_dom = new DOMDocument(); + $results_dom->loadHTML($html); + + $list_done = FALSE; + for ($list_row = 1; !$list_done; $list_row ++) + { + /* either 'Open' (or 'Closed'?) */ + $openness = dom_input_value($results_dom, 'LIST.VAR1_' . $list_row); + $sec_short_title = dom_id_content($results_dom, 'SEC_SHORT_TITLE_' . $list_row); + $sec_meeting_info = dom_id_content($results_dom, 'SEC_MEETING_INFO_' . $list_row); + + /* check if we're done with this particular page */ + if (!strlen($openness) && !strlen($sec_short_title) && !strlen($sec_meeting_info)) + { + $list_done = TRUE; + break; + } + + /* + * the same info below should be gettable with + * dom_id_content($results_dom, 'SEC_FACULTY_INFO_' . $list_row); + */ + $faculty_name = dom_input_value($results_dom, 'SEC.FACULTY.INFO_' . $list_row); + $credits = dom_input_value($results_dom, 'SEC.MIN.CRED_' . $list_row); /* or id="SEC_FACULTY_INFO_$list_row" */ + $comment = dom_id_content($results_dom, 'SEC_COMMENTS_' . $list_row); /* or name="SEC.COMMENTS_$list_row" */ + + /* parse */ + $section_id = Section::parse($sec_short_title); + + if ($verbosity > 6) + { + echo "\n"; + echo implode('-', $section_id) . ': ' . $sec_short_title . "\n"; + echo $openness . "\n"; + echo $sec_meeting_info . "\n"; + echo $faculty_name . "\n"; + echo $credits . "\n"; + echo $comment . 
"\n"; + } + + /* + * The input format for this is, thankfully, pretty rigid + * :-D. Example input format: + * + * '01/31/2011-05/11/2011 Lecture Monday, Wednesday 01:00PM - 03:50PM, Spoelhof Center, Room 101' + * + * OR + * + * '01/31/2011-05/18/2011 Practicum Days to be Announced, Times to be AnnouncedTo Be Arranged, Room TBA' + * + * In this latter case.... we'll just ignore the section. + * + * At this point, we don't parse most tokens. We group them + * off. We get the first date, the second date, the type + * ('Lecture', 'Practicum', or some other unknown value), + * the list of days of week the section meets, the start + * time, the end time, and then the meeting location. + */ + if (strpos($sec_meeting_info, 'Times to be Announced') !== FALSE + || strpos($sec_meeting_info, 'Days to be Announced') !== FALSE) + { + if ($verbosity > 2) + error_log('Skipping class because of incomplete meeting time information: ' + . implode('-', $section_id) . ' has meeting info of `' + . $sec_meeting_info . '\''); + $skipped_sections['incomplete meeting info'] ++; + continue; + } + + if (!preg_match(';^([0-9]{2}/[0-9]{2}/[0-9]{4})-([0-9]{2}/[0-9]{2}/[0-9]{4}) ([^ ]+) ([^0-9]+) ([^ ]+) - ([^ ]+), (.*)$;', $sec_meeting_info, $meeting_info_matches)) + { + error_log('Unable to parse calvin section meeting info string into start/end/days information for ' + . implode('-', $section_id) . ': ``' . $sec_meeting_info . 
'\'\''); + $skipped_sections['invalid meeting info format'] ++; + continue; + } + $date_start = $meeting_info_matches[1]; + $date_end = $meeting_info_matches[2]; + /* e.g., 'Lecture', 'Practicum' */ + $meeting_type = $meeting_info_matches[3]; + $days = school_crawl_days_format(explode(', ', $meeting_info_matches[4])); + $time_start = school_crawl_time_format(strptime($meeting_info_matches[5], '%I:%M%p')); + $time_end = school_crawl_time_format(strptime($meeting_info_matches[6], '%I:%M%p')); + $meeting_place = $meeting_info_matches[7]; + + if ($verbosity > 5) + foreach (array('date_start', 'date_end', 'meeting_type', 'days', 'time_start', 'time_end', 'meeting_place') as $var) + echo $var . ':' . ${$var} . "\n"; + + $section = new Section($section_id['section'], $faculty_name, $time_start, $time_end, $days); + $semester->section_add($section_id['department'], $section_id['course'], $section); + } + + if (!preg_match(';Page ([0-9]+) of ([0-9]+)\$;m', $html, $pages)) + { + error_log('Unable to determine the number of pages in this Calvin resultset'); + break; + } + + if ($verbosity > 0) + { + echo 'calvin_crawl(): finished page ' . $pages[1] . ' of ' . $pages[2] . ' with ' . ($list_row - 1) . " courses.\n"; + } + + $form = array( + 'ACTION*Grp:WSS.COURSE.SECTIONS' => 'NEXT', + ); + } + + $has_stat = FALSE; + if ($verbosity > 1) + foreach ($skipped_sections as $reason => $num) + { + if (!$num) + continue; + if (!$has_stat) + error_log('Skipped some sections for : :'); + error_log($reason . ': ' . $num); + } + + return 0; +} + +/** + * \brief + * Simulate some aspects of a web browser while retreiving a + * document. + * + * This allows us to view our cookies in an associative array and to + * have the server's response automatically update our cookies. + * + * If $post is specified as an associative array, an HTTP POST is + * performed and the data is encoded properly as if we were performing + * a form submission. + * + * Follows redirects. 
+ * If there is a redirect, the page from which you are redirected is
+ * lost... but few people put any information on those pages anyways
+ * ;-). At most six redirects are followed; after that the body of
+ * the last response received is returned as-is.
+ *
+ * \param $uri
+ *   The URL to fetch. If a redirect occurs, this is updated.
+ * \param $cookies
+ *   An associative array of cookies and where to save new cookies.
+ *   Only the name=value pair of each Set-Cookie header is stored;
+ *   cookie attributes (path, expires, ...) are discarded.
+ * \param $post
+ *   If not NULL, causes an HTTP POST. In that case, should be a
+ *   non-empty associative array of form keys/values.
+ * \param $verbosity
+ *   How verbose to be.
+ * \param $loopspin
+ *   An internal variable to prevent us from following perpetual
+ *   redirects.
+ * \return
+ *   The body of the document returned by the server (normally
+ *   malformed HTML, especially with Calvin's WebAdvisor
+ *   installation).
+ */
+function geturi(&$uri, &$cookies, $post = NULL, $verbosity = 0, $loopspin = 0)
+{
+  global $geturi_write_buf, $geturi_headers_buf, $geturi_verbosity;
+
+  if ($verbosity > 5)
+    {
+      echo "\n";
+      echo 'geturi(' . $uri . ")\n";
+      echo "\n";
+    }
+
+  $curl = curl_init();
+
+  /* the cURL callbacks below append to these globals */
+  $geturi_verbosity = $verbosity;
+  $geturi_write_buf = '';
+  $geturi_headers_buf = '';
+  curl_setopt($curl, CURLOPT_URL, $uri);
+
+  $cookies_str = '';
+  foreach ($cookies as $key => $val)
+    {
+      if (strlen($cookies_str))
+        $cookies_str .= ';';
+      $cookies_str .= $key . '=' . $val;
+    }
+
+  if ($verbosity > 8)
+    echo 'cookies sent: ' . $cookies_str . "\n";
+  curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
+  curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'geturi_header_cb');
+  curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'geturi_write_cb');
+
+  if (is_array($post) && count($post))
+    {
+      $posttxt = '';
+      foreach ($post as $postkey => $postval)
+        {
+          /*
+           * Values of keys containing `MEMBER' are sent verbatim
+           * (presumably so that their `*' separators reach the
+           * server unescaped); everything else is urlencoded.
+           */
+          $posttxt .= (strlen($posttxt) ? '&' : '')
+            . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
+        }
+      if ($verbosity > 8)
+        echo 'setting POST to ' . $posttxt . "\n";
+
+      /* supplying POSTFIELDS is what turns the request into a POST */
+      curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
+    }
+
+  if (curl_exec($curl) === FALSE)
+    error_log('geturi(): unable to fetch `' . $uri . '\': ' . curl_error($curl));
+  curl_close($curl);
+
+  $location = NULL;
+  foreach (explode("\r\n", $geturi_headers_buf) as $header)
+    {
+      /*
+       * yes, we don't want the line if the first char is a ':' or if
+       * it has no ':' (e.g., the status line and the blank line)
+       */
+      if (!strpos($header, ':'))
+        continue;
+      /*
+       * Split on the bare ':' and trim so that headers without a
+       * space after the colon are still understood.
+       */
+      list($header_name, $header_val) = explode(':', $header, 2);
+      $header_name = trim($header_name);
+      $header_val = trim($header_val);
+
+      if ($verbosity > 8)
+        echo $header_name . ' : ' . $header_val . "\n";
+
+      /* HTTP header names are case-insensitive */
+      switch (strtolower($header_name))
+        {
+        case 'set-cookie':
+          /*
+           * Only the leading name=value pair is the cookie itself;
+           * whatever follows the first ';' are attributes which must
+           * not be echoed back to the server.
+           */
+          $cookie_pieces = explode(';', $header_val, 2);
+          $cookie_pair = explode('=', $cookie_pieces[0], 2);
+          if (count($cookie_pair) < 2)
+            break;
+          $cookie_name = trim($cookie_pair[0]);
+          $cookie_val = trim($cookie_pair[1]);
+          if ($verbosity > 9)
+            {
+              if (isset($cookies[$cookie_name]))
+                echo 'Replacing cookie ' . $cookie_name . '=' . $cookies[$cookie_name]
+                  . ' with ';
+              echo 'new cookie ' . $cookie_name . '=' . $cookie_val . "\n";
+            }
+          $cookies[$cookie_name] = $cookie_val;
+          break;
+
+        case 'location':
+          /* rewrite Calvin's WebAdvisor redirects to the walive/ path */
+          $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $header_val);
+          /* the request following a redirect is a plain GET */
+          $post = NULL;
+          break;
+        }
+    }
+
+  if ($verbosity > 9)
+    echo $geturi_write_buf;
+  if ($location && $loopspin < 6)
+    {
+      $uri = $location;
+      /* pass $verbosity through so that $loopspin really advances */
+      return geturi($uri, $cookies, $post, $verbosity, $loopspin + 1);
+    }
+  return $geturi_write_buf;
+}
+
+/**
+ * \brief
+ *   cURL header callback which collects raw response headers for
+ *   geturi().
+ *
+ * \param $curl
+ *   The cURL handle (unused).
+ * \param $header_buf
+ *   A piece of header data handed over by cURL.
+ * \return
+ *   The number of bytes accepted, which is all of $header_buf.
+ */
+function geturi_header_cb($curl, $header_buf)
+{
+  global $geturi_headers_buf;
+  $geturi_headers_buf .= $header_buf;
+  return strlen($header_buf);
+}
+
+/**
+ * \brief
+ *   cURL write callback which collects the response body for
+ *   geturi().
+ *
+ * \param $curl
+ *   The cURL handle (unused).
+ * \param $write_buf
+ *   A piece of body data handed over by cURL.
+ * \return
+ *   The number of bytes accepted, which is all of $write_buf.
+ */
+function geturi_write_cb($curl, $write_buf)
+{
+  global $geturi_write_buf;
+  $geturi_write_buf .= $write_buf;
+  return strlen($write_buf);
+}
+
+/**
+ * \brief
+ *   Find an input element and return its value attribute.
+ *
+ * \param $domdocument
+ *   The DOMDocument to search.
+ * \param $name
+ *   The name attribute of the element.
+ * \return
+ *   The value attribute of the input element or NULL if not found.
+ */
+function dom_input_value($domdocument, $name)
+{
+  /* look anywhere in the document for an input carrying this name */
+  $finder = new DOMXPath($domdocument);
+  $matches = $finder->query('/descendant::input[attribute::name="' . $name . '"]');
+
+  /* only the first match is considered */
+  $first_match = $matches->length ? $matches->item(0) : NULL;
+  if ($first_match && $first_match->hasAttribute('value'))
+    return $first_match->getAttribute('value');
+
+  return NULL;
+}
+
+/**
+ * \brief
+ *   Fetch the text content of the element carrying a given ID.
+ *
+ * A thin convenience wrapper around DOMDocument::getElementById().
+ *
+ * \param $domdocument
+ *   A DOMDocument to search.
+ * \param $id
+ *   The id attribute of the element whose content is requested.
+ * \return
+ *   A UTF-8 string of the contents of the given element or NULL if
+ *   the element isn't found.
+ */
+function dom_id_content($domdocument, $id)
+{
+  $element = $domdocument->getElementById($id);
+  return $element ? $element->nodeValue : NULL;
+}
+
+/**
+ * \brief
+ *   Searches for and removes a