# HG changeset patch # User Nathan Phillip Brink # Date 2011-02-08 22:50:38 # Node ID 3b78fdf04ce48b84ba1dce75ee1c129a5a8d3c51 # Parent 7845f9d7ad5e8f76e2c5bdb3d244a0dbbcf15c1e Support multiple semesters. Changed crawler API to accept an array to which semesters are appended instead of passing the crawlers an already-created semester object to fill. diff --git a/auto.php b/auto.php --- a/auto.php +++ b/auto.php @@ -63,8 +63,10 @@ if (!count($term_parts)) { if (!$school['crawled']) { clean_empty_exit(); } +$semester = school_semester_guess($school); -$cache_dir = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'cache' . DIRECTORY_SEPARATOR . 'auto' . DIRECTORY_SEPARATOR . $school['id'] . DIRECTORY_SEPARATOR; +$cache_dir = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'cache' . DIRECTORY_SEPARATOR . 'auto' + . DIRECTORY_SEPARATOR . $school['id'] . DIRECTORY_SEPARATOR . $semester['id'] . DIRECTORY_SEPARATOR; /* * autocomplete the list of departments. If the user has already diff --git a/inc/admin.inc b/inc/admin.inc --- a/inc/admin.inc +++ b/inc/admin.inc @@ -126,43 +126,76 @@ function school_cache($schools) */ if ($school['crawled'] && !isset($school['crawled_notreally'])) { - $semester = $school['crawled_semester']; - $cache_auto_school_dir_name = $cache_auto_dir_name . $school['id'] . DIRECTORY_SEPARATOR; if (!is_dir($cache_auto_school_dir_name)) { - if (!mkdir($cache_auto_school_dir_name, 0777, TRUE)) + if (!mkdir($cache_auto_school_dir_name, 0755, TRUE)) error_log('Unable to create needed directory: `' . $cache_auto_dir_name . '\''); } - $departments = $semester->departments_get(); - sort($departments); + $semesters = array(); + $semester_weights = 0; + /* + * Try to presort the semesters into the proper order based + * on time_start/time_end. We want the older semesters to be + * nearer to the end of the list. 
This way, the crawler + * doesn't have to decide how to sort the semesters itself: + */ + usort($semesters, function ($sem_a, $sem_b) + { + return $sem_a->time_start_get() - $sem_b->time_start_get(); + }); - $dept_file = fopen($cache_auto_school_dir_name . '-depts', 'wb'); - fwrite($dept_file, serialize($departments)); - fclose($dept_file); - - /* now per-department autocomplete */ - foreach ($departments as $department) + foreach ($school['crawled_semesters'] as $semester) { - $classes = $semester->department_classes_get($department); - $classes_file = fopen($cache_auto_school_dir_name . $department . '.sects', 'wb'); - fwrite($classes_file, serialize($classes)); - fclose($classes_file); + $semesters[$semester->id()] = array( + 'id' => $semester->id(), + 'time_start' => $semester->time_start_get(), + 'time_end' => $semester->time_end_get(), + 'weight' => $semester_weights ++, + 'name' => $semester->name_get(), + ); - /* now individual section informations, pre-JSON-ized */ - foreach ($classes as $class) + $cache_auto_school_semester_dir_name = $cache_auto_school_dir_name . $semester->id() . DIRECTORY_SEPARATOR; + if (!is_dir($cache_auto_school_semester_dir_name)) { - if (!is_dir($cache_auto_school_dir_name . $department)) - mkdir($cache_auto_school_dir_name . $department); - $class_file = fopen($cache_auto_school_dir_name . $department . DIRECTORY_SEPARATOR . $class, 'wb'); - fwrite($class_file, json_encode($semester->class_get($department, $class)->to_json_array())); - fclose($class_file); + if (!mkdir($cache_auto_school_semester_dir_name, 0755, TRUE)) + error_log('Unable to create needed directory: `' . $cache_auto_school_semester_dir_name . '\''); } - } + + $departments = $semester->departments_get(); + sort($departments); + + $dept_file = fopen($cache_auto_school_semester_dir_name . 
'-depts', 'wb'); + fwrite($dept_file, serialize($departments)); + fclose($dept_file); + + /* now per-department autocomplete */ + foreach ($departments as $department) + { + $classes = $semester->department_classes_get($department); + $classes_file = fopen($cache_auto_school_semester_dir_name . $department . '.sects', 'wb'); + fwrite($classes_file, serialize($classes)); + fclose($classes_file); + + /* now individual section informations, pre-JSON-ized */ + foreach ($classes as $class) + { + if (!is_dir($cache_auto_school_semester_dir_name . $department)) + mkdir($cache_auto_school_semester_dir_name . $department); + $class_file = fopen($cache_auto_school_semester_dir_name . $department . DIRECTORY_SEPARATOR . $class, 'wb'); + fwrite($class_file, json_encode($semester->class_get($department, $class)->to_json_array())); + fclose($class_file); + } + } + } /* foreach ( => $semester) */ + /* + * Store/cache the semester metadata: + */ + $semesters_file = fopen($cache_auto_school_dir_name . '-semesters', 'wb'); + fwrite($semesters_file, serialize($semesters)); + fclose($semesters_file); } - - } uasort($list_cache, 'school_cmp'); @@ -195,13 +228,15 @@ function school_cache($schools) * from a school's main .inc file. Thus, if a school supports * crawling, it will have a file called * schools.d/.crawl.inc. In this file, a function called - * _crawl($semester) must be defined. It must accept one - * argument, the Semester object which defines the time of year for - * which courses should be retrieved. It must populate this empty - * Semester object with Course object and populate those courses with - * the sections with as much detail as possible. + * _crawl(array &$semesters, $verbosity = 1) must be + * defined. It must accept at least one argument, the array to be + * filled with Semester objects. 
It must populate this array with + * individual Semester objects and fill those with Course objects and + * populate those courses with the sections with as much detail as + * possible. This function may return 1 to indicate an error and must + * return 0 to indicate success. * - * If the crawling is successful, a 'crawl' key is added to the + * If the crawling is successful, a 'crawled' key is added to the * $school handle. school_cache() will use this to help indicate that * a school _has_ autocomplete information, which might affect the * appearance and JS stuff for the input.php page. @@ -215,7 +250,7 @@ function school_cache($schools) * The season of the year of the semester for which we should grab * data. */ -function school_crawl(&$school, $semester_year, $semester_season, $verbosity = 1) +function school_crawl(array &$school, $verbosity = 1) { $school['crawled'] = FALSE; @@ -223,11 +258,11 @@ function school_crawl(&$school, $semeste if (!function_exists($school_crawl_func)) return; - $semester = new Semester($semester_year, $semester_season); + $semesters = array(); if ($verbosity > 0) fprintf(STDERR, "%s()\n", $school_crawl_func); - $ret = $school_crawl_func($semester, $verbosity); + $ret = $school_crawl_func($semesters, $verbosity); if ($ret) { fprintf(STDERR, "Crawling %s failed: %s() returned nonzero\n", @@ -236,7 +271,7 @@ function school_crawl(&$school, $semeste return; } $school['crawled'] = TRUE; - $school['crawled_semester'] = $semester; + $school['crawled_semesters'] = $semesters; if ($verbosity > 0) fwrite(STDERR, "\n"); @@ -256,9 +291,6 @@ function school_crawl(&$school, $semeste */ function school_cache_recreate($crawl_only = NULL, $verbosity = 1) { - $crawl_semester_year = '2011'; - $crawl_semester_season = Semester::SEASON_SPRING; - $school_id_list = school_list(); if (!$school_id_list) { @@ -292,7 +324,7 @@ function school_cache_recreate($crawl_on if ($crawl_only === NULL || in_array($school['id'], $crawl_only)) { - school_crawl($school, 
$crawl_semester_year, $crawl_semester_season, $verbosity); + school_crawl($school, $verbosity); } else { diff --git a/inc/class.page.php b/inc/class.page.php --- a/inc/class.page.php +++ b/inc/class.page.php @@ -152,6 +152,7 @@ class page /* everything that needs sessions started to work: */ $this->school = school_load_guess(); + $this->semester = school_semester_guess($this->school); if($immediate && $ntitle != "NOHEAD") @@ -229,7 +230,11 @@ class page '

SlatePermutate

'. PHP_EOL . '

'. PHP_EOL . ' '.$this->pagetitle.''. PHP_EOL . - ' Profile: '.$this->school['name'].' (change)'. PHP_EOL . + ' ' . PHP_EOL + . ' Profile: '.$this->school['name'].' (change)' . PHP_EOL; + if ($this->semester !== NULL) + echo ' Semester: ' . $this->semester['name'] . '(change)' . PHP_EOL; + echo ' '. PHP_EOL . '

'. PHP_EOL . ' '. PHP_EOL . ' '. PHP_EOL . @@ -319,6 +324,52 @@ class page /** * \brief + * Display a list of semesters the user might be interested in. + * \param $linkto + * The link to which a &semester= or ?semester= query string + * should be appended. + */ + public function showSemesters($linkto = 'input.php') + { + if (strpos($linkto, '?')) + $linkto .= '&'; + else + $linkto .= '?'; + /* + * We can pre-htmlentities() $linkto because we're only appending + * a safe string. + */ + $linkto = htmlentities($linkto . 'semester='); + + $time = time(); + + echo "

\n"; + echo "

    \n"; + foreach (school_semesters($this->school) as $semester) + { + $text_extra = array(); + $class_extra = ''; + if ($semester['id'] == $this->semester['id']) + { + $class_extra = ' highlight'; + $text_extra[] = 'selected'; + } + + if ($semester['time_start'] < $time && $semester['time_end'] > $time) + $text_extra[] = 'current'; + + $text_extra = implode($text_extra, ', '); + if (strlen($text_extra)) + $text_extra = ' (' . $text_extra . ')'; + + echo '
  • ' . htmlentities($semester['name']) . '' . $text_extra . "
  • \n"; + } + echo "
\n"; + echo "

\n"; + } + + /** + * \brief * Display school-specific instructions for using slate_permutate. */ public function showSchoolInstructions() diff --git a/inc/class.semester.inc b/inc/class.semester.inc --- a/inc/class.semester.inc +++ b/inc/class.semester.inc @@ -43,21 +43,35 @@ class Semester /** * \brief + * The Summer season. + */ + const SEASON_SUMMER = 'summer'; + + /** + * \brief * Instantiate an empty Semester. * * \param $year * The year of this semester. Must be four digits. * \param $season - * The season of this semester. Currently, only - * Semester::SEASON_SPRING and Semester::SEASON_FALL are valid. + * The season of this semester. Please use the constants + * Semester::SEASON_FALL, Semester::SEASON_SPRING, or + * Semester::SEASON_SUMMER if possible. + * \param $time_start + * Specify a timestamp which roughly estimates when this semester + * starts to aid the algorithm for guessing the current + * semester. See Semester::time_start_set(), which may be used + * instead of this parameter. + * \param $time_end + * This may be specified now or via Semester::time_end_set(). */ - function __construct($year, $season) + function __construct($year, $season, $time_start = 0, $time_end = 0) { - if (!in_array($season, array(self::SEASON_SPRING, self::SEASON_FALL))) - throw new ErrorException('Attempt to construct a Semester with a $season which is neither Semester::SEASON_SPRING nor Semester::SEASON_FALL. `' . $season . '\' was given.'); + $this->time_start = 0; + $this->time_end = 0; $this->season = $season; - if (strlen($year) != 4) + if (strlen($year) != 4 || !is_numeric($year)) throw new ErrorException('Attempt to construct a Semester with an invalid year. The given year is `' . $year . '\''); $this->year = $year; @@ -158,6 +172,82 @@ class Semester /** * \brief + * Update the time_end. + * + * The time_end is a unix timestamp roughly estimating the time at + * which a semester ends. It is used when guessing what semester a + * user is interested in. 
+ * + * \param $time_end + * The new time_end. + */ + public function time_end_set($time_end) + { + $this->time_end = $time_end; + } + + /** + * \brief + * Set the time_end only if it would make the semester end later. + * + * Useful for crawler scripts incrementally guessing the endtime of + * a semester. + * + * \param $time_end + * The new time_end to consider. + */ + public function time_end_set_test($time_end) + { + if ($time_end && $time_end > $this->time_end) + $this->time_end_set($time_end); + } + + public function time_end_get() + { + return $this->time_end; + } + + /** + * \brief + * Update the time_start. + * + * The time_start is a unix timestamp roughly estimating the time at + * which a semester starts. It is used when guessing what semester a + * user is interested in. + * + * \param $time_start + * The new time_start. + */ + public function time_start_set($time_start) + { + $this->time_start = $time_start; + } + + /** + * \brief + * Only update the time_start if the time_start isn't yet set or + * if the given time_start is earlier than the stored one. + * + * This should allow crawlers to easily accumulate proper time_start + * and time_end values, see Semester::time_end_set_test(); + * + * \param $time_start + * The new estimation of the semester's start. + */ + public function time_start_set_test($time_start) + { + if ($time_start && + (!$this->time_start || $time_start < $this->time_start)) + $this->time_start_set($time_start); + } + + public function time_start_get() + { + return $this->time_start; + } + + /** + * \brief * Get a semester's year. */ public function year_get() @@ -176,10 +266,38 @@ class Semester /** * \brief + * Get a semester's friendly name: + * + * \return + * A string, the semester's friendly name. + */ + public function name_get() + { + return ucfirst($this->season_get()) . ' ' . $this->year_get(); + } + + /** + * \brief + * Return an identification string for this semester. 
+ * + * Hopefully this identification string should be unique. Also, this + * identification string is filesystem-safe. + * + * \return + * A string which may be used in paths or to uniquely identify + * this semester in the context of its school. + */ + public function id() + { + return $this->year_get() . '_' . $this->season_get(); + } + + /** + * \brief * Enumerate all valid seasons. */ public static function seasons_get_all() { - return array(SEASON_SPRING, SEASON_FALL); + return array(self::SEASON_SPRING, self::SEASON_SUMMER, self::SEASON_FALL); } } diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc --- a/inc/school.crawl.inc +++ b/inc/school.crawl.inc @@ -41,6 +41,22 @@ function school_crawl_time_format($time) /** * \brief + * Equivalent of mktime() except that it accepts strptime()'s output + * format as an input. + * + * \param $tm + * An array formatted as the output of strptime(). + * \return + * A unix timestamp. + */ +function school_crawl_mktime(array $tm) +{ + return mktime($tm['tm_hour'], $tm['tm_min'], $tm['tm_sec'], + $tm['tm_mon'] + 1, $tm['tm_mday'], $tm['tm_year'] + 1900); +} + +/** + * \brief * Take an array of day names and assemble them into * slate_permutate's internal (weird) representation of a set of * weekdays. diff --git a/inc/school.inc b/inc/school.inc --- a/inc/school.inc +++ b/inc/school.inc @@ -43,6 +43,7 @@ * - name: a friendly name for the school. Must be a valid XHTML attribute string. * - url: the school's website URL as a valid XHTML attribute string. (i.e., escape ampersands). * - example_course_id: An example course identifier representative of a school's course IDs. (e.g., CS-101 for Calvin). + * - id: The school's ID. 
* * \param $school_id * The school's alphanumeric identifier (which determines the name @@ -60,8 +61,7 @@ function school_load($school_id, $load_a /* guard against cracking attempts (protects against '../' and friends) */ if (!preg_match('/^[0-9a-z]+$/', $school_id)) return NULL; - $school_file_name_base = dirname(__FILE__) . DIRECTORY_SEPARATOR - . '..' . DIRECTORY_SEPARATOR . 'school.d' . DIRECTORY_SEPARATOR; + $school_file_name_base = dirname(dirname(__FILE__)) . DIRECTORY_SEPARATOR . 'school.d' . DIRECTORY_SEPARATOR; $school_file_name = $school_file_name_base . $school_id . '.inc'; if (!file_exists($school_file_name)) @@ -87,7 +87,17 @@ function school_load($school_id, $load_a */ $cache = _school_cache_load(); if ($cache && count($cache['list']) && isset($cache['list'][$school['id']])) - $school['crawled'] = $cache['list'][$school['id']]['crawled']; + { + $school['crawled'] = $cache['list'][$school['id']]['crawled']; + + $school_semesters_filename = dirname(dirname(__FILE__)) . DIRECTORY_SEPARATOR . 'cache' + . DIRECTORY_SEPARATOR . 'auto' . DIRECTORY_SEPARATOR . $school['id'] + . DIRECTORY_SEPARATOR . '-semesters'; + if (file_exists($school_semesters_filename)) + $school['semesters'] = unserialize(file_get_contents($school_semesters_filename)); + else + $school['semesters'] = array(); + } return $school; } @@ -272,6 +282,68 @@ function school_instructions_html($schoo /** * \brief + * Return information about available semesters. + * + * \param $school + * The school. + * \return + * An array with keys being semester IDs ordered by weights with + * lowest first and keys of 'id' (the semester's ID), 'name' (the + * friendly name), and 'weight' (lower numbers mean these semesters + * should be earlier, may be positive or negative). 'time_start', + * 'time_end' are unix timestamps estimating the begin and end point + * of each semester. 
+ */ +function school_semesters(array $school) +{ + if (!$school['crawled']) + return array(); + return $school['semesters']; +} + +/** + * \brief + * Return the semester which either the user has selected or which + * makes the most sense. + * + * \param $school + * The school for which a semester should be guessed. + * \return + * An array with the keys 'id', 'name', and 'weight' corresponding + * to the same keys in the arrays returned by school_semesters() or + * NULL if no semester can be found. + */ +function school_semester_guess(array $school) +{ + $semesters = school_semesters($school); + + if (!empty($_REQUEST['semester']) + && isset($semesters[$_REQUEST['semester']])) + { + $semester = $semesters[$_REQUEST['semester']]; + $_SESSION['semester'] = $semester['id']; + return $semester; + } + + if (!empty($_SESSION['semester']) + && isset($semesters[$_SESSION['semester']])) + return $semesters[$_SESSION['semester']]; + + $time = time(); + $next_semester = FALSE; + $semester = NULL; + foreach ($semesters as $semester) + { + if ($next_semester) + return $semester; + if ($semester['time_start'] < $time) + $next_semester = TRUE; + } + return $semester; +} + +/** + * \brief * Return an array of default classes for a particular school. * * \param $school diff --git a/input.php b/input.php --- a/input.php +++ b/input.php @@ -101,6 +101,21 @@ if (!empty($_REQUEST['selectschool']) exit; } +if (!empty($_REQUEST['selectsemester'])) + { +?> +

Semester Selection

+

+ Choose the semester for which you wish you make a schedule from the + list below. If any semester is missing, please let us know. +

+showSemesters(); + $inputPage->foot(); + exit; + } + $inputPage->showSavedScheds($_SESSION); ?>

diff --git a/school.d/calvin.crawl.inc b/school.d/calvin.crawl.inc --- a/school.d/calvin.crawl.inc +++ b/school.d/calvin.crawl.inc @@ -22,19 +22,15 @@ * \brief * Crawl's Calvin's registration course listing pages. * - * \param $semester - * The Semester object which I should populate. + * \param $semesters + * An array to be filled with Semester objects which I should + * populate. * \param $verbosity * How verbose I should be. Sensicle range is from 0 through 10. */ -function calvin_crawl(Semester $semester, $verbosity = 1) +function calvin_crawl(array &$semesters, $verbosity = 1) { /** - * collect a few pbasic stats - */ - $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0); - - /** * The first link we start at is the one from KV into WebAdvisor. * * 1. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL @@ -90,11 +86,13 @@ function calvin_crawl(Semester $semester foreach ($semesters_select_nodes as $semester_node) { if ($semester_node->tagName != 'option' - || !$semester_node->hasAttribute('value')) + || !$semester_node->hasAttribute('value') + || !strlen($semester_node->getAttribute('value'))) continue; $semester_strs[$semester_node->getAttribute('value')] = $semester_node->nodeValue; } + $semester_strs = array_reverse($semester_strs, TRUE); $departments_select_nodes = $departments_dom->getElementById('LIST_VAR1_1')->childNodes; $departments = array(); @@ -125,23 +123,45 @@ function calvin_crawl(Semester $semester $return_url = dom_input_value($departments_dom, 'RETURN.URL'); - /* ARCT only has >=200 level courses */ - $dept = ''; - $course_level = ''; - $semester_str = substr($semester->year_get(), 2) . 
'/'; - switch ($semester->season_get()) + if ($verbosity > 4) + fprintf(STDERR, "Available semesters: %s\n", implode($semester_strs, ', ')); + + $semester_start_uri = $uri; + + $season_map = array( + 'FA' => Semester::SEASON_FALL, + 'IN' => 'interim', + 'SP' => Semester::SEASON_SPRING, + 'MA' => 'may', + /* I don't know if SU is a valid Calvin Semester ID or not */ + 'SU' => Semester::SEASON_SUMMER); + foreach ($semester_strs as $semester_str => $semester_info) { - case Semester::SEASON_SPRING: - $semester_str .= 'SP'; - break; + if (empty($season_map[substr($semester_str, 3)])) + { + fprintf(STDERR, "Warning: Unknown semester identification chars: %s. Skipping this semester.\n", + $semester_str); + continue; + } + $season = $season_map[substr($semester_str, 3)]; + $year_timespec = strptime(substr($semester_str, 0, 2), '%y'); + $year = $year_timespec['tm_year'] + 1900; + + $semester = new Semester($year, $season); - case Semester::SEASON_FALL: - $semester_str .= 'FA'; - break; - } - if (!isset($semester_strs[$semester_str])) - error_log('Couldn\'t find a semester in Calvin\'s database for ' . $semester_str . ' (' . $semester->season_get() . ', ' . $semester->year_get() . ')'); + /* useful and necessary stats */ + $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0); + + $semester_start_min = 0; + $semester_end_max = 0; + $dept = ''; + $course_level = ''; + $uri = $semester_start_uri; + + if ($verbosity) + fprintf(STDERR, "Crawling semester %s->%s\n", + $semester_str, $semester_info); /* * LIST.VAR_: is the column, is the row. There @@ -158,13 +178,11 @@ function calvin_crawl(Semester $semester $form = array('VAR1' => $semester_str, 'LIST.VAR1_1' => $dept, 'LIST.VAR2_1' => $course_level, - ); - /* - * other form items we're not querying but which need to be - * sent blankly - */ - $form += array( + /* + * Other form items we're not querying but which need + * to be sent blankly. 
+ */ 'RETURN.URL' => $return_url, 'SUBMIT_OPTIONS' => '', /* @@ -179,7 +197,7 @@ function calvin_crawl(Semester $semester 'LIST.VAR1_CONTROLLER' => 'LIST.VAR1', 'LIST.VAR1_MEMBERS' => 'LIST.VAR1*LIST.VAR2*LIST.VAR3*LIST.VAR4', - ); + ); foreach (array('1', '2', '3', '4') as $list_col) { $colname = 'LIST.VAR' . $list_col; @@ -339,6 +357,25 @@ function calvin_crawl(Semester $semester $section = new Section($section_id['section'], array(new SectionMeeting($days, $time_start, $time_end, $meeting_place, $meeting_type)), $synonym, $faculty_name); $semester->section_add($section_id['department'], $section_id['course'], $section); + + /* + * Try to update semester's longetivity stats to help the + * school_semester_guess() function: + */ + $date_start_time = strptime($date_start, '%m/%d/%Y'); + $date_end_time = strptime($date_end, '%m/%d/%Y'); + if ($date_start_time !== FALSE) + { + $date_start_time = school_crawl_mktime($date_start_time); + if (!$semester_start_min || $semester_start_min > $date_start_time) + $semester_start_min = $date_start_time; + } + if ($date_end_time !== FALSE) + { + $date_end_time = school_crawl_mktime($date_end_time); + if ($semester_end_max < $date_end_time) + $semester_end_max = $date_end_time; + } } if (!preg_match(';Page ([0-9]+) of ([0-9]+)\$;m', $html, $pages)) @@ -368,6 +405,15 @@ function calvin_crawl(Semester $semester error_log($reason . ': ' . $num); } + $semester->time_end_set($semester_end_max); + $semester->time_start_set($semester_start_min); + + $semesters[] = $semester; + + if ($verbosity) + fprintf(STDERR, "\n"); + } + return 0; } diff --git a/school.d/ccbcmd.crawl.inc b/school.d/ccbcmd.crawl.inc --- a/school.d/ccbcmd.crawl.inc +++ b/school.d/ccbcmd.crawl.inc @@ -29,7 +29,7 @@ * \return * 1 on failure, 0 on success. 
*/ -function ccbcmd_crawl(Semester $semester, $verbosity = 1) +function ccbcmd_crawl(array &$semesters, $verbosity = 1) { $cookies = array(); @@ -49,42 +49,46 @@ function ccbcmd_crawl(Semester $semester return 1; } - $semester_strings = array($semester->year_get(), ucfirst($semester->season_get())); - $semester_value = NULL; - foreach ($semesters_select_node->childNodes as $semesters_option_node) - { - $semester_match = TRUE; - foreach ($semester_strings as $semester_string) - if (stripos($semesters_option_node->textContent, $semester_string) === FALSE) - { - $semester_match = FALSE; - break; - } - if ($semester_match) - { - $semester_value = $semesters_option_node->getAttribute('value'); - break; - } - } + $semester_stage_uri = $uri; - $semester_string = implode(' ', $semester_strings); - if ($semester_value === NULL) - { - fprintf(STDERR, "Could not find the desired semester, ``%s'', in the list of available semesters.\n", - $semester_string); - return 1; - } - - if ($verbosity > 1) - fprintf(STDERR, "Found semester: %s=``%s''=``%s''.\n", - $semester_value, $semester_string, trim($semesters_option_node->textContent)); $semesters_form = school_crawl_element_ancestor($semesters_select_node, 'form'); if ($semesters_form === NULL) { fprintf(STDERR, "Unable to find

associated with semester.\n"); return 1; } - $semesters_post = school_crawl_form($semesters_form); + $semesters_post_save = school_crawl_form($semesters_form); + + foreach ($semesters_select_node->childNodes as $semesters_option_node) + { + $semester_text = $semesters_option_node->textContent; + $semester_value = $semesters_option_node->getAttribute('value'); + if (empty($semester_value)) + /* skip the empty ``None'' semester */ + continue; + + if (stripos($semester_text, 'continuing') !== FALSE) + /* skip the year-long semesters dedicated to continuing education */ + continue; + + $semester_text_parts = explode(' ', $semester_text); + $semester_season = $semester_text_parts[0]; + $semester_year = $semester_text_parts[1]; + + /* the college has two separate summer sessions, so distinguish between them */ + if (preg_match(';session ([0-9]+);i', $semester_text, $matches)) + $semester_season .= '_' . $matches[1]; + + if ($verbosity) + fprintf(STDERR, "Crawling semester %s:%s -> %s.\n", $semester_year, $semester_season, $semester_text); + $semester = new Semester($semester_year, strtolower($semester_season)); + + if ($verbosity > 1) + fprintf(STDERR, "Found semester: %s=``%s''=``%s''.\n", + $semester_value, $semester->id(), trim($semesters_option_node->textContent)); + /* load stored semester-page URI / form data */ + $semesters_post = $semesters_post_save; + $uri = $semester_stage_uri; $semesters_post[$semesters_select_node->getAttribute('name')] = $semester_value; $subjects_dom = new DOMDocument(); @@ -126,6 +130,7 @@ function ccbcmd_crawl(Semester $semester /* there's a boolean column which says whether or not the course has any prerequisites/corequisites.... */ 'credits' => school_crawl_table_resolve_column($tr_header_node, 'credhrs'), /* there's a column for the number of contact hours, vs. 
credit hours */ + 'dates' => school_crawl_table_resolve_column($tr_header_node, 'sessiondates'), ); foreach (array('title', 'days', 'times', 'instructor', 'location') as $column_key) $section_offsets[$column_key] = school_crawl_table_resolve_column($tr_header_node, $column_key); @@ -230,12 +235,23 @@ function ccbcmd_crawl(Semester $semester $section_meetings[] = new SectionMeeting($days, school_crawl_time_format($time_start), school_crawl_time_format($time_end), $children->item($section_offsets['location'])->textContent); + + /* check if a semester's date range should be increased */ + $section_dates = $children->item($section_offsets['dates'])->textContent; + if (preg_match(';^([0-9]+)/([0-9]+)-([0-9]+)/([0-9]+)$;', $section_dates, $section_dates_matches)) + { + $semester->time_start_set_test(mktime(0, 0, 0, $section_dates_matches[1], $section_dates_matches[2], $semester->year_get())); + $semester->time_end_set_test( mktime(0, 0, 0, $section_dates_matches[3], $section_dates_matches[4], $semester->year_get())); + } } $semester->section_add($section_id_parts['department'], $section_id_parts['course'], new Section($section_id_parts['section'], $section_meetings, $registration_number, $instructor)); } + $semesters[] = $semester; + } + return 0; } diff --git a/school.d/cedarville.crawl.inc b/school.d/cedarville.crawl.inc --- a/school.d/cedarville.crawl.inc +++ b/school.d/cedarville.crawl.inc @@ -53,13 +53,8 @@ function table_parse($html) } /** Crawls Cedarville course listings. $season is "fa" or "sp", year is 4-digit year */ -function cedarville_crawl($semester, $verbosity = 1) +function cedarville_crawl(array &$semesters, $verbosity = 1) { - - $season = strtolower(substr($semester->season_get(), 0, 2)); - $year = $semester->year_get(); - $season_string = $year . 
$season; - $basepath = 'http://cedarville.edu/courses/schedule/'; if ($verbosity) @@ -67,12 +62,46 @@ function cedarville_crawl($semester, $ve if ($verbosity > 1) echo "cedarville_crawl(): Determining list of departments.\n"; + + if ($verbosity > 1) + fprintf(STDERR, "cedarville_crawl(): Determining list of semesters.\n"); + $semesters_dom = new DOMDocument(); + $semesters_dom->loadHTML(file_get_contents($basepath)); + + $content_div_dom = $semesters_dom->getElementById('contenttext'); + if (!$content_div_dom) + { + fprintf(STDERR, "cedarville_crawl(): Error finding location of the list of departments.\n"); + return 1; + } + $departments_xpath = new DOMXPath($semesters_dom); + foreach ($departments_xpath->query('.//li/a') as $department_a_dom) + { + $semester_href = $department_a_dom->getAttribute('href'); + $semester_href_parts = split('_', $semester_href); + + $semester_name = $department_a_dom->textContent; + if (stripos($semester_name, 'graduate') !== FALSE + || strpos($semester_href, 'index') === FALSE) + /* cedarville has about 1 graduate course, lol */ + continue; + $semester_name_parts = split(' ', $semester_name); + + $semester_year = $semester_name_parts[0]; + $semester_season = strtolower($semester_name_parts[1]); + + $semester = new Semester($semester_year, $semester_season); + + if ($verbosity > 1) + fprintf(STDERR, "cedarville_crawl(): Crawling semester: %s.\n", + $semester_name); + /* * We need two passes because the first department's code name is * not accessible available in the first pageload. */ $departments = array(); - if (cedarville_crawl_departments_get($basepath . $year . $season . '_index.htm', $departments, $season_string)) + if (cedarville_crawl_departments_get($basepath . 
$semester_href, $departments, $semester_href_parts[0])) return 1; if (!count($departments)) { @@ -80,14 +109,15 @@ function cedarville_crawl($semester, $ve return 1; } /* find the first department whose name we don't yet know */ - if (cedarville_crawl_departments_get($basepath . $year . $season . '_' . current(array_keys($departments)) . '_all.htm', $departments, $season_string)) + if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0])) return 1; $tables = array(); foreach ($departments as $department => $dept_name) { - echo 'cedarville_crawl(): Crawling department ' . $department . ' (' . $dept_name . ")...\n"; - $html = file_get_contents($basepath . $year . $season . '_' . $department . '_' . 'all.htm'); + if ($verbosity > 2) + echo 'cedarville_crawl(): Crawling department ' . $department . ' (' . $dept_name . ")...\n"; + $html = file_get_contents($basepath . $semester_href_parts[0] . '_' . $department . '_' . 'all.htm'); if (!$html) continue; $tables[$department] = table_parse(cedarville_html_fix($html)); @@ -209,6 +239,9 @@ function cedarville_crawl($semester, $ve } } + $semesters[] = $semester; + } + return 0; }