# HG changeset patch # User Nathan Phillip Brink # Date 2011-10-08 01:31:20 # Node ID 3db76bd5a41c1af3ebb86359301c5b4e786256d0 # Parent 40a1382a9358076ffb29dfee42e86aec5b2890c4 Refactor crawling to write out data a semester at a time instead of for all semesters at once, enabling PHP to use a reasonable memory limit. Convert calvin, cedarville, and ccbcmd to the new crawling method. diff --git a/inc/admin.inc b/inc/admin.inc --- a/inc/admin.inc +++ b/inc/admin.inc @@ -152,39 +152,6 @@ function school_cache($schools) 'weight' => $semester_weights ++, 'name' => $semester->name_get(), ); - - $cache_auto_school_semester_dir_name = $cache_auto_school_dir_name . $semester->id() . DIRECTORY_SEPARATOR; - if (!is_dir($cache_auto_school_semester_dir_name)) - { - if (!mkdir($cache_auto_school_semester_dir_name, 0755, TRUE)) - error_log('Unable to create needed directory: `' . $cache_auto_school_semester_dir_name . '\''); - } - - $departments = $semester->departments_get(); - sort($departments); - - $dept_file = fopen($cache_auto_school_semester_dir_name . '-depts', 'wb'); - fwrite($dept_file, serialize($departments)); - fclose($dept_file); - - /* now per-department autocomplete */ - foreach ($departments as $department) - { - $classes = $semester->department_classes_get($department); - $classes_file = fopen($cache_auto_school_semester_dir_name . $department . '.sects', 'wb'); - fwrite($classes_file, serialize($classes)); - fclose($classes_file); - - /* now individual section informations, pre-JSON-ized */ - foreach ($classes as $class) - { - if (!is_dir($cache_auto_school_semester_dir_name . $department)) - mkdir($cache_auto_school_semester_dir_name . $department); - $class_file = fopen($cache_auto_school_semester_dir_name . $department . DIRECTORY_SEPARATOR . 
$class, 'wb'); - fwrite($class_file, json_encode($semester->class_get($department, $class)->to_json_array())); - fclose($class_file); - } - } } /* foreach ( => $semester) */ /* * Store/cache the semester metadata: @@ -260,10 +227,20 @@ function school_cache_semesters_sort (Se */ function school_crawl(array &$school, Page $page = NULL, $verbosity = 1) { + $cache_dir_name = dirname(__FILE__) . DIRECTORY_SEPARATOR . '..' + . DIRECTORY_SEPARATOR . 'cache' . DIRECTORY_SEPARATOR; + $cache_auto_dir_name = $cache_dir_name . 'auto' . DIRECTORY_SEPARATOR; + $cache_auto_school_dir_name = $cache_auto_dir_name . $school['id'] . DIRECTORY_SEPARATOR; + if (!is_dir($cache_auto_school_dir_name)) + { + if (!mkdir($cache_auto_school_dir_name, 0755, TRUE)) + error_log('Unable to create needed directory: `' . $cache_auto_dir_name . '\''); + } + $school['crawled'] = FALSE; - $school_crawl_func = $school['id'] . '_crawl'; - if (!function_exists($school_crawl_func)) + $school_crawl_semesters_list_func = $school['id'] . '_crawl_semester_list'; + if (!function_exists($school_crawl_semesters_list_func)) return; $school_crawl_log_opts = array('verbosity' => $verbosity); @@ -276,20 +253,81 @@ function school_crawl(array &$school, Pa $semesters = array(); if ($verbosity > 0) - school_crawl_logf($school_crawl_log, 2, "Calling crawler..."); + school_crawl_logf($school_crawl_log, 2, "Calling %s()...", $school_crawl_semesters_list_func); - $ret = $school_crawl_func($semesters, $school_crawl_log, $verbosity); + $ret = $school_crawl_semesters_list_func($school, $semesters, $school_crawl_log); if ($ret) { school_crawl_logf($school_crawl_log, 1, "Crawling %s failed: %s() returned nonzero", - $school['id'], $school_crawl_func); + $school['id'], $school_crawl_semesters_list_func); school_crawl_logf($school_crawl_log, 6, ""); return; } + + $successful_semesters = array(); + $school_crawl_semester_func = $school['id'] . 
'_crawl_semester'; + if (!function_exists($school_crawl_semester_func)) + { + school_crawl_logf($school_crawl_log, 3, "%s() is defined but %s() isn't.", + $school_crawl_semesters_list_func, $school_crawl_semester_func); + return; + } + + foreach ($semesters as $semester) + { + school_crawl_logf($school_crawl_log, 2, "Calling %s(%s)...", $school_crawl_semester_func, $semester); + $ret = $school_crawl_semester_func($school, $semester, $school_crawl_log); + if ($ret) + { + school_crawl_logf($school_crawl_log, 1, "Failed to crawl semester %s. Skipping semester.", $semester); + continue; + } + + /* + * Write out this semester's cache now that we're here. + */ + $cache_auto_school_semester_dir_name = $cache_auto_school_dir_name . $semester->id() . DIRECTORY_SEPARATOR; + if (!is_dir($cache_auto_school_semester_dir_name)) + { + if (!mkdir($cache_auto_school_semester_dir_name, 0755, TRUE)) + error_log('Unable to create needed directory: `' . $cache_auto_school_semester_dir_name . '\''); + } + + $departments = $semester->departments_get(); + sort($departments); + + $dept_file = fopen($cache_auto_school_semester_dir_name . '-depts', 'wb'); + fwrite($dept_file, serialize($departments)); + fclose($dept_file); + + /* now per-department autocomplete */ + foreach ($departments as $department) + { + $classes = $semester->department_classes_get($department); + $classes_file = fopen($cache_auto_school_semester_dir_name . $department . '.sects', 'wb'); + fwrite($classes_file, serialize($classes)); + fclose($classes_file); + + /* now individual section informations, pre-JSON-ized */ + foreach ($classes as $class) + { + if (!is_dir($cache_auto_school_semester_dir_name . $department)) + mkdir($cache_auto_school_semester_dir_name . $department); + $class_file = fopen($cache_auto_school_semester_dir_name . $department . DIRECTORY_SEPARATOR . 
$class, 'wb'); + fwrite($class_file, json_encode($semester->class_get($department, $class)->to_json_array())); + fclose($class_file); + } + } + + /* Purge the data written to disk from memory */ + $semester->purge(); + + school_crawl_logf($school_crawl_log, 6, ""); + $successful_semesters[] = $semester; + } + $school['crawled'] = TRUE; - $school['crawled_semesters'] = $semesters; - - school_crawl_logf($school_crawl_log, 6, ""); + $school['crawled_semesters'] = $successful_semesters; return $school_crawl_log; } @@ -319,7 +357,7 @@ function school_cache_recreate($crawl_on foreach ($crawl_only as $crawl_only_school_id) if (!in_array($crawl_only_school_id, $school_id_list)) { - fprintf(STDERR, "error: Invalid school_id specified for crawling: %s", + fprintf(STDERR, "error: Invalid school_id specified for crawling: %s\n", $crawl_only_school_id); return 1; } diff --git a/inc/class.semester.inc b/inc/class.semester.inc --- a/inc/class.semester.inc +++ b/inc/class.semester.inc @@ -369,4 +369,13 @@ class Semester { return array(self::SEASON_SPRING, self::SEASON_SUMMER, self::SEASON_FALL); } + + /** + * \brief + * Clean the semester of all sections, keeping metadata intact. + */ + public function purge() + { + $this->departments = array(); + } } diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc --- a/inc/school.crawl.inc +++ b/inc/school.crawl.inc @@ -126,18 +126,24 @@ function school_crawl_time_format($time) /** * \brief - * Equivalent of mktime() except that it accepts strptime()'s output - * format as an input. + * Equivalent of gmmktime() except that it accepts strptime()'s + * output format as an input. * * \param $tm * An array formatted as the output of strptime(). + * \param $timezone_offset + * Optional offset of the school's timezone in seconds from + * UTC. This offset gets _added_ to the resulting timestamp. So, for + * example, Eastern Daylight Time would use a value of 60*60 * -4 + * since it is -0400 during Daylight time. * \return * A unix timestamp. 
*/ -function school_crawl_mktime(array $tm) +function school_crawl_gmmktime(array $tm, $timezone_offset = 0) { - return mktime($tm['tm_hour'], $tm['tm_min'], $tm['tm_sec'], - $tm['tm_mon'] + 1, $tm['tm_mday'], $tm['tm_year'] + 1900); + return gmmktime($tm['tm_hour'], $tm['tm_min'], $tm['tm_sec'], + $tm['tm_mon'] + 1, $tm['tm_mday'], $tm['tm_year'] + 1900) + + $timezone_offset; } /** diff --git a/school.d/calvin.crawl.inc b/school.d/calvin.crawl.inc --- a/school.d/calvin.crawl.inc +++ b/school.d/calvin.crawl.inc @@ -20,16 +20,26 @@ /** * \brief - * Crawl's Calvin's registration course listing pages. + * Retrieve a list of crawlable semesters from Calvin College. * + * \param $school + * The calvin school handle. * \param $semesters - * An array to be filled with Semester objects which I should - * populate. + * The array to populate with empty Semester objects. * \param $school_crawl_log - * A school_crawl_log handle. + * A school_crawl_log handle for informing the user/developer of + * progress. */ -function calvin_crawl(array &$semesters, &$school_crawl_log) +function calvin_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log) { + $season_map = array( + 'FA' => Semester::SEASON_FALL, + 'IN' => 'interim', + 'SP' => Semester::SEASON_SPRING, + 'MA' => 'may', + /* I don't know if SU is a valid Calvin Semester ID or not */ + 'SU' => Semester::SEASON_SUMMER); + /** * The first link we start at is the one from KV into WebAdvisor. 
* @@ -48,95 +58,68 @@ function calvin_crawl(array &$semesters, */ $cookies = array(); + $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL'; + $semesters_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log); - $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL'; - $departments_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log); - - $departments_dom = new DOMDocument(); - $departments_dom->loadHTML($departments_html); + $semesters_dom = new DOMDocument(); + $semesters_dom->loadHTML($semesters_html); /* * Discover the available semesters */ - $semesters_select_nodes = $departments_dom->getElementById('VAR1')->childNodes; - $semester_strs = array(); + $semesters_var1 = $semesters_dom->getElementById('VAR1'); + if (empty($semesters_var1)) + { + school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters."); + return 1; + } + $semesters_select_nodes = $semesters_var1->childNodes; foreach ($semesters_select_nodes as $semester_node) { if ($semester_node->tagName != 'option' || !$semester_node->hasAttribute('value') || !strlen($semester_node->getAttribute('value'))) continue; - $semester_strs[$semester_node->getAttribute('value')] = - $semester_node->nodeValue; - } - $semester_strs = array_reverse($semester_strs, TRUE); - - $departments_select_nodes = $departments_dom->getElementById('LIST_VAR1_1')->childNodes; - $departments = array(); - foreach ($departments_select_nodes as $dept_node) - { - if ($dept_node->tagName != 'option' - || !$dept_node->hasAttribute('value')) - continue; - $departments[$dept_node->getAttribute('value')] = - $dept_node->nodeValue; - } - - /* - * get all of the different possible course levels... dynamically - * rather than hardcodedly ;-). 
- */ - $departments_select_nodes = $departments_dom->getElementById('LIST_VAR1_2')->childNodes; - $course_levels = array(); - foreach ($departments_select_nodes as $courselevel_node) - { - if ($courselevel_node->tagName != 'option' - || !$courselevel_node->hasAttribute('value')) - continue; - $course_levels[] = $courselevel_node->getAttribute('value'); - } + $semester_str = $semester_node->getAttribute('value'); - $return_url = dom_input_value($departments_dom, 'RETURN.URL'); - - - school_crawl_logf($school_crawl_log, 7, "Available semesters: %s.", implode($semester_strs, ', ')); - - $semester_start_uri = $uri; - - $season_map = array( - 'FA' => Semester::SEASON_FALL, - 'IN' => 'interim', - 'SP' => Semester::SEASON_SPRING, - 'MA' => 'may', - /* I don't know if SU is a valid Calvin Smester ID or not */ - 'SU' => Semester::SEASON_SUMMER); - foreach ($semester_strs as $semester_str => $semester_info) - { if (empty($season_map[substr($semester_str, 3)])) { school_crawl_logf($school_crawl_log, 6, "Warning: Unknown semester identification chars: %s. Skipping this semester.", - $semester_str); + $semester_str); continue; } $season = $season_map[substr($semester_str, 3)]; $year_timespec = strptime(substr($semester_str, 0, 2), '%y'); - $year = $year_timespec['tm_year'] + 1900; + $year = $year_timespec['tm_year'] + 1900; $semester = new Semester($year, $season); + $semesters[$semester_str] = $semester; + } + $semester = array_reverse($semesters, TRUE); - /* useful and necessary stats */ - $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0); - - $semester_start_min = 0; - $semester_end_max = 0; + return 0; +} - $dept = ''; - $course_level = ''; - $uri = $semester_start_uri; - - school_crawl_logf($school_crawl_log, 6, "Crawling semester %s->%s.", - $semester_str, $semester_info); +/** + * \brief + * Crawl the courses for a semester from Calvin College. + * + * \param $school + * The calvin school handle. 
+ * \param $semester + * The Semester object to populate with courses. + * \param $school_crawl_log + * The logger handle. + */ +function calvin_crawl_semester(array $school, Semester $semester, &$school_crawl_log) +{ + $cookies = array(); + $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL'; + $html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log); + $seed_dom = new DOMDocument(); + $seed_dom->loadHTML($html); + $return_url = dom_input_value($seed_dom, 'RETURN.URL'); /* * LIST.VAR_: is the column, is the row. There @@ -150,9 +133,12 @@ function calvin_crawl(array &$semesters, * LIST.VAR4: I forget * */ + $semester_str = sprintf("%02d/%s", $semester->year_get() % 100, strtoupper(substr($semester->season, 0, 2))); + school_crawl_logf($school_crawl_log, 6, 'Using %s for a semester string.', + $semester_str); $form = array('VAR1' => $semester_str, - 'LIST.VAR1_1' => $dept, - 'LIST.VAR2_1' => $course_level, + 'LIST.VAR1_1' => '', + 'LIST.VAR2_1' => '', /* * Other form items we're not querying but which need @@ -216,10 +202,14 @@ function calvin_crawl(array &$semesters, $form['VAR' . $day] = ''; */ + $semester_start_min = 0; + $semester_end_max = 0; + + $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0); /* * pages is populated by preg_match() below after the first looping. */ - $pages = array(1 => 0, 2=> 1); + $pages = array(1 => 0, 2 => 1); while ($pages[1] < $pages[2]) { $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, $form)); @@ -243,7 +233,7 @@ function calvin_crawl(array &$semesters, } /* - * the same info below should be gettable with + * The same info below should be retrievable with * dom_id_content($results_dom, 'SEC_FACULTY_INFO_' . $list_row); */ $faculty_name = dom_input_value($results_dom, 'SEC.FACULTY.INFO_' . 
$list_row); @@ -396,13 +386,16 @@ function calvin_crawl(array &$semesters, $date_end_time = strptime($date_end, '%m/%d/%Y'); if ($date_start_time !== FALSE) { - $date_start_time = school_crawl_mktime($date_start_time); + $date_start_time = school_crawl_gmmktime($date_start_time, -5 * 60*60); if (!$semester_start_min || $semester_start_min > $date_start_time) - $semester_start_min = $date_start_time; + { + school_crawl_logf($school_crawl_log, 1, "Using section %s for the minimum start time.", $section_id['department'] . '-' . $section_id['course'] . '-' . $section_id['section']); + $semester_start_min = $date_start_time; + } } if ($date_end_time !== FALSE) { - $date_end_time = school_crawl_mktime($date_end_time); + $date_end_time = school_crawl_gmmktime($date_end_time, -5 * 60*60); if ($semester_end_max < $date_end_time) $semester_end_max = $date_end_time; } @@ -432,29 +425,24 @@ function calvin_crawl(array &$semesters, school_crawl_logf($school_crawl_log, 7, "%s: %d", $reason, $num); } - $semester->time_end_set($semester_end_max); - $semester->time_start_set($semester_start_min); + $semester->time_end_set($semester_end_max); + $semester->time_start_set($semester_start_min); - /* - * Calculate lab-based course dependencies. - */ - school_crawl_logf($school_crawl_log, 7, 'Adding implicit lab dependencies.'); - foreach ($semester->departments_get() as $department) - foreach ($semester->department_classes_get($department) as $course) - { - $the_course = $semester->class_get($department, $course); - $lab_course = $semester->class_get($department, $course . 'L'); - if (!empty($lab_course)) - { - $the_course->dependency_add($lab_course); - school_crawl_logf($school_crawl_log, 8, "Adding dependency of %s-%s for %s-%s.", - $department, $course . 'L', $department, $course); - } + /* + * Calculate lab-based course dependencies. 
+ */ + school_crawl_logf($school_crawl_log, 7, 'Adding implicit lab dependencies.'); + foreach ($semester->departments_get() as $department) + foreach ($semester->department_classes_get($department) as $course) + { + $the_course = $semester->class_get($department, $course); + $lab_course = $semester->class_get($department, $course . 'L'); + if (!empty($lab_course)) + { + $the_course->dependency_add($lab_course); + school_crawl_logf($school_crawl_log, 8, "Adding dependency of %s-%s for %s-%s.", + $department, $course . 'L', $department, $course); } - - $semesters[] = $semester; - - school_crawl_logf($school_crawl_log, 6, ""); } return 0; diff --git a/school.d/ccbcmd.crawl.inc b/school.d/ccbcmd.crawl.inc --- a/school.d/ccbcmd.crawl.inc +++ b/school.d/ccbcmd.crawl.inc @@ -18,6 +18,282 @@ * along with slate_permutate. If not, see <http://www.gnu.org/licenses/>. */ +define('CCBCMD_CRAWL_URI', 'http://ccbcmd.edu/schedule/sched.html'); + +/** + * \brief + * Obtain list of crawlable semesters offered by CCBCMD. + * + * \param $school + * The CCBCMD school handle. + * \param $semesters + * Array to populate with available semesters. + * \return + * 0 on success. + */ +function ccbcmd_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log) +{ + $cookies = array(); + + /* + * It seems that http://ccbcmd.edu/schedule/sched.html is what we're + * meant to start from. That's just a redirect to some other page + * from which we get a listing of available semesters and choose + * one. 
+ */ + $uri = CCBCMD_CRAWL_URI; + $semesters_dom = new DOMDocument(); + $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, TRUE, 'ccbcmd_crawl_curlhook')); + $semesters_select_node = $semesters_dom->getElementById('term_input_id'); + if ($semesters_select_node === NULL) + { + school_crawl_logf($school_crawl_log, 0, "Could not get list of available semesters to choose from."); + return 1; + } + + foreach ($semesters_select_node->childNodes as $semesters_option_node) + { + $semester_text = $semesters_option_node->textContent; + $semester_value = $semesters_option_node->getAttribute('value'); + if (empty($semester_value)) + /* skip the empty ``None'' semester */ + continue; + + if (stripos($semester_text, 'continuing') !== FALSE) + /* skip the year-long semesters dedicated to continuing education */ + continue; + + list($semester_season, $semester_year) = explode(' ', $semester_text); + + /* the college has two separate summer sessions, so distinguish between them */ + if (preg_match(';session ([0-9]+);i', $semester_text, $matches)) + $semester_season .= '_' . $matches[1]; + + $semesters[] = new Semester($semester_year, strtolower($semester_season)); + } + + return 0; +} + +/** + * \brief + * Crawl a CCBCMD semester. + * + * \param $school + * The CCBCMD school handle. + * \param $semester + * The semester to fill with courses. 
+ */ +function ccbcmd_crawl_semester($school, $semester, &$school_crawl_log) +{ + $cookies = array(); + $uri = CCBCMD_CRAWL_URI; + $semesters_dom = new DOMDocument(); + $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, TRUE, 'ccbcmd_crawl_curlhook')); + $semesters_select_node = $semesters_dom->getElementById('term_input_id'); + if (empty($semesters_select_node)) + { + school_crawl_logf($school_crawl_log, 0, "Could not locate the list of semesters from which to choose."); + return 1; + } + + $semesters_form = school_crawl_element_ancestor($semesters_select_node, 'form'); + if ($semesters_form === NULL) + { + school_crawl_logf($school_crawl_log, 0, "Unable to find
associated with semester."); + return 1; + } + $semesters_post = school_crawl_form($semesters_form); + + $semester_found = FALSE; + foreach ($semesters_select_node->childNodes as $semesters_option_node) + { + $semester_text = $semesters_option_node->textContent; + $semester_value = $semesters_option_node->getAttribute('value'); + if (empty($semester_value)) + continue; + + list($semester_season, $semester_year) = explode(' ', $semester_text); + if (preg_match(';session ([0-9]+);i', $semester_text, $matches)) + $semester_season .= '_' . $matches[1]; + $semester_season = strtolower($semester_season); + + if ($semester_year == $semester->year_get() + && $semester_season == $semester->season_get()) + { + $semester_found = TRUE; + break; + } + } + if (!$semester_found) + { + school_crawl_logf($school_crawl_log, 1, "Unable to find the entry for semester %s.", $semester); + return 1; + } + + $semesters_post[$semesters_select_node->getAttribute('name')] = $semester_value; + + $subjects_dom = new DOMDocument(); + $uri = school_crawl_url($uri, $semesters_form->getAttribute('action')); + $subjects_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $semesters_post, TRUE, 'ccbcmd_crawl_curlhook')); + + $subjects_form_nodelist = $subjects_dom->getElementsByTagName('form'); + if (!$subjects_form_nodelist->length) + { + school_crawl_logf($school_crawl_log, 0, "Unable to find to submit for the subjects-choosing page."); + return 1; + } + $subjects_form_node = $subjects_form_nodelist->item(0); + $subjects_post = school_crawl_form($subjects_form_node); + + $subjects_select_node = $subjects_dom->getElementById('subj_id'); + foreach ($subjects_select_node->childNodes as $subjects_option_node) + if (!strcasecmp('all', trim($subjects_option_node->textContent))) + $subjects_post[$subjects_select_node->getAttribute('name')][] = $subjects_option_node->getAttribute('value'); + + $courses_dom = new DOMDocument(); + $uri = school_crawl_url($uri, 
$subjects_form_node->getAttribute('action')); + $courses_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $subjects_post, TRUE, 'ccbcmd_crawl_curlhook')); + + $courses_xpath = new DOMXPath($courses_dom); + + /* The second row of the table has all of the headers in it */ + $tr_header_nodelist = $courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr[position()=2]'); + if (!$tr_header_nodelist->length) + { + school_crawl_logf($school_crawl_log, 0, "Unable to find the row of the course/section data table which gives us the mappings of column names onto columns."); + return 1; + } + $tr_header_node = $tr_header_nodelist->item(0); + + $section_offsets = array( + 'registration_number' => school_crawl_table_resolve_column($tr_header_node, 'CRN'), + 'section_id' => school_crawl_table_resolve_column($tr_header_node, 'subj/crse/sec'), + /* there's a boolean column which says whether or not the course has any prerequisites/corequisites.... */ + 'credits' => school_crawl_table_resolve_column($tr_header_node, 'credhrs'), + /* there's a column for the number of contact hours, vs. 
credit hours */ + 'dates' => school_crawl_table_resolve_column($tr_header_node, 'sessiondates'), + ); + foreach (array('title', 'days', 'times', 'instructor', 'location') as $column_key) + $section_offsets[$column_key] = school_crawl_table_resolve_column($tr_header_node, $column_key); + /* there's also a column for ``session dates'' */ + + /* error check and calculate the number of children that a node must have to be */ + $max_offset = 0; + foreach ($section_offsets as $name => $value) + { + if ($value === FALSE) + { + school_crawl_logf($school_crawl_log, 0, "Unable to find column offset for `%s'.", + $name); + return 1; + } + else + school_crawl_logf($school_crawl_log, 9, "%s -> %s", $name, $value); + + $max_offset = max($max_offset, $value); + } + + foreach ($courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr') as $tr_node) + { + $children = school_crawl_table_rownodes($tr_node); + if ($children->length < $max_offset) + /* + * Skip this row because it doesn't have all of the columns we + * want and thus it can't be a row containing information + * about a section. + */ + continue; + if (!strcmp($children->item($section_offsets['section_id'])->tagName, 'th')) + /* + * We've hit one of the <tr>s filled with <th>s. Skip this one. + */ + continue; + + /* + * There are some rows with the time set to TBA and with empty + * section_id columns. Respond to this by skipping empty + * section_id columns since there's no useful data in these + * rows. 
We use strlen() < 3 because trim() doesn't take care of + * &nbsp; :-/ + */ + $section_id = trim($children->item($section_offsets['section_id'])->textContent); + if (strlen($section_id) < 3) + continue; + + $section_id_parts = Section::parse($section_id); + + $registration_number = $children->item($section_offsets['registration_number'])->textContent; + $instructor = $children->item($section_offsets['instructor'])->textContent; + + $section_meetings = array(); + { + $time_range_text = $children->item($section_offsets['times'])->textContent; + if (strpos($time_range_text, 'TBA') !== FALSE) + { + /* + * Add the section to the autocomplete list, just without + * any meeting info (i.e., $section_meetings is still + * empty now). + */ + $semester->section_add($section_id_parts['department'], $section_id_parts['course'], + new Section($section_id_parts['section'], $section_meetings, $registration_number)); + continue; + + } + if (($dash_pos = strpos($time_range_text, '-')) === FALSE) + { + school_crawl_logf($school_crawl_log, 0, "Unable to understand course's time range format, cannot find dash: ``%s''.", + $time_range_text); + return 1; + } + + $time_start_text = substr($time_range_text, 0, $dash_pos); + $time_start = strptime($time_start_text, '%I:%M %p'); + $time_end_text = substr($time_range_text, $dash_pos + 1); + /* + * Make sure that _only_ one date range is specified to ensure + * data integrity. I.e., make sure that the college doesn't + * suddenly support multiple meeting times without our + * anticipating that and then cause us to have invalid + * data. ;-). 
--binki + */ + if (strpos($time_end_text, '-') !== FALSE) + { + school_crawl_logf($school_crawl_log, 0, "College seems to support multiple meeting times per semester which we don't know how to parse (even though slate_permutate itself can handle this situation): ``%s'' time_end_text: ``%s''.", + $time_range_text, $time_end_text); + return 1; + } + $time_end = strptime($time_end_text, '%I:%M %p'); + if ($time_end === FALSE || $time_start === FALSE) + { + school_crawl_logf($school_crawl_log, 0, "Error parsing start or end time: start: ``%s'' end: ``%s''.", + $time_start_text, $time_end_text); + return 1; + } + + $days = school_crawl_days_str_format($school_crawl_log, $children->item($section_offsets['days'])->textContent); + + $section_meetings[] = new SectionMeeting($days, school_crawl_time_format($time_start), school_crawl_time_format($time_end), + $children->item($section_offsets['location'])->textContent, + $instructor); + + /* check if a semester's date range should be increased */ + $section_dates = $children->item($section_offsets['dates'])->textContent; + if (preg_match(';^([0-9]+)/([0-9]+)-([0-9]+)/([0-9]+)$;', $section_dates, $section_dates_matches)) + { + $semester->time_start_set_test(gmmktime(0, 0, 0, $section_dates_matches[1], $section_dates_matches[2], $semester->year_get())); + $semester->time_end_set_test(gmmktime(0, 0, 0, $section_dates_matches[3], $section_dates_matches[4], $semester->year_get())); + } + } + + $semester->section_add($section_id_parts['department'], $section_id_parts['course'], + new Section($section_id_parts['section'], $section_meetings, $registration_number)); + } + + return 0; +} + /** * \brief * Crawl CCBCMD's registration stuffage. diff --git a/school.d/cedarville.crawl.inc b/school.d/cedarville.crawl.inc --- a/school.d/cedarville.crawl.inc +++ b/school.d/cedarville.crawl.inc @@ -52,81 +52,107 @@ function table_parse($html) return $arr; } -/** Crawls Cedarville course listings. 
$season is "fa" or "sp", year is 4-digit year */ -function cedarville_crawl(array &$semesters, &$school_crawl_log) -{ - $basepath = 'http://cedarville.edu/courses/schedule/'; - - school_crawl_logf($school_crawl_log, 6, "Beginning crawl of Cedarville:"); - - school_crawl_logf($school_crawl_log, 7, "Determining list of departments."); +define('CEDARVILLE_BASE_URI', 'http://cedarville.edu/courses/schedule/'); +define('CEDARVILLE_TIMEZONE_OFFSET', 60*60 * -4); - school_crawl_logf($school_crawl_log, 8, "Determining list of semesters."); - $semesters_dom = new DOMDocument(); - $semesters_dom->loadHTML(file_get_contents($basepath)); - - $content_div_dom = $semesters_dom->getElementById('contenttext'); - if (!$content_div_dom) +/** + * \brief + * Obtain the list of crawlable semesters offered by Cedarville. + * + * \param $school + * The school's info array/handle. + * \param $semesters + * An array to insert the semesters into. + * \return + * 0 on success. + */ +function cedarville_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log) +{ + $uri = CEDARVILLE_BASE_URI; + $cookies = array(); + $html = school_crawl_geturi($uri, $cookies, $school_crawl_log); + if (empty($html)) { - school_crawl_logf($school_crawl_log, 6, "Error finding location of the list of departments."); - if (count($semesters)) - { - school_crawl_logf($school_crawl_log, 6, "Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached."); - return 0; - } - school_crawl_logf($school_crawl_log, 0, "Couldn't find any departments."); + school_crawl_logf($school_crawl_log, 1, "Unable to fetch %s.", CEDARVILLE_BASE_URI); return 1; } + + $semesters_dom = new DOMDocument(); + $semesters_dom->loadHTML($html); + $departments_xpath = new DOMXPath($semesters_dom); - foreach ($departments_xpath->query('.//li/a') as $department_a_dom) + $have_semesters = FALSE; + foreach ($departments_xpath->query('//*[@id="contenttext"]//li/a') as 
$department_a_dom) { $semester_href = $department_a_dom->getAttribute('href'); - $semester_href_parts = split('_', $semester_href); + $semester_href_parts = explode('_', $semester_href); $semester_name = $department_a_dom->textContent; if (stripos($semester_name, 'graduate') !== FALSE || strpos($semester_href, 'index') === FALSE) /* cedarville has about 1 graduate course, lol */ continue; - $semester_name_parts = split(' ', $semester_name); + + $semester_name_parts = explode(' ', $semester_name); $semester_year = $semester_name_parts[0]; $semester_season = strtolower($semester_name_parts[1]); - $semester_min_date_start = 0; - $semester_max_date_end = 0; - $semester = new Semester($semester_year, $semester_season); - - school_crawl_logf($school_crawl_log, 6, "Crawling semester: %s.", - $semester_name); + $semesters[] = new Semester($semester_year, $semester_season); + $have_semesters = TRUE; + } /* - * We need two passes because the first department's code name is - * not accessible available in the first pageload. + * Prime cedarville_semester_uri()'s cache to have one fewer page + * load. + */ + cedarville_semester_uri(NULL, $school_crawl_log, $semesters_dom); + + return $have_semesters ? 0 : 1; +} + +/** + * \brief + * Crawl a given Cedarville semester. + * + * \param $school + * The school handle. + * \param $semester + * The semester to populate with courses. + */ +function cedarville_crawl_semester(array $school, Semester $semester, &$school_crawl_log) +{ + $semester_uri = cedarville_semester_uri($semester, $school_crawl_log); + if (empty($semester_uri)) + return 1; + list($season_string) = explode('_', $semester_uri); + + /* + * Two passes are needed to determine the listing of departments + * because the first department's code name is not + * available in the first pageload. */ $departments = array(); - if (cedarville_crawl_departments_get($basepath . 
$semester_href, $departments, $semester_href_parts[0], $school_crawl_log)) + if (cedarville_crawl_departments_get(CEDARVILLE_BASE_URI . $semester_uri, $departments, $season_string, $school_crawl_log)) return 1; if (!count($departments)) { - school_crawl_logf($school_crawl_log, 6, "Unable to get a listing of departments."); - if (count($semesters)) - { - school_crawl_logf($school_crawl_log, 6, "Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached."); - return 0; - } - school_crawl_logf($school_crawl_log, 0, "Unable to get listing of departments."); + school_crawl_logf($school_crawl_log, 2, "Unable to get a listing of departments."); return 1; } + /* find the first department whose name we don't yet know */ - if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0], $school_crawl_log)) + if (cedarville_crawl_departments_get(CEDARVILLE_BASE_URI . $season_string . '_' . current(array_keys($departments)) . '_all.htm', $departments, $season_string, $school_crawl_log)) return 1; $tables = array(); + $cookies = array(); foreach ($departments as $department => $dept_name) { school_crawl_logf($school_crawl_log, 7, "Crawling department %s (%s).", $department, $dept_name); - $html = file_get_contents($basepath . $semester_href_parts[0] . '_' . $department . '_' . 'all.htm'); + + $uri = CEDARVILLE_BASE_URI . $season_string . '_' . $department . 
'_all.htm';
+          $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
           if (!$html)
             continue;
           $tables[$department] = table_parse(cedarville_html_fix($html));
@@ -239,8 +265,8 @@ function cedarville_crawl(array &$semest
                   /* check for daterange information -- i.e., if the first regex successfully matched: */
                   if (count($meeting_matches) > 7)
                     {
-                      $date_start = school_crawl_mktime(strptime($meeting_matches[6], '%m/%d/%y'));
-                      $date_end = school_crawl_mktime(strptime($meeting_matches[7], '%m/%d/%y'));
+                      $date_start = school_crawl_gmmktime(strptime($meeting_matches[6], '%m/%d/%y'), CEDARVILLE_TIMEZONE_OFFSET);
+                      $date_end = school_crawl_gmmktime(strptime($meeting_matches[7], '%m/%d/%y'), CEDARVILLE_TIMEZONE_OFFSET);
                       if (!empty($date_start) && !empty($date_end))
                         {
                           $semester->time_start_set_test($date_start);
@@ -258,10 +284,86 @@ function cedarville_crawl(array &$semest
             }
         }
 
-      $semesters[] = $semester;
+  return 0;
+}
+
+/**
+ * \brief
+ *   Look up the URI used to access information about a particular
+ *   Cedarville semester.
+ *
+ * The semester listing page is fetched and parsed only while the
+ * function-static lookup table is empty; once it holds at least one
+ * entry, later calls are answered from that table.
+ *
+ * \param $semester
+ *   The semester whose URI is being retrieved.
+ * \param $school_crawl_log
+ *   The crawl log handle, handed to school_crawl_geturi() when the
+ *   listing page has to be fetched.
+ * \param $document
+ *   Optional DOMDocument of the Cedarville semester listing page, to
+ *   aid seeding the cache. To prime the cache, just set $semester to
+ *   NULL and pass in $document.
+ * \return
+ *   The URI for that semester's courses relative to
+ *   CEDARVILLE_BASE_URI, or NULL when priming, when the listing
+ *   page cannot be fetched, or when the semester is not listed.
+ */
+function cedarville_semester_uri(Semester $semester = NULL, &$school_crawl_log, DOMDocument $document = NULL)
+{
+  static $semester_to_uri = array();
+
+  if (empty($semester_to_uri))
+    {
+      if (empty($document))
+        {
+          $uri = CEDARVILLE_BASE_URI;
+          $cookies = array();
+          $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
+          if (empty($html))
+            return NULL;
+
+          $document = new DOMDocument();
+          $document->loadHTML($html);
+        }
+
+      $departments_xpath = new DOMXPath($document);
+      foreach ($departments_xpath->query('//*[@id="contenttext"]//li/a') as $department_a_dom)
+        {
+          $semester_href = $department_a_dom->getAttribute('href');
+          $semester_name = $department_a_dom->textContent;
+
+          /*
+           * Keep this filter identical to the one in
+           * cedarville_crawl_semester_list(): an entry it skips (e.g.
+           * a graduate listing) could otherwise parse to the same
+           * year/season and overwrite the URI of a crawlable semester.
+           */
+          if (stripos($semester_name, 'graduate') !== FALSE
+              || strpos($semester_href, 'index') === FALSE)
+            continue;
+
+          $semester_name_parts = explode(' ', $semester_name);
+          $semester_year = $semester_name_parts[0];
+          /* Link text without a season must not raise a notice. */
+          $semester_season = isset($semester_name_parts[1])
+            ? strtolower($semester_name_parts[1]) : '';
+
+          $semester_to_uri += array($semester_year => array());
+          $semester_to_uri[$semester_year][$semester_season] = $semester_href;
+        }
     }
 
-  return 0;
+  if (empty($semester))
+    return NULL;
+
+  $year = $semester->year_get();
+  $season = $semester->season_get();
+  if (empty($semester_to_uri[$year][$season]))
+    return NULL;
+
+  return $semester_to_uri[$year][$season];
 }
 
 /**
@@ -274,7 +376,8 @@ function cedarville_crawl(array &$semest
  */
 function cedarville_crawl_departments_get($dept_url, array &$departments, $season_string, $school_crawl_log)
 {
-  $html = file_get_contents($dept_url);
+  $cookies = array();
+  $html = school_crawl_geturi($dept_url, $cookies, $school_crawl_log);
   $dept_dom = new DOMDocument();
   if (!$dept_dom->loadHTML(cedarville_html_fix($html)))
     {