# HG changeset patch # User Ethan Zonca # Date 2011-01-31 21:43:02 # Node ID 5eef21a8b8319fe36c1e1c5fbd6eb06504798e2f # Parent 415cc2772379b78b1e186e68aa5a0e575dca8d3d # Parent bd5fe413d18405b9aa34be5717a9fbf81d18d742 Merge diff --git a/school.d/cedarville.inc b/school.d/cedarville.inc --- a/school.d/cedarville.inc +++ b/school.d/cedarville.inc @@ -99,18 +99,35 @@ function cedarville_crawl($semester, $ve $season = strtolower(substr($semester->season_get(), 0, 2)); $year = $semester->year_get(); + $season_string = $year . $season; - /* Current academic departments. Update as needed. */ - $departments = array('ad', 'be','ba','ca','ed','eg','es','hg','id','ll','ms','mu','ns','ph','py','sm','sw'); - $basepath = "http://cedarville.edu/courses/schedule/"; + $basepath = 'http://cedarville.edu/courses/schedule/'; + + if ($verbosity) + echo "cedarville_crawl(): Beginning crawl of Cedarville:\n"; - echo "cedarville_crawl(): Beginning crawl of Cedarville:\n"; + if ($verbosity > 1) + echo "cedarville_crawl(): Determining list of departments.\n"; + /* + * We need two passes because the first department's code name is + * not available in the first pageload. + */ + $departments = array(); + if (cedarville_crawl_departments_get($basepath . $year . $season . '_index.htm', $departments, $season_string)) + return 1; + if (!count($departments)) + { + echo "cedarville_crawl(): Unable to get a listing of departments.\n"; + return 1; + } + /* find the first department whose name we don't yet know */ + if (cedarville_crawl_departments_get($basepath . $year . $season . '_' . current(array_keys($departments)) . '_all.htm', $departments, $season_string)) + return 1; - $season = strtolower($season); $tables = array(); - foreach($departments as $department) + foreach ($departments as $department => $dept_name) { - echo "cedarville_crawl(): Crawling department \"$department\"...\n"; + echo 'cedarville_crawl(): Crawling department ' . $department . ' (' . $dept_name . 
")...\n"; $html = file_get_contents($basepath . $year . $season . '_' . $department . '_' . 'all.htm'); if (!$html) continue; @@ -213,43 +230,22 @@ function cedarville_crawl($semester, $ve /* prepare for parsing the next meeting time */ $meetings_str = substr($meetings_str, strlen($meeting_matches[0])); - if (isset($meetings[$meeting_matches[1]])) - { - if ($verbosity > 0 && !isset($meeting_multiple_types[$meeting_matches[1]])) - { - error_log('Section ' . implode('-', $section_parts) - . ' has multiple meeting times for meeting_type of ' - . $meeting_matches[1] . ' which my unflexible code which' - . ' could be made more flexible doesn\'t yet support.' - . ' Skipping the extra meeting times for this type of meeting.'); - /* only give the above error once per type. */ - $meeting_multiple_types[$meeting_matches[1]] = TRUE; - } - continue; - } + $days = school_crawl_days_str_format($meeting_matches[3]); + $time_start = school_crawl_time_format(strptime($meeting_matches[4] . 'M', '%I:%M%p')); + $time_end = school_crawl_time_format(strptime($meeting_matches[5] . 'M', '%I:%M%p')); + $room = $meeting_matches[2]; - $meetings[$meeting_matches[1]] - = array('room' => $meeting_matches[2], - 'days' => school_crawl_days_str_format($meeting_matches[3]), - 'time_start' => school_crawl_time_format(strptime($meeting_matches[4] . 'M', '%I:%M%p')), - 'time_end' => school_crawl_time_format(strptime($meeting_matches[5] . 
'M', '%I:%M%p')), - 'type' => $meeting_matches[1], - ); + $type = $meeting_matches[1]; + while (isset($meeting_type_maps[$type])) + $type = $meeting_type_maps[$type]; + $type = strtolower($type); + + $meetings[] = new SectionMeeting($days, $time_start, $time_end, + $room, $type); } - $section_meetings = array(); - foreach ($meetings as $meeting) - { - $meeting_type = $meeting['type']; - if (isset($meeting_type_maps[$meeting_type])) - $meeting_type = $meeting_type_maps[$meeting_type]; - - $section_meetings[] = new SectionMeeting($meeting['days'], $meeting['time_start'], - $meeting['time_end'], $meeting['room'], - $meeting_type); - } $semester->section_add($section_parts['department'], $section_parts['course'], - new Section($section_parts['section'], $section_meetings, + new Section($section_parts['section'], $meetings, $synonym, $instructor)); } } @@ -259,10 +255,49 @@ function cedarville_crawl($semester, $ve /** * \brief + * Scan cedarville's course listing pages for departments. + * + * \return + * 0 on success, 1 on failure. On success, $departments gains entries + * mapping department codes onto department friendly names. + */ +function cedarville_crawl_departments_get($dept_url, array &$departments, $season_string) +{ + $html = file_get_contents($dept_url); + $dept_dom = new DOMDocument(); + if (!$dept_dom->loadHTML(cedarville_html_fix($html))) + { + echo "cedarville_crawl(): Error determining list of available departments: Unable to parse HTML.\n"; + return 1; + } + $xpath = new DOMXPath($dept_dom); + + $dept_node_list = $xpath->query('/descendant::div[@id="contenttext"]/child::span[position()=1 or position()=2]/child::a'); + foreach ($dept_node_list as $dept_node) + { + $href = $dept_node->getAttribute('href'); + if (!preg_match('/^' . preg_quote($season_string, '/') . '_([a-z]+)_[a-z]+\.htm$/', $href, $matches)) + { + echo 'cedarvillege_crawl(): Error determining list of available departments: Unable to parse the department string out of href="' . $href . 
"\".\n"; + return 1; + } + + $dept = $matches[1]; + $departments[$dept] = $dept_node->textContent; + } + + return 0; +} + +/** + * \brief * Fix some incorrect usage of the HTML entity delimiter, the ampersand. */ function cedarville_html_fix($html) { $html = preg_replace('/&&/', '&&', $html); - return preg_replace('/&([^;]{5})/', '&$1', $html); + $html = preg_replace('/&([^;]{5})/', '&$1', $html); + $html = preg_replace('/ID="(LINKS|HERE)"/', '', $html); + + return $html; }