# HG changeset patch # User Nathan Phillip Brink # Date 2010-10-16 20:24:58 # Node ID 1d417d9e6bb36b73fabdf90d132fc00be252d239 # Parent 1d8c169ca47861b7bd96945862812f59c7b2e692 Cedarville crawler converted to use the generalized crawling interface; it now populates the Semester object with actual section data. Cedarville has some strange course constraints, however, which would require changes to the core algorithm. diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc --- a/inc/school.crawl.inc +++ b/inc/school.crawl.inc @@ -54,14 +54,15 @@ function school_crawl_time_format($time) * An array of day names. These may be common abbreviations or * truncations (any truncations must be two chars long for * simplicity. One-char representations are supported, however, but - * use 'm', 't', 'w', 'h', 'f' to distinguish thursday and - * friday). Case does not matter. + * use 'm', 't', 'w', 'h', 'f' to distinguish Thursday and + * Tuesday. 'r' may also be used for Thursday.). Case does not + * matter. * \return * slate_permutate's strange internal days representation. */ function school_crawl_days_format($days) { - static $daymap_1 = array('m' => 1, 't' => 2, 'w' => 3, 'h' => 4, 'f' => 5); + static $daymap_1 = array('m' => 1, 't' => 2, 'w' => 3, 'h' => 4, 'r' => 4, 'f' => 5); static $daymap_2 = array('th' => 'h'); $my_days = array(); @@ -94,3 +95,21 @@ function school_crawl_days_format($days) return $day_str; } + +/** + * \brief + * Take a string of day initials and format it. + * + * \param $days_str + * Example input: 'mwf', 'TR'. 
+ * \return + * Same as school_crawl_days_format() + */ +function school_crawl_days_str_format($days_str) +{ + $day_initials = array(); + for ($i = 0; $i < strlen($days_str); $i ++) + $day_initials[] = $days_str[$i]; + + return school_crawl_days_format($day_initials); +} diff --git a/school.d/cedarville.inc b/school.d/cedarville.inc --- a/school.d/cedarville.inc +++ b/school.d/cedarville.inc @@ -28,16 +28,22 @@ function cedarville_instructions_html() EOF; } -/** Parse html at URL into array, first row is row headers */ -function table_parse($url) { +/** + * \brief + * Parse given html into an array, first row is row headers + * + * \param $html + * HTML that PHP's DOM would willingly eat. + */ +function table_parse($html) +{ $arr = array(); $dom = new DOMDocument; - $html = file_get_contents($url); - if(!$html){ - return 1; - } + if(!$html) + return NULL; + $dom->loadHTML($html); - $dom->preserveWhiteSpace = false; + $dom->preserveWhiteSpace = FALSE; $tables = $dom->getElementsByTagName('table'); $rows = $tables->item(0)->getElementsByTagName('tr'); // Get first table on page foreach ($rows as $rownum => $row) { @@ -50,7 +56,7 @@ function table_parse($url) { } /** Crawls Cedarville course listings. $season is "fa" or "sp", year is 4-digit year */ -function cedarville_crawl($semester) +function cedarville_crawl($semester, $verbosity = 1) { $season = strtolower(substr($semester->season_get(), 0, 2)); $year = $semester->year_get(); @@ -61,9 +67,157 @@ function cedarville_crawl($semester) $season = strtolower($season); $tables = array(); - foreach($departments as $department) { - $tables[$department] = table_parse($basepath . $year . $season . '_' . $department . '_' . 'all.htm'); - } - return $tables; + foreach($departments as $department) + { + $html = file_get_contents($basepath . $year . $season . '_' . $department . '_' . 
'all.htm'); + if (!$html) + continue; + $tables[$department] = table_parse(cedarville_html_fix($html)); + } + + foreach ($tables as $dept_table) + { + /* + * Discard the first row, which has the contents of the + * header (th) elements. + */ + unset($dept_table[0]); + + foreach($dept_table as $course_table) + { + /* + * format: + * 0: course synonym, an unsigned integer. + * 1: section spec, parsable by Section::parse(). + * 2: friendly course title. + * 3: Instructor name. + * 4: Number of credit hours in decimal notation. + * 5: Fee. + * 6: Meeting time, explained below. + * 7: Cap. + * 8-10: Textbook link. Most rows only have column 8, not + * all the way through 10. This information seems + * quite useless. + * + * Section meeting time/place format: + * + * Confusing example: ' ILB WI219 TR 08:30A-09:45A' + * Complete example plus lab: ' LEC TYL203 MWF 08:00A-08:50A LAB ENS118 TR 03:00P-04:30P' + * + * Appears to have format: + * TYPE ROOM DAYS START-END + * + * It appears that TYPE may be: + * LEC: normal lecture meeting. + * ONL: online course. + * ILB: ethan says a partially online course...? + * HYB: hybrid of...? + * FLD: field...? + * FE2: ? + * CLN: ? + * LAB: Lab + * LES: something for some PFMU/PLMU class? + */ + + $synonym = $course_table[0]; + $section_parts = Section::parse($course_table[1]); + if (count($section_parts) < 3) + { + error_log('Error parsing section_id. Given `' . $course_table[1] . '\', interpreted as `' + . implode('-', $section_parts) . '\'. Skipping.'); + continue; + } + + $instructor = $course_table[3]; + + /* + * Each course may have multiple meeting times associated + * with it at Cedarville. We are not sure how to handle this + * quite, because different class sections may be tied with + * different lab meetings and stuff... + */ + $meetings_str = $course_table[6]; + if (strpos($meetings_str, 'TBA') !== FALSE) + { + if ($verbosity > 1) + error_log('Skipping ' . implode('-', $section_parts) . 
' because its meeting time info has `TBA\' in it.'); + continue; + } + $meetings = array(); + $meeting_multiple_types = array(); + while (strlen($meetings_str) > 5) + { + if (!preg_match(';^ ([A-Z]+) +([A-Z]+[A-Z0-9]*) +([MTWRF]{1,5}) +([0-9:AP]+)-([0-9:AP]+);', + $meetings_str, $meeting_matches)) + { + if (preg_match(';^Dates:[^0-9]+([/0-9]{8})-([/0-9]{8});', + $meetings_str, $meeting_matches)) + { + if ($verbosity > 4) + error_log('Skipping some meeting data for ' + . implode('-', $section_parts) . ' because it is a date range: `' + . $meeting_matches[0] . '\''); + $meetings_str = substr($meetings_str, strlen($meeting_matches[0])); + continue; + } + + if ($verbosity > 0) + error_log('Error parsing meeting time. Given `' . $meetings_str . '\'. Skipping ' + . implode('-', $section_parts)); + break; + } + /* prepare for parsing the next meeting time */ + $meetings_str = substr($meetings_str, strlen($meeting_matches[0])); + + if (isset($meetings[$meeting_matches[1]])) + { + if ($verbosity > 0 && !isset($meeting_multiple_types[$meeting_matches[1]])) + { + error_log('Section ' . implode('-', $section_parts) + . ' has multiple meeting times for meeting_type of ' + . $meeting_matches[1] . ' which my unflexible code which' + . ' could be made more flexible doesn\'t yet support.' + . ' Skipping the extra meeting times for this type of meeting.'); + /* only give the above error once per type. */ + $meeting_multiple_types[$meeting_matches[1]] = TRUE; + } + continue; + } + + $meetings[$meeting_matches[1]] + = array('room' => $meeting_matches[2], + 'days' => school_crawl_days_str_format($meeting_matches[3]), + 'time_start' => school_crawl_time_format(strptime($meeting_matches[4] . 'M', '%I:%M%p')), + 'time_end' => school_crawl_time_format(strptime($meeting_matches[5] . 
'M', '%I:%M%p')), + 'type' => $meeting_matches[1], + ); + } + + foreach ($meetings as $meeting) + { + $section_letter = $section_parts['section']; + if ($meeting['type'] == 'LECT') + /** + * \todo this might not make much sense. + */ + $section_letter = 'L' . $section_letter; + $semester->section_add($section_parts['department'], $section_parts['course'], + new Section($section_letter, $instructor, + $meeting['time_start'], $meeting['time_end'], + $meeting['days'])); + } + } + } + + return 0; } +/** + * \brief + * Fix some incorrect usage of the HTML entity delimiter, the ampersand. + */ +function cedarville_html_fix($html) +{ + $html = preg_replace('/&&/', '&&', $html); + return preg_replace('/&([^;]{5})/', '&$1', $html); +}