@@ -22,40 +22,40 @@
* \file
* \brief
* Crawler implementation for Cedarville University.
*/
/**
* Parse given html into an array, first row is row headers.
*
* \param $html
* HTML that PHP's DOM would willingly eat.
function table_parse($html)
function cedarville_table_parse($html)
{
libxml_use_internal_errors(true); // Suppress warnings
$arr = array();
$dom = new DOMDocument;
if(!$html)
return NULL;
$dom->loadHTML($html);
$dom->preserveWhiteSpace = FALSE;
$tables = $dom->getElementsByTagName('table');
$rows = $tables->item(0)->getElementsByTagName('tr'); // Get first table on page
foreach ($rows as $rownum => $row) {
$cols = $row->getElementsByTagName('td');
foreach($cols as $colnum => $col){
$arr[$rownum][$colnum] = $col->nodeValue;
$arr[$rownum][$colnum] = $col;
}
return $arr;
define('CEDARVILLE_BASE_URI', 'http://cedarville.edu/courses/schedule/');
define('CEDARVILLE_TIMEZONE_OFFSET', 60*60 * -4);
* Obtain the list of crawlable semesters offered by Cedarville.
@@ -146,25 +146,25 @@ function cedarville_crawl_semester(array
return 1;
$tables = array();
$cookies = array();
foreach ($departments as $department => $dept_name)
school_crawl_logf($school_crawl_log, 7, "Crawling department %s (%s).", $department, $dept_name);
$uri = CEDARVILLE_BASE_URI . $season_string . '_' . $department . '_all.htm';
$html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
if (!$html)
continue;
$tables[$department] = table_parse(cedarville_html_fix($html));
$tables[$department] = cedarville_table_parse(cedarville_html_fix($html));
foreach ($tables as $dept_table)
/*
* Discard the first row, which has the contents of the <th />
* elements.
unset($dept_table[0]);
foreach($dept_table as $course_table)
@@ -193,49 +193,76 @@ function cedarville_crawl_semester(array
* It appears that <type> may be:
* LEC: normal lecture meeting.
* ONL: online course.
* ILB: Ethan says a partially online course...?
* HYB: hybrid of...?
* FLD: field...?
* FE2: ?
* CLN: ?
* LAB: Lab
* LES: something for some PFMU/PLMU class?
$synonym = $course_table[0];
$section_parts = Section::parse($course_table[1]);
$synonym = $course_table[0]->nodeValue;
$section_parts = Section::parse($course_table[1]->nodeValue);
if (count($section_parts) < 3)
school_crawl_logf($school_crawl_log, 6, "Error parsing section_id. Given `%s'; interpreted as `%s'. Skipping.",
$course_table[1], implode('-', $section_parts));
$course_table[1]->nodeValue, implode('-', $section_parts));
$instructor = $course_table[3];
$title = $course_table[2];
$title = $course_table[2]->nodeValue;
* For courses with multiple section meetings, each
* instructor for each section meeting is separated by <br/>.
$instructors = array('');
foreach ($course_table[3]->childNodes as $child)
switch ($child->nodeType)
case XML_ELEMENT_NODE:
end($instructors);
if (!strcmp($child->tagName, 'br')
&& strlen(trim($instructors[key($instructors)])))
$instructors[] = '';
else
$instructors[key($instructors)] .= $child->nodeValue;
break;
case XML_TEXT_NODE:
$instructors[key($instructors)] .= $child->data;
foreach ($instructors as $key => $instructor)
$instructors[$key] = trim($instructor);
* Each course may have multiple meeting times associated
* with it at Cedarville. We are not quite sure how to handle
* this, because different class sections may be tied to
* different lab meetings and so on...
$meetings_str = $course_table[6];
$meetings_str = $course_table[6]->nodeValue;
if (strpos($meetings_str, 'TBA') !== FALSE)
school_crawl_logf($school_crawl_log, 8, "Skipping %s because its meeting time info has `TBA' in it.", implode('-', $section_parts));
$meetings = array();
$meeting_i = 0;
$meeting_multiple_types = array();
while (strlen($meetings_str) > 5)
$meeting_start_regex = ';^';
$meeting_base_regex = ' ([A-Z]+) +([A-Z]+[A-Z0-9]*) +([MTWRF]{1,5}) +([0-9:AP]+)-([0-9:AP]+)';
$meeting_date_regex = 'Dates:[^0-9]+([/0-9]{8})-([/0-9]{8})';
$meeting_end_regex = ';';
if (!preg_match($meeting_start_regex . $meeting_base_regex . $meeting_date_regex . $meeting_end_regex,
$meetings_str, $meeting_matches)
&& !preg_match($meeting_start_regex . $meeting_base_regex . $meeting_end_regex,
$meetings_str, $meeting_matches))
@@ -265,26 +292,37 @@ function cedarville_crawl_semester(array
/* check for daterange information -- i.e., if the first regex successfully matched: */
if (count($meeting_matches) > 7)
$date_start = school_crawl_gmmktime(strptime($meeting_matches[6], '%m/%d/%y'), CEDARVILLE_TIMEZONE_OFFSET);
$date_end = school_crawl_gmmktime(strptime($meeting_matches[7], '%m/%d/%y'), CEDARVILLE_TIMEZONE_OFFSET);
if (!empty($date_start) && !empty($date_end))
$semester->time_start_set_test($date_start);
$semester->time_end_set_test($date_end);
* The tables are made for humans, not computers. If
* there aren't enough instructors for the number of
* section meetings, just reuse the first listed
* instructor:
if ($meeting_i >= count($instructors))
$instructors[$meeting_i] = $instructors[0];
$meetings[] = new SectionMeeting($days, $time_start, $time_end,
$room, $type, $instructor);
$room, $type, $instructors[$meeting_i]);
$meeting_i ++;
$semester->section_add($section_parts['department'], $section_parts['course'],
new Section($section_parts['section'], $meetings,
$synonym), $title);
return 0;
Status change: