.
*/
/**
* \file
*
* All of the code for crawling umich.
*
* Potential startpoints:
* - http://lsa.umich.edu/cg/cg_advsearch.aspx (HTML/curl-based)
* - http://ro.umich.edu/schedule/ (harder HTML for semester guessing, one CSV download for entire semester -- <=4MB)
*
* A single download, the CSV option, is preferred to having to issue
* a series of HTTP requests. Each HTTP request has a lot of latency
* and overhead which a one-shot download doesn't.
*/
/**
 * \brief
 *   Retrieve the list of semesters umich has available for crawling.
 *
 * Downloads the schedule page, looks for a table whose first row has
 * a `Term' column and a column matching /csv/ (case-insensitively)
 * and creates one Semester for each following row whose term name
 * looks like "<season> <year>" and whose CSV cell holds a link.
 *
 * \param $school
 *   The school handle for umich.
 * \param $semesters
 *   An array to which Semester objects should be appended, one for
 *   each potentially crawlable semester.
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \return
 *   0 on success, 1 on failure (the semester listing page could not
 *   be downloaded).
 */
function umich_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
{
  $uri = 'http://ro.umich.edu/schedule/';
  $cookies = array();

  /*
   * Do not hand an empty or failed download to the HTML parser:
   * there is nothing to parse and DOMDocument::loadHTML() rejects
   * empty input.
   */
  $semesters_html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
  if (empty($semesters_html))
    {
      school_crawl_logf($school_crawl_log, 2, "Unable to download the semester listing from %s.",
                        $uri);
      return 1;
    }

  $semesters_dom = new DOMDocument();
  $semesters_dom->loadHTML($semesters_html);
  $semesters_xpath = new DOMXPath($semesters_dom);

  foreach ($semesters_dom->getElementsByTagName('table') as $table)
    {
      /* Grab the first row of the table; it is used to resolve the columns. */
      $table_tr = NULL;
      foreach ($semesters_xpath->query('tr', $table) as $table_tr)
        break;
      if (empty($table_tr))
        {
          school_crawl_logf($school_crawl_log, 5, "Unable to find first row in table which I suspect is a table holding all of the semesters I'm interested in. I will try any other tables in this page and hopefully find one with a row in it...");
          continue;
        }

      $semester_columns = array(
        'name' => school_crawl_table_resolve_column($table_tr, 'Term'),
        'csv' => school_crawl_table_resolve_column($table_tr, '/[cC][sS][vV]/', 'school_crawl_table_resolve_column_regexcmp'),
      );
      foreach ($semester_columns as $semester_column_name => $semester_column)
        if ($semester_column === FALSE)
          {
            school_crawl_logf($school_crawl_log, 4, "Unable to resolve columns %s onto a column in a semester listing table. Skipping this table.",
                              $semester_column_name);
            $semester_columns = NULL;
            break;
          }
      if (empty($semester_columns))
        continue;

      $first = TRUE;
      foreach ($semesters_xpath->query('tr', $table) as $table_tr)
        if ($first)
          {
            /* Skip the first row, which holds the column titles. */
            $first = FALSE;
            continue;
          }
        else
          {
            $rownodes = school_crawl_table_rownodes($table_tr);
            $semester_name = $rownodes->item($semester_columns['name']);
            $semester_csv = $rownodes->item($semester_columns['csv']);

            /*
             * A row with fewer cells than the title row yields no
             * node for one of the columns. Without this check the
             * link search below would run with no context node and
             * thus against the whole document.
             */
            if (empty($semester_name) || empty($semester_csv))
              {
                school_crawl_logf($school_crawl_log, 4, "Row in semester listing table lacks a name or CSV cell. Skipping this row.");
                continue;
              }

            /* Term names look like "<season> <year>". */
            if (!preg_match('/^(.+) ([0-9]+)$/', $semester_name->textContent, $matches))
              {
                school_crawl_logf($school_crawl_log, 4, "Unable to parse semester name `%s'. Skipping this semester.",
                                  $semester_name->textContent);
                continue;
              }

            $semester = new Semester($matches[2], $matches[1]);

            /* Find the first link with an href inside of the CSV cell. */
            $a = NULL;
            foreach ($semesters_xpath->query('descendant::a', $semester_csv) as $a)
              if ($a->hasAttribute('href'))
                break;
            if (empty($a) || !$a->hasAttribute('href'))
              {
                school_crawl_logf($school_crawl_log, 4, "Unable to find element with an href attribute for a CSV link for the %s semester. Skipping this semester. (textContent of CSV column: %s).",
                                  $semester, $semester_csv->textContent);
                continue;
              }

            /*
             * Secretively communicate some metadata to
             * umich_crawl_semester().
             */
            $semester->umich_csv_href = $a->getAttribute('href');
            $semesters[] = $semester;
          }
    }

  return 0;
}
/**
 * \brief
 *   Handle the crawling of one semester of umich.
 *
 * Downloads the CSV file whose location umich_crawl_semester_list()
 * stored in $semester->umich_csv_href, maps its heading row onto the
 * columns known to this crawler and then adds each data row to
 * $semester as a section meeting (or, when the row has no parseable
 * meeting time, as a bare course).
 *
 * \param $school
 *   The school handle for umich.
 * \param $semester
 *   A Semester object to populate with courses from this semester.
 *   Its umich_csv_href property must hold the link to the CSV file
 *   which will be downloaded and parsed.
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \return
 *   0 on success, 1 if a required column could not be found in the
 *   CSV file.
 */
function umich_crawl_semester(array $school, Semester $semester, &$school_crawl_log)
{
  school_crawl_logf($school_crawl_log, 3, "Crawling %s.",
                    $semester);

  $cookies = array();
  $uri = $semester->umich_csv_href;

  /* parse into lines and then each row needs to be individually parsed */
  $csv = str_getcsv(school_crawl_geturi($uri, $cookies, $school_crawl_log), PHP_EOL);

  /*
   * Column name => column index, FALSE until the column is found in
   * the heading row.
   */
  $fields = array(
    'Term' => FALSE /* $semester->season_get() . ' ' . $semester->year_get() */,
    'Session' => FALSE /* "Regular Academic Session", "First 7 Week Session", "Second 7 Week Session" <-- half-semester support? */,
    'Acad Group' => FALSE /* long version of the department sorta, more general than the subject field */,
    'Class Nbr' => FALSE /* section synonym */,
    'Subject' => FALSE /* "Mathematics (MATH)" */,
    'Catalog Nbr' => FALSE /* "201", unqualified course_id */,
    'Section' => FALSE /* passed through as the section of each meeting */,
    'Course Title' => FALSE /* passed through as the course's title */,
    'Component' => FALSE /* "LAB", "LEC", "REC" -- i.e., meeting_type(?) */,
    'Codes' => FALSE /* "P  W", "P   ", "P R ", "PI  ", "A   ", "P RW" ??????? (reminds me of ``svn status''). If flag[3] = 'W', then the class has a meeting times */,
    'M' => FALSE /* if a day is enabled, it is set to itself. I.e., $row['M'] = 'M' or $row['M'] = '' */,
    'T' => FALSE,
    'W' => FALSE,
    'TH' => FALSE,
    'F' => FALSE,
    'S' => FALSE,
    'SU' => FALSE,
    'Start Date' => FALSE /* yea! */,
    'End Date' => FALSE /* "12/13/2011" */,
    'Time' => FALSE /* "1230-130PM", "9-1030AM", "1130-1PM" */,
    'Location' => FALSE,
    'Instructor' => FALSE,
    'Units' => FALSE /* As in credit hours */,
  );
  /* Columns which are recognized but never read below, so they may be absent. */
  $ignored_fields = array(
    'Term' => TRUE,
    'Session' => TRUE,
    'Acad Group' => TRUE,
    'Codes' => TRUE,
  );

  foreach (str_getcsv($csv[0]) as $col_num => $col_name)
    if (isset($fields[$col_name]))
      $fields[$col_name] = $col_num;
    else
      school_crawl_logf($school_crawl_log, 6, "We do not recognize the %s column in the CSV file for %s.",
                        $col_name, $semester);

  /*
   * Every column which is not explicitly ignored must be present.
   * (This check used to consult an undefined $ignored_field variable,
   * which made it reject CSVs lacking even an ignored column.)
   */
  foreach ($fields as $field => $col_num)
    if ($col_num === FALSE
        && empty($ignored_fields[$field]))
      {
        school_crawl_logf($school_crawl_log, 2, "Unable to find column %s in CSV for %s. Skipping this semester.",
                          $field, $semester);
        return 1;
      }

  /* remove the row with heading from the CSV dataset */
  unset($csv[0]);

  /* Now actually parse some data :-). */
  $row_accumulation = array('Instructor' => '');
  foreach ($csv as $row)
    {
      /*
       * Blank lines carry no section; skip them instead of logging
       * a bogus department parsing failure for each one.
       */
      if (!strlen(trim((string)$row)))
        continue;

      $row = str_getcsv($row);

      $synonym = trim($row[$fields['Class Nbr']]);
      $course_id = trim($row[$fields['Catalog Nbr']]);

      /* The department code is the parenthesized suffix, as in "Mathematics (MATH)". */
      if (!preg_match(';\(([A-Z]+)\)$;', $row[$fields['Subject']], $matches))
        {
          school_crawl_logf($school_crawl_log, 5, "Unable to parse department string `%s'. Skipping section/course (synonym=%s).",
                            $row[$fields['Subject']], $synonym);
          continue;
        }
      $dept = $matches[1];

      /**
       * \todo
       *   umich stores sometimes ranges of credit hours for courses,
       *   formatted like "1.00-3.00". This is generally done for ARR
       *   courses, where there is negotiation between the faculty and
       *   the student on how the course is arranged. slate_permutate
       *   should have a concept of a range of credit hours, then when
       *   calculating credit hours for the user it can present the
       *   total as a range... not that hard, but still a task ;-).
       */
      $credit_hours = (float)$row[$fields['Units']];

      /* A day column is non-blank when the section meets on that day. */
      $days = '';
      foreach (array('SU' => 'u', 'M' => 'm', 'T' => 't', 'W' => 'w', 'TH' => 'h', 'F' => 'f', 'S' => 's')
               as $field => $day)
        if (strlen(trim($row[$fields[$field]])))
          $days .= $day;

      if (!preg_match(';^([0-9]+)-([0-9]+)([AP])M$;', $row[$fields['Time']], $matches))
        {
          /*
           * Some courses exist but only have sections which have ARR
           * for their meeting times. I think this means sometimes
           * that the student is to arrange the course meeting with
           * the instructor, other times just that the course is
           * planned but not scheduled yet. These courses should still
           * show up in autocomplete even if they have no meeting
           * times.
           */
          if ($row[$fields['Time']] != 'ARR')
            /* Log an unanticipated Time value */
            school_crawl_logf($school_crawl_log, 4, "Unable to parse meeting time: `%s'. Skipping section/meeting (synonym=%s).",
                              $row[$fields['Time']], $synonym);

          /* ensure that the class is added nonetheless */
          if ($semester->class_get($dept, $course_id) === NULL)
            /**
             * \todo
             *   SP does credit hours by section, what about Courses
             *   with no sections because they're these weird limbo
             *   `ARR' courses but these limbo courses still have a
             *   number of credit hours?
             */
            $semester->class_add(new Course($dept . '-' . $course_id, $row[$fields['Course Title']]));
          continue;
        }

      /*
       * Only the end time carries an AM/PM marker; parse it first so
       * that it can bound the guess made for the start time.
       */
      $time_end = umich_crawl_time($matches[2], $matches[3]);
      $time_start = umich_crawl_time($matches[1], FALSE, $time_end);

      /* umich defines course_slots by meeting_type. */
      $meeting_type = school_crawl_meeting_type(trim($row[$fields['Component']]));

      /*
       * Some information is only presented in the first row in a
       * listing of courses. Perform some accumulation here.
       */
      foreach (array('Instructor') as $key)
        if (strlen($curr_value = trim($row[$fields[$key]])))
          $row_accumulation[$key] = $curr_value;

      $semester->section_meeting_add($dept, $course_id, trim($row[$fields['Course Title']]),
                                     trim($row[$fields['Section']]), $synonym,
                                     new SectionMeeting($days, $time_start, $time_end,
                                                        trim($row[$fields['Location']]),
                                                        $meeting_type,
                                                        $row_accumulation['Instructor']),
                                     $meeting_type,
                                     $credit_hours);

      /*
       * If the section so far passed as being a normal section, use
       * its start and end dates to help determine the semester's
       * respective start and end dates.
       */
      $date_start_tm = strptime(trim($row[$fields['Start Date']]), '%m/%d/%Y');
      $date_end_tm = strptime(trim($row[$fields['End Date']]), '%m/%d/%Y');
      if (!empty($date_start_tm) && !empty($date_end_tm))
        {
          $date_start = school_crawl_gmmktime($date_start_tm);
          $date_end = school_crawl_gmmktime($date_end_tm);
          /* Only let plausible (non-tiny) timestamps influence the semester's bounds. */
          if ($date_start > 1000000 && $date_end > 1000000)
            {
              $semester->time_start_set_test($date_start);
              $semester->time_end_set_test($date_end);
            }
        }
    }

  return 0;
}
/**
 * \brief
 *   Convert one end of a umich-formatted time range into a 24-hour
 *   `HHMM' string.
 *
 * \param $raw
 *   The digits of the time. With one or two digits the whole value
 *   is the hour; otherwise the last two digits are the minutes and
 *   everything before them is the hour ("9", "130", "1030").
 * \param $xm
 *   'P' for PM, 'A' for AM, or FALSE if neither was specified and
 *   the half of the day has to be guessed using $before.
 * \param $before
 *   A time of day, in `HHMM' form, before which this time must be.
 *   Only consulted when $xm is FALSE: the time is taken to be PM if
 *   its PM reading is still earlier than $before, AM otherwise. Used
 *   generally for the start time of a class, so the end time must be
 *   parsed first and its result passed here.
 * \return
 *   The time formatted as sprintf('%02d%02d', hour, minute).
 */
function umich_crawl_time($raw, $xm = FALSE, $before = '2400')
{
  /* Split the raw digits into hour and minute parts. */
  $digits = strlen($raw);
  if ($digits > 2)
    {
      $hour = substr($raw, 0, $digits - 2);
      $minute = substr($raw, $digits - 2);
    }
  else
    {
      $hour = $raw;
      $minute = '00';
    }

  if ($xm === FALSE)
    {
      /* Compare minutes-since-midnight of the PM reading against the limit. */
      $limit = substr($before, 0, 2) * 60 + substr($before, 2);
      $as_afternoon = ($hour + 12) * 60 + $minute;
      $xm = ($as_afternoon < $limit) ? 'P' : 'A';
    }

  /* 12 PM is already noon in 24-hour form; only shift hours below 12. */
  if (strcmp($xm, 'P') == 0 && $hour < 12)
    $hour += 12;

  return sprintf('%02d%02d', $hour, $minute);
}