*
* This file is a part of slate_permutate.
*
* slate_permutate is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* slate_permutate is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with slate_permutate. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * \brief
 *   Crawls Calvin's registration course listing pages.
 *
 * Fetches the WebAdvisor search form, discovers the semesters it
 * offers and then, for every semester, submits a blank search and
 * walks through the paginated result list, adding each parseable
 * section to a new Semester object.
 *
 * \param $semesters
 *   An array to which one populated Semester object is appended for
 *   every semester that was crawled.
 * \param $verbosity
 *   How verbose I should be. Sensible range is from 0 through 10.
 * \return
 *   0 on success, 1 if the token or the search form could not be
 *   obtained.
 */
function calvin_crawl(array &$semesters, $verbosity = 1)
{
  /**
   * The first link we start at is the one from KV into WebAdvisor.
   *
   * 1. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL
   *
   *    That page calls javascript:getWindowHTML(). This merely adds
   *    TOKENIDX=NULL to the query string, so we can skip this step
   *    and just have TOKENIDX=NULL.
   *
   * 2. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL
   *
   *    That page calls setWindowHTML('', '<token>'). The second
   *    argument to setWindowHTML() is random. Thus, we have to
   *    capture this value.
   */
  $cookies = array();
  $baseuri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
  $token_uri = $baseuri . '&TOKENIDX=NULL';
  $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($token_uri, $cookies));
  if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches))
  {
    fprintf(STDERR, "Could not steal the token\n");
    return 1;
  }
  $token = $matches[1];

  if ($verbosity > 5)
  {
    echo 'token: ' . $token . "\n";
    echo "\n";
  }

  /*
   * Here we have arrived at the main WebAdvisor screen which lists
   * the search form. From it we learn which semesters can be queried.
   */
  $uri = $baseuri . '&TOKENIDX=' . $token;
  $departments_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies));
  if (!is_string($departments_html) || !strlen($departments_html))
  {
    fprintf(STDERR, "Could not retrieve the course search form\n");
    return 1;
  }
  $departments_dom = new DOMDocument();
  $departments_dom->loadHTML($departments_html);

  /*
   * Discover the available semesters.
   */
  $semesters_select = $departments_dom->getElementById('VAR1');
  if (!$semesters_select)
  {
    fprintf(STDERR, "Could not find the semester selection list (VAR1) in the search form\n");
    return 1;
  }
  $semester_strs = array();
  foreach ($semesters_select->childNodes as $semester_node)
  {
    /* childNodes also contains text nodes, which have no tagName. */
    if (!($semester_node instanceof DOMElement)
        || $semester_node->tagName != 'option'
        || !$semester_node->hasAttribute('value')
        || !strlen($semester_node->getAttribute('value')))
      continue;

    $semester_strs[$semester_node->getAttribute('value')] =
      $semester_node->nodeValue;
  }
  $semester_strs = array_reverse($semester_strs, TRUE);

  /*
   * The department (LIST_VAR1_1) and course level (LIST_VAR1_2)
   * option lists are not read: the search below submits blank
   * values for both criteria.
   */

  $return_url = dom_input_value($departments_dom, 'RETURN.URL');

  if ($verbosity > 4)
    fprintf(STDERR, "Available semesters: %s\n", implode(', ', $semester_strs));

  $semester_start_uri = $uri;

  $season_map = array(
    'FA' => Semester::SEASON_FALL,
    'IN' => 'interim',
    'SP' => Semester::SEASON_SPRING,
    'MA' => 'may',
    /* I don't know if SU is a valid Calvin Semester ID or not */
    'SU' => Semester::SEASON_SUMMER);

  foreach ($semester_strs as $semester_str => $semester_info)
  {
    /*
     * The semester value is two year digits, one separator
     * character and then the season code.
     */
    $season_key = substr($semester_str, 3);
    if (empty($season_map[$season_key]))
    {
      fprintf(STDERR, "Warning: Unknown semester identification chars: %s. Skipping this semester.\n",
              $semester_str);
      continue;
    }
    $season = $season_map[$season_key];

    /*
     * NOTE(review): strptime() is deprecated as of PHP 8.1. It is
     * kept because the other strptime() results in this function are
     * handed to school_crawl_*() helpers defined elsewhere.
     */
    $year_timespec = strptime(substr($semester_str, 0, 2), '%y');
    if ($year_timespec === FALSE)
    {
      fprintf(STDERR, "Warning: Unable to parse the year of semester %s. Skipping this semester.\n",
              $semester_str);
      continue;
    }
    $year = $year_timespec['tm_year'] + 1900;

    $semester = new Semester($year, $season);

    /* useful and necessary stats */
    $skipped_sections = array('incomplete meeting info' => 0, 'invalid meeting info format' => 0);
    $semester_start_min = 0;
    $semester_end_max = 0;

    /* Blank department and course level: do not constrain the search. */
    $dept = '';
    $course_level = '';
    $uri = $semester_start_uri;

    if ($verbosity)
      fprintf(STDERR, "Crawling semester %s->%s\n",
              $semester_str, $semester_info);

    /*
     * LIST.VAR<X>_<N>: <X> is the column, <N> is the row. There
     * are apparently a max of 5 rows (see the LIST.VAR<X>_MAX
     * below).
     *
     * Columns:
     * LIST.VAR1: department
     * LIST.VAR2: course_level
     * LIST.VAR3: IIRC, a course identifier, such as 156 from MATH-156
     * LIST.VAR4: I forget
     */
    $form = array('VAR1' => $semester_str,
                  'LIST.VAR1_1' => $dept,
                  'LIST.VAR2_1' => $course_level,

                  /*
                   * Other form items we're not querying but which need
                   * to be sent blankly.
                   */
                  'RETURN.URL' => $return_url,
                  'SUBMIT_OPTIONS' => '',
                  /*
                   * The submit button... its value="" key is
                   * apparently sent with the form... makes a
                   * little bit of sense I guess ;-).
                   */
                  /*'SUBMIT2' => 'SUBMIT',*/

                  'DATE.VAR1' => '',
                  'DATE.VAR2' => '',

                  'LIST.VAR1_CONTROLLER' => 'LIST.VAR1',
                  'LIST.VAR1_MEMBERS' => 'LIST.VAR1*LIST.VAR2*LIST.VAR3*LIST.VAR4',
                  );
    /* Fill in every cell of the 4x5 criteria grid that is not set yet. */
    foreach (array('1', '2', '3', '4') as $list_col)
    {
      $colname = 'LIST.VAR' . $list_col;
      if (!isset($form[$colname . '_MAX']))
        $form[$colname . '_MAX'] = '5';

      foreach (array('1', '2', '3', '4', '5') as $list_row)
      {
        $rowname = $colname . '_' . $list_row;
        if (!isset($form[$rowname]))
          $form[$rowname] = '';
      }
    }

    /*
     * VAR7 and VAR8 is a constraint of times during which
     * courses meet
     */
    $form['VAR7'] = '';
    $form['VAR8'] = '';

    /* ``course title keywords'' */
    $form['VAR3'] = '';

    /* ? */
    $form['VAR6'] = '';
    $form['VAR21'] = '';

    /* instructor's last name */
    $form['VAR9'] = '';

    /*
     * VAR10 through VAR16 are Monday through Sunday checkboxes
     * for days of the week that classes meet.
     *
     * But we specify no days of the week to avoid this being a
     * constraint ;-).
     */

    /*
     * $pages is overwritten by preg_match() at the end of each
     * iteration: index 1 is the current page, index 2 the page count.
     */
    $pages = array(1 => 0, 2 => 1);
    $last_page = 0;
    while ($pages[1] < $pages[2])
    {
      $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $form));
      if (!is_string($html) || !strlen($html))
      {
        error_log('Received an empty Calvin result page; giving up on this semester\'s remaining pages');
        break;
      }
      $results_dom = new DOMDocument();
      $results_dom->loadHTML($html);

      /* $list_row is read after the loop to report the row count. */
      for ($list_row = 1; ; $list_row ++)
      {
        /* either 'Open' (or 'Closed'?) */
        $openness = dom_input_value($results_dom, 'LIST.VAR1_' . $list_row);
        $sec_short_title = dom_id_content($results_dom, 'SEC_SHORT_TITLE_' . $list_row);
        $sec_meeting_info = dom_id_content($results_dom, 'SEC_MEETING_INFO_' . $list_row);

        /*
         * Check if we're done with this particular page. The
         * lookups return NULL when the row does not exist, hence
         * the string casts.
         */
        if (!strlen((string)$openness)
            && !strlen((string)$sec_short_title)
            && !strlen((string)$sec_meeting_info))
          break;

        /*
         * the same info below should be gettable with
         * dom_id_content($results_dom, 'SEC_FACULTY_INFO_' . $list_row);
         */
        $faculty_name = dom_input_value($results_dom, 'SEC.FACULTY.INFO_' . $list_row);
        $credits = dom_input_value($results_dom, 'SEC.MIN.CRED_' . $list_row);
        $comment = dom_id_content($results_dom, 'SEC_COMMENTS_' . $list_row); /* or name="SEC.COMMENTS_$list_row" */

        /* parse */
        $section_id = Section::parse($sec_short_title);
        $synonym = NULL;
        if (preg_match(';\(([0-9]+)\);', $sec_short_title, $matches))
          $synonym = $matches[1];

        if ($verbosity > 6)
        {
          echo "\n";
          echo implode('-', $section_id) . ': ' . $sec_short_title . "\n";
          echo $openness . "\n";
          echo $sec_meeting_info . "\n";
          echo $faculty_name . "\n";
          echo $credits . "\n";
          echo $comment . "\n";
        }

        /*
         * The input format for this is, thankfully, pretty rigid
         * :-D. Example input format:
         *
         * '01/31/2011-05/11/2011 Lecture Monday, Wednesday 01:00PM - 03:50PM, Spoelhof Center, Room 101'
         *
         * OR
         *
         * '01/31/2011-05/18/2011 Practicum Days to be Announced, Times to be AnnouncedTo Be Arranged, Room TBA'
         *
         * OR
         *
         * '01/31/2011-05/12/2011 Music Ensemble Monday, Wednesday, Thursday, Friday 03:30PM - 04:20PM, Covenant Fine Arts Center, Room 135'
         *
         * In the second case.... we'll just ignore the section. In
         * the last case, we have to be careful about parsing out
         * Monday.
         *
         * At this point, we don't parse most tokens. We group them
         * off. We get the first date, the second date, the type
         * ('Lecture', 'Practicum', or some other unknown value),
         * the list of days of week the section meets, the start
         * time, the end time, and then the meeting location.
         */
        if (strpos($sec_meeting_info, 'Times to be Announced') !== FALSE
            || strpos($sec_meeting_info, 'Days to be Announced') !== FALSE)
        {
          if ($verbosity > 2)
            error_log('Skipping class because of incomplete meeting time information: '
                      . implode('-', $section_id) . ' has meeting info of `'
                      . $sec_meeting_info . '\'');
          $skipped_sections['incomplete meeting info'] ++;
          /* Still add to have less confusing autocomplete */
          calvin_crawl_course_add($semester, $section_id['department'], $section_id['course']);
          continue;
        }

        if (!preg_match(';^([0-9]{2}/[0-9]{2}/[0-9]{4})-([0-9]{2}/[0-9]{2}/[0-9]{4}) (([^ ,]+ )+)([^0-9]+) ([^ ]+) - ([^ ]+), (.*)$;', $sec_meeting_info, $meeting_info_matches))
        {
          error_log('Unable to parse calvin section meeting info string into start/end/days information for '
                    . implode('-', $section_id) . ': ``' . $sec_meeting_info . '\'\'');
          $skipped_sections['invalid meeting info format'] ++;
          /*
           * Still add at least the course to the semester so that
           * it shows up in autocomplete.
           */
          calvin_crawl_course_add($semester, $section_id['department'], $section_id['course']);
          continue;
        }

        /*
         * Capture groups: 1 and 2 are the dates, 3 the meeting type
         * words, 4 the last repetition inside 3 (unused), 5 the day
         * list, 6 and 7 the times, 8 the location.
         */
        $date_start = $meeting_info_matches[1];
        $date_end = $meeting_info_matches[2];
        /* e.g., 'Lecture', 'Practicum' */
        $meeting_type = strtolower(trim($meeting_info_matches[3]));
        $days = school_crawl_days_format(explode(', ', $meeting_info_matches[5]));
        $time_start = school_crawl_time_format(strptime($meeting_info_matches[6], '%I:%M%p'));
        $time_end = school_crawl_time_format(strptime($meeting_info_matches[7], '%I:%M%p'));
        $meeting_place = $meeting_info_matches[8];

        if ($verbosity > 5)
        {
          $parsed_fields = array(
            'date_start' => $date_start,
            'date_end' => $date_end,
            'meeting_type' => $meeting_type,
            'days' => $days,
            'time_start' => $time_start,
            'time_end' => $time_end,
            'meeting_place' => $meeting_place,
          );
          foreach ($parsed_fields as $var => $value)
            echo $var . ':' . $value . "\n";
        }

        $section = new Section($section_id['section'], array(new SectionMeeting($days, $time_start, $time_end, $meeting_place, $meeting_type)), $synonym, $faculty_name);
        $semester->section_add($section_id['department'], $section_id['course'], $section);

        /*
         * Try to update semester's longevity stats to help the
         * school_semester_guess() function:
         */
        $date_start_time = strptime($date_start, '%m/%d/%Y');
        $date_end_time = strptime($date_end, '%m/%d/%Y');
        if ($date_start_time !== FALSE)
        {
          $date_start_time = school_crawl_mktime($date_start_time);
          if (!$semester_start_min || $semester_start_min > $date_start_time)
            $semester_start_min = $date_start_time;
        }
        if ($date_end_time !== FALSE)
        {
          $date_end_time = school_crawl_mktime($date_end_time);
          if ($semester_end_max < $date_end_time)
            $semester_end_max = $date_end_time;
        }
      }

      /*
       * The pattern is deliberately unanchored: the previous trailing
       * `\$' demanded a literal dollar sign after the page count.
       * NOTE(review): confirm against a live result page.
       */
      if (!preg_match(';Page ([0-9]+) of ([0-9]+);', $html, $pages))
      {
        error_log('Unable to determine the number of pages in this Calvin resultset');
        break;
      }

      if ($verbosity > 0)
      {
        echo 'calvin_crawl(): finished page ' . $pages[1] . ' of ' . $pages[2] . ' with ' . ($list_row - 1) . " courses.\n";
      }

      /* Never loop forever if the server keeps serving the same page. */
      if ((int)$pages[1] <= $last_page)
      {
        error_log('Calvin resultset did not advance past page ' . $pages[1] . '; giving up on this semester\'s remaining pages');
        break;
      }
      $last_page = (int)$pages[1];

      /* Subsequent requests only ask for the next page of results. */
      $form = array(
        'ACTION*Grp:WSS.COURSE.SECTIONS' => 'NEXT',
      );
    }

    if ($verbosity > 1)
    {
      $has_stat = FALSE;
      foreach ($skipped_sections as $reason => $num)
      {
        if (!$num)
          continue;
        /* Print the heading once, before the first non-zero counter. */
        if (!$has_stat)
        {
          error_log('Skipped some sections (reason: number skipped):');
          $has_stat = TRUE;
        }
        error_log($reason . ': ' . $num);
      }
    }

    $semester->time_end_set($semester_end_max);
    $semester->time_start_set($semester_start_min);

    $semesters[] = $semester;

    if ($verbosity)
      fprintf(STDERR, "\n");
  }

  return 0;
}
/**
 * \brief
 *   Find an input element by name and return its value attribute.
 *
 * Only the first input element in document order with a matching
 * name attribute is considered.
 *
 * \param $domdocument
 *   The DOMDocument to search.
 * \param $name
 *   The name attribute of the element. May contain quote characters.
 * \return
 *   The value attribute of the input element or NULL if the element
 *   is not found or has no value attribute.
 */
function dom_input_value($domdocument, $name)
{
  /*
   * XPath 1.0 string literals have no escape mechanism, so pick a
   * quoting that does not clash with the name, falling back to
   * concat() when the name contains both kinds of quotes.
   */
  $name = (string)$name;
  if (strpos($name, '"') === FALSE)
    $name_literal = '"' . $name . '"';
  elseif (strpos($name, "'") === FALSE)
    $name_literal = "'" . $name . "'";
  else
    $name_literal = 'concat("' . str_replace('"', '", \'"\', "', $name) . '")';

  $xpath = new DOMXPath($domdocument);
  $input_node_list = $xpath->query('/descendant::input[attribute::name=' . $name_literal . ']');
  /* query() yields FALSE for an expression it cannot evaluate. */
  if ($input_node_list === FALSE || !$input_node_list->length)
    return NULL;

  $input_node = $input_node_list->item(0);
  if (!$input_node->hasAttribute('value'))
    return NULL;

  return $input_node->getAttribute('value');
}
/**
 * \brief
 *   Look up an element by its ID and hand back its content.
 *
 * A small convenience wrapper around DOMDocument::getElementById().
 *
 * \param $domdocument
 *   The DOMDocument in which to look.
 * \param $id
 *   The id attribute of the element whose content is wanted.
 * \return
 *   The nodeValue of the matching element, or NULL if no element
 *   with that id exists.
 */
function dom_id_content($domdocument, $id)
{
  $element = $domdocument->getElementById($id);

  return $element ? $element->nodeValue : NULL;
}
/**
 * \brief
 *   Searches for and removes every <noscript> element.
 *
 * WebAdvisor emits <noscript> markup that is invalid enough that DOM
 * can't handle the page, so it is stripped from the raw HTML before
 * parsing.
 *
 * Each element is removed from its opening tag through the nearest
 * closing </noscript> tag, including everything in between. Tag names
 * are matched case-insensitively and the opening tag may carry
 * attributes.
 *
 * \param $html
 *   The input HTML to filter.
 * \return
 *   The fixed HTML. If the regular expression engine fails, the
 *   input is returned unmodified.
 */
function calvin_crawl_noscript_filter($html)
{
  /*
   * The closing tag must be matched in full: accepting a bare
   * ``noscript>'' would cut the match short at any such text inside
   * the element.
   */
  $filtered = preg_replace(';<noscript\b[^>]*>.*?</noscript\s*>;is', '', $html);

  /* preg_replace() yields NULL on failure; keep the page usable. */
  if ($filtered === NULL)
    return $html;

  return $filtered;
}
/**
 * \brief
 *   Make sure a semester knows about a course, adding the course
 *   only when the semester does not have it yet.
 *
 * \param $semester
 *   The semester which should end up containing the course.
 * \param $department
 *   The department of the course to add.
 * \param $course_id
 *   The course_id which, joined to the department string with a
 *   dash, forms the fully-qualified course_id.
 */
function calvin_crawl_course_add(Semester $semester, $department, $course_id)
{
  /* Nothing to do when the semester already reports this course. */
  if ($semester->class_get($department, $course_id) != NULL)
    return;

  $fully_qualified_id = $department . '-' . $course_id;
  $semester->class_add(new Course($fully_qualified_id));
}