diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc
--- a/inc/school.crawl.inc
+++ b/inc/school.crawl.inc
@@ -56,13 +56,13 @@ function school_crawl_time_format($time)
* simplicity. One-char representations are supported, however, but
* use 'm', 't', 'w', 'h', 'f' to distinguish Thursday and
* Tuesday. 'r' may also be used for Thursday.). Case does not
- * matter.
+ * matter. 's' is for Saturday, based on CCBCMD.
* \return
* slate_permutate's strange internal days representation.
*/
function school_crawl_days_format($days)
{
- static $daymap_1 = array('m' => 'm', 't' => 't', 'w' => 'w', 'h' => 'h', 'r' => 'h', 'f' => 'f');
+ static $daymap_1 = array('m' => 'm', 't' => 't', 'w' => 'w', 'h' => 'h', 'r' => 'h', 'f' => 'f', 's' => 's');
static $daymap_2 = array('th' => 'h');
$my_days = array();
@@ -137,6 +137,14 @@ function school_crawl_days_str_format($d
* \param $post
* If not NULL, causes an HTTP POST. In that case, should be an
* associative array of form keys/values.
+ * \param $follow_meta_refresh
+ * Parse the resultant HTML with http://docs.php.net/dom and if it
+ * contains a line that looks like ``<meta http-equiv="Refresh" content="N; url=..." />'',
+ * follow that URL.
+ * \param $curlsetup_hook
+ * A function which is passed a curl handle which allows the caller
+ * to do silly things like setting CURLOPT_SSLVERSION for silly
+ * sites like ccbcmd's registration site.
* \param $verbosity
* How verbose to be.
* \param $loopspin
@@ -147,7 +155,7 @@ function school_crawl_days_str_format($d
* malformed HTML, especially with Calvin's WebAdvisor
* installation).
*/
-function school_crawl_geturi(&$uri, &$cookies, $post = NULL, $verbosity = 0, $loopspin = 0)
+function school_crawl_geturi(&$uri, &$cookies, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $verbosity = 0, $loopspin = 0)
{
global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf, $school_crawl_geturi_verbosity;
@@ -160,6 +168,9 @@ function school_crawl_geturi(&$uri, &$co
$curl = curl_init();
+ if ($curlsetup_hook !== NULL)
+ $curlsetup_hook($curl);
+
$school_crawl_geturi_verbosity = $verbosity;
$school_crawl_geturi_write_buf = '';
$school_crawl_geturi_headers_buf = '';
@@ -185,9 +196,28 @@ function school_crawl_geturi(&$uri, &$co
/* var_dump($post); */
$posttxt = '';
- foreach ($post as $postkey => $postval)
+ foreach ($post as $postkey => $postvals)
{
- $posttxt .= (strlen($posttxt) ? '&' : '')
+ /*
+ * This not escaping MEMBER thing is Calvin-specific
+ * too. Maybe we need a way to ask for some particular char
+ * not to be encoded?
+ */
+
+ /*
+ * Apparently, browsers like seamonkey will send multiple
+ * versions of <input name="field" /> if another input exists with name="field", like:
+ * field=1&field=blah. It seems like the webserver for
+ * ccbcmd cares about having these multiple values too...
+ *
+ * Yes, sending subj_sel=dummy&subj_sel=%25 made _all_ of
+ * the difference. Wow.
+ */
+ if (!is_array($postvals))
+ $postvals = array($postvals);
+ foreach ($postvals as $postval)
+ $posttxt .= (strlen($posttxt) ? '&' : '')
. urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
}
if ($verbosity > 8)
@@ -229,18 +259,42 @@ function school_crawl_geturi(&$uri, &$co
case 'Location':
$location = $header_val;
+ /* yes, a calvin-specific replacement :-/ */
$location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $location) . "\n";
$post = NULL;
break;
}
}
+ if ($follow_meta_refresh)
+ {
+ $dom = new DOMDocument();
+ $dom->loadHTML($school_crawl_geturi_write_buf);
+ foreach ($dom->getElementsByTagName('meta') as $meta_node)
+ if ($meta_node->hasAttribute('http-equiv')
+ && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv')))
+ {
+ $meta_content = $meta_node->getAttribute('content');
+ if ($verbosity > 2)
+ echo 'Following http-equiv Refresh: ' . $meta_content . PHP_EOL;
+ if (!(preg_match('/^[0-9]+; *url=(.*)$/', $meta_content, $meta_matches)))
+ {
+ echo 'Error following http-equiv Refresh: ' . $meta_content . PHP_EOL;
+ }
+ else
+ {
+ $location = $meta_matches[1];
+ $post = NULL;
+ }
+ }
+ }
+
if ($verbosity > 9)
echo $school_crawl_geturi_write_buf;
if ($location && $loopspin < 6)
{
$uri = $location;
- return school_crawl_geturi($uri, $cookies, $post, $loopspin + 1);
+ return school_crawl_geturi($uri, $cookies, $post, $follow_meta_refresh, $curlsetup_hook, $verbosity, $loopspin + 1);
}
return $school_crawl_geturi_write_buf;
}
@@ -258,3 +312,165 @@ function school_crawl_geturi_write_cb($c
$school_crawl_geturi_write_buf .= $write_buf;
return strlen($write_buf);
}
+
+/**
+ * \brief
+ *   Finds the closest ancestor of a DOM element with a certain tag
+ *   name.
+ *
+ * Useful for finding the <form /> element associated with a given
+ * <input /> or set of <input />s so that the form's action=""
+ * parameter may be found.
+ *
+ * The node itself passed in will be considered for whether or not it
+ * matches the $element_name.
+ *
+ * \param $node
+ *   The dom node whose ancestor should be found.
+ * \param $element_name
+ *   The tag name of the ancestor element which is requested. It is
+ *   compared against DOMElement::tagName with strcmp(), i.e.,
+ *   case-sensitively.
+ * \return
+ *   The DOMElement sought or NULL if not found.
+ */
+function school_crawl_element_ancestor(DOMElement $node, $element_name)
+{
+  /*
+   * Walk upwards iteratively and stop as soon as the parent is no
+   * longer a DOMElement. The parent of the root element is the
+   * DOMDocument itself (and a detached node has a NULL parent);
+   * neither may be handed to code expecting a DOMElement, so both
+   * cases simply end the search with NULL.
+   */
+  for ($current = $node; $current instanceof DOMElement; $current = $current->parentNode)
+    if (!strcmp($current->tagName, $element_name))
+      return $current;
+
+  return NULL;
+}
+
+/**
+ * \brief
+ *   Create an array based on an HTML form for submitting the form.
+ *
+ * Currently, this will only support the <input /> and <select />
+ * elements. Elements are found anywhere below the form node, not
+ * only among its direct children, since real-world forms usually
+ * wrap their controls in tables or other layout elements.
+ *
+ * \param $form_node
+ *   The dom node of the form.
+ * \return
+ *   An array suitable for passing to school_crawl_geturi(): each key
+ *   is a control name and each value is an array of the values
+ *   collected for that name, in document order (all <input />s
+ *   first, then the selected <option />s of each <select />).
+ */
+function school_crawl_form(DOMElement $form_node)
+{
+  $form = array();
+  $xpath = new DOMXPath($form_node->ownerDocument);
+
+  /* ``.//'' searches all descendants; a bare ``input'' would only match direct children. */
+  foreach ($xpath->query('.//input', $form_node) as $input_node)
+    {
+      if (!$input_node->hasAttribute('name'))
+        continue;
+
+      $input_name = $input_node->getAttribute('name');
+      if (!isset($form[$input_name]))
+        $form[$input_name] = array();
+
+      /* not sure about what best to do when there is no value=""; send an empty string */
+      $form[$input_name][] = $input_node->hasAttribute('value') ? $input_node->getAttribute('value') : '';
+    }
+
+  foreach ($xpath->query('.//select', $form_node) as $select_node)
+    {
+      if (!$select_node->hasAttribute('name'))
+        continue;
+
+      $select_name = $select_node->getAttribute('name');
+      if (!isset($form[$select_name]))
+        $form[$select_name] = array();
+
+      /*
+       * ``@selected'' tests for the attribute; a bare ``selected''
+       * would look for a child _element_ of that name and never
+       * match. Options without a value="" are skipped.
+       */
+      foreach ($xpath->query('.//option[@selected]', $select_node) as $option_node)
+        if ($option_node->hasAttribute('value'))
+          $form[$select_name][] = $option_node->getAttribute('value');
+    }
+
+  return $form;
+}
+
+/**
+ * \brief
+ *   Resolve a relativish URL.
+ *
+ * Handles, in this order: an empty $url (the original URL is
+ * returned), an absolute http(s) URL (returned untouched), a
+ * scheme-relative ``//host/path'' URL, a host-relative ``/path'' URL,
+ * a fragment-only or query-only URL, and finally a path relative to
+ * the directory of $orig_url, including ``./'' and ``../'' segments.
+ *
+ * \param $orig_url
+ *   The original URL. Should be an absolute http:// or https:// URL;
+ *   if it is not, $url is returned unchanged since there is nothing
+ *   to resolve it against.
+ * \param $url
+ *   The new URL to be reconciled with the original one.
+ * \return
+ *   A string, the new URL.
+ */
+function school_crawl_url($orig_url, $url)
+{
+  if (!strlen($url))
+    return $orig_url;
+
+  /* An already absolute, fully qualified URL always wins over $orig_url. */
+  if (preg_match(';^https?://;i', $url))
+    return $url;
+
+  /* Split the original URL into schema, hostname and path (sans query/fragment). */
+  if (!preg_match(';^(https?)://([^/?#]+)([^?#]*);i', $orig_url, $base))
+    return $url;
+  $root = $base[1] . '://' . $base[2];
+  $base_path = strlen($base[3]) ? $base[3] : '/';
+
+  /* scheme-relative: only the schema is inherited */
+  if (!strncmp($url, '//', 2))
+    return $base[1] . ':' . $url;
+
+  /* absolute path on the same host */
+  if (!strncmp($url, '/', 1))
+    return $root . $url;
+
+  /* fragment-only: keep everything of the original but its fragment */
+  if (!strncmp($url, '#', 1))
+    return preg_replace('/#.*$/', '', $orig_url) . $url;
+
+  /* Separate the query/fragment so that dots in it are not treated as path segments. */
+  $suffix = '';
+  $suffix_pos = strcspn($url, '?#');
+  if ($suffix_pos < strlen($url))
+    {
+      $suffix = substr($url, $suffix_pos);
+      $url = (string)substr($url, 0, $suffix_pos);
+    }
+
+  /* query-only: same document, different query */
+  if (!strlen($url))
+    return $root . $base_path . $suffix;
+
+  /*
+   * Relative path: append to the directory of the original path and
+   * then collapse ``.'' and ``..'' segments. $base_path always
+   * starts with '/', so strrpos() cannot fail here.
+   */
+  $pieces = explode('/', substr($base_path, 0, strrpos($base_path, '/') + 1) . $url);
+  /* drop the empty piece in front of the leading slash */
+  array_shift($pieces);
+  $last = count($pieces) - 1;
+  $resolved = array();
+  foreach ($pieces as $i => $piece)
+    {
+      if ($piece === '.' || $piece === '..')
+        {
+          if ($piece === '..')
+            /* climbing above the root is silently ignored */
+            array_pop($resolved);
+          if ($i === $last)
+            /* a trailing dot segment names a directory: keep the trailing slash */
+            $resolved[] = '';
+          continue;
+        }
+      $resolved[] = $piece;
+    }
+
+  return $root . '/' . implode('/', $resolved) . $suffix;
+}
+
+/**
+ * \brief
+ *   Map a name onto a column of the table with the help of the
+ *   row's <th /> cells.
+ *
+ * This should be a quite reliable way of matching the data that a
+ * user sees onto the actual data because, in most cases, HTML writers
+ * are forced to properly align <th /> and the following hundreds of
+ * <td />s for there to be a visual alignment.
+ *
+ * \param $tr_node
+ *   The <tr /> with the <th /> elements to resolve.
+ * \param $column_name
+ *   The name of the column to search for.
+ * \param $strcmp
+ *   The function to use with a strcmp() interface when judging
+ *   whether or not a cell's textContent matches $column_name.
+ * \param $trim
+ *   The function to apply to the cell's textContent before
+ *   subjecting it to the $strcmp test.
+ * \return
+ *   The 0-based index of the column offset or FALSE if the item isn't
+ *   found. This index ignores the existence of text elements, so be
+ *   careful in using the result: it is an index into the list
+ *   returned by school_crawl_table_rownodes(), not into childNodes.
+ */
+function school_crawl_table_resolve_column(DOMElement $tr_node, $column_name, $strcmp = 'strcasecmp', $trim = 'trim')
+{
+  $cells = school_crawl_table_rownodes($tr_node);
+
+  $column = 0;
+  while ($column < $cells->length)
+    {
+      $cell_text = $trim($cells->item($column)->textContent);
+      /* a strcmp()-style comparator signals equality with a falsy (0) result */
+      if (!$strcmp($column_name, $cell_text))
+        return $column;
+      $column ++;
+    }
+
+  return FALSE;
+}
+
+/**
+ * \brief
+ *   Get a DOMNodeList of a row's <th /> and <td /> elements without
+ *   #text elements in the way.
+ *
+ * Helpful when using school_crawl_table_resolve_column() to get data.
+ *
+ * \param $tr_node
+ *   The row whose cells are wanted.
+ * \return
+ *   A DOMNodeList of every <th /> or <td /> descendant of $tr_node,
+ *   in document order.
+ */
+function school_crawl_table_rownodes(DOMElement $tr_node)
+{
+  $cell_query = 'descendant::*[self::th or self::td]';
+  $row_xpath = new DOMXPath($tr_node->ownerDocument);
+  $cells = $row_xpath->query($cell_query, $tr_node);
+  return $cells;
+}
\ No newline at end of file
diff --git a/school.d/ccbcmd.crawl.inc b/school.d/ccbcmd.crawl.inc
new file mode 100644
--- /dev/null
+++ b/school.d/ccbcmd.crawl.inc
@@ -0,0 +1,254 @@
+<?php
+/*
+ *
+ * This file is a part of slate_permutate.
+ *
+ * slate_permutate is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * slate_permutate is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with slate_permutate. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \brief
+ *   Crawl CCBCMD's registration stuffage.
+ *
+ * Starting from the public schedule page, this picks the requested
+ * semester out of the term <select />, submits the subjects form
+ * asking for ``all'' subjects and then walks the resulting
+ * ``datadisplaytable'' row by row, adding one Section to $semester
+ * for every row that carries a section id.
+ *
+ * \param $semester
+ *   The Semester object which I should populate.
+ * \param $verbosity
+ *   A scale from 0 to 10 determining how loud I should be.
+ * \return
+ *   1 on failure, 0 on success.
+ */
+function ccbcmd_crawl(Semester $semester, $verbosity = 1)
+{
+  $cookies = array();
+
+  /*
+   * It seems that http://ccbcmd.edu/schedule/sched.html is what we're
+   * meant to start from. That's just a redirect to some other page
+   * from which we get a listing of available semesters and choose
+   * one.
+   */
+  $uri = 'http://ccbcmd.edu/schedule/sched.html';
+  $semesters_dom = new DOMDocument();
+  $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, NULL, TRUE, 'ccbcmd_crawl_curlhook', $verbosity));
+  $semesters_select_node = $semesters_dom->getElementById('term_input_id');
+  if ($semesters_select_node === NULL)
+    {
+      fprintf(STDERR, "Could not get list of available semesters to choose from\n");
+      return 1;
+    }
+
+  /*
+   * The year is cast to a string because stripos() in older PHP
+   * versions treats a non-string needle as a character ordinal
+   * instead of searching for its decimal representation.
+   */
+  $semester_strings = array((string)$semester->year_get(), ucfirst($semester->season_get()));
+  $semester_value = NULL;
+  foreach ($semesters_select_node->childNodes as $semesters_option_node)
+    {
+      /* childNodes also contains #text nodes, which have no attributes */
+      if (!($semesters_option_node instanceof DOMElement))
+        continue;
+
+      $semester_match = TRUE;
+      foreach ($semester_strings as $semester_string)
+        if (stripos($semesters_option_node->textContent, $semester_string) === FALSE)
+          {
+            $semester_match = FALSE;
+            break;
+          }
+      if ($semester_match)
+        {
+          $semester_value = $semesters_option_node->getAttribute('value');
+          break;
+        }
+    }
+
+  $semester_string = implode(' ', $semester_strings);
+  if ($semester_value === NULL)
+    {
+      fprintf(STDERR, "Could not find the desired semester, ``%s'', in the list of available semesters.\n",
+              $semester_string);
+      return 1;
+    }
+
+  /* $semesters_option_node still refers to the matching <option /> because of the break above */
+  if ($verbosity > 1)
+    fprintf(STDERR, "Found semester: %s=``%s''=``%s''.\n",
+            $semester_value, $semester_string, trim($semesters_option_node->textContent));
+  $semesters_form = school_crawl_element_ancestor($semesters_select_node, 'form');
+  if ($semesters_form === NULL)
+    {
+      fprintf(STDERR, "Unable to find <form /> associated with semester.\n");
+      return 1;
+    }
+  $semesters_post = school_crawl_form($semesters_form);
+  $semesters_post[$semesters_select_node->getAttribute('name')] = $semester_value;
+
+  $subjects_dom = new DOMDocument();
+  $uri = school_crawl_url($uri, $semesters_form->getAttribute('action'));
+  $subjects_dom->loadHTML(school_crawl_geturi($uri, $cookies, $semesters_post, TRUE, 'ccbcmd_crawl_curlhook', $verbosity));
+
+  $subjects_form_nodelist = $subjects_dom->getElementsByTagName('form');
+  if (!$subjects_form_nodelist->length)
+    {
+      fprintf(STDERR, "Unable to find <form /> to submit for the subjects choosing page.\n");
+      return 1;
+    }
+  $subjects_form_node = $subjects_form_nodelist->item(0);
+  $subjects_post = school_crawl_form($subjects_form_node);
+
+  $subjects_select_node = $subjects_dom->getElementById('subj_id');
+  if ($subjects_select_node === NULL)
+    {
+      fprintf(STDERR, "Unable to find the subject selection box on the subjects choosing page.\n");
+      return 1;
+    }
+  $subjects_select_name = $subjects_select_node->getAttribute('name');
+  foreach ($subjects_select_node->childNodes as $subjects_option_node)
+    if ($subjects_option_node instanceof DOMElement
+        && !strcasecmp('all', trim($subjects_option_node->textContent)))
+      {
+        /* Ask for ``all'' subjects, but don't send the same value twice. */
+        $subjects_value = $subjects_option_node->getAttribute('value');
+        if (!isset($subjects_post[$subjects_select_name])
+            || !in_array($subjects_value, $subjects_post[$subjects_select_name], TRUE))
+          $subjects_post[$subjects_select_name][] = $subjects_value;
+      }
+
+  $courses_dom = new DOMDocument();
+  $uri = school_crawl_url($uri, $subjects_form_node->getAttribute('action'));
+  $courses_dom->loadHTML(school_crawl_geturi($uri, $cookies, $subjects_post, TRUE, 'ccbcmd_crawl_curlhook', $verbosity));
+
+  $courses_xpath = new DOMXPath($courses_dom);
+
+  /* The second row of the table has all of the headers in it */
+  $tr_header_nodelist = $courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr[position()=2]');
+  if (!$tr_header_nodelist->length)
+    {
+      fprintf(STDERR, "Unable to find the row of the course/section data table which gives us the mappings of column names onto columns.\n");
+      return 1;
+    }
+  $tr_header_node = $tr_header_nodelist->item(0);
+
+  $section_offsets = array(
+    'registration_number' => school_crawl_table_resolve_column($tr_header_node, 'CRN'),
+    'section_id' => school_crawl_table_resolve_column($tr_header_node, 'subj/crse/sec'),
+    /* there's a boolean column which says whether or not the course has any prerequisites/corequisites.... */
+    'credits' => school_crawl_table_resolve_column($tr_header_node, 'credhrs'),
+    /* there's a column for the number of contact hours, vs. credit hours */
+  );
+  foreach (array('title', 'days', 'times', 'instructor', 'location') as $column_key)
+    $section_offsets[$column_key] = school_crawl_table_resolve_column($tr_header_node, $column_key);
+  /* there's also a column for ``session dates'' */
+
+  /*
+   * error check and calculate the largest column index we will
+   * read, which tells us how many cells a row must have to be
+   * usable
+   */
+  $max_offset = 0;
+  foreach ($section_offsets as $name => $value)
+    {
+      if ($value === FALSE)
+        {
+          fprintf(STDERR, "Unable to find column offset for `%s'.\n",
+                  $name);
+          return 1;
+        }
+      else
+        if ($verbosity > 6)
+          echo $name . ' -> ' . $value . PHP_EOL;
+
+      $max_offset = max($max_offset, $value);
+    }
+
+  foreach ($courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr') as $tr_node)
+    {
+      $children = school_crawl_table_rownodes($tr_node);
+      if ($children->length <= $max_offset)
+        /*
+         * Skip this row because it doesn't have all of the columns we
+         * want and thus it can't be a row containing information
+         * about a section. Offsets are 0-based, so a row needs
+         * strictly more than $max_offset cells for item($max_offset)
+         * to exist.
+         */
+        continue;
+      if (!strcmp($children->item($section_offsets['section_id'])->tagName, 'th'))
+        /*
+         * We've hit one of the <tr />s filled with <th />s. Skip
+         * this one.
+         */
+        continue;
+
+      /*
+       * There are some rows with the time set to TBA and with empty
+       * section_id columns. Respond to this by skipping empty
+       * section_id columns since there's no useful data in these
+       * rows. We use strlen() < 3 because trim() doesn't take care of
+       * non-breaking spaces (&nbsp;) :-/
+       */
+      $section_id = trim($children->item($section_offsets['section_id'])->textContent);
+      if (strlen($section_id) < 3)
+        continue;
+
+      $section_id_parts = Section::parse($section_id);
+
+      $registration_number = $children->item($section_offsets['registration_number'])->textContent;
+      $instructor = $children->item($section_offsets['instructor'])->textContent;
+
+      $section_meetings = array();
+      {
+        $time_range_text = $children->item($section_offsets['times'])->textContent;
+        if (strpos($time_range_text, 'TBA') !== FALSE)
+          {
+            /*
+             * Add the section to the autocomplete list, just without
+             * any meeting info (i.e., $section_meetings is still
+             * empty now).
+             */
+            $semester->section_add($section_id_parts['department'], $section_id_parts['course'],
+                                   new Section($section_id_parts['section'], $section_meetings, $registration_number, $instructor));
+            continue;
+          }
+        if (($dash_pos = strpos($time_range_text, '-')) === FALSE)
+          {
+            fprintf(STDERR, "Unable to understand course's time range format, cannot find dash: ``%s''.\n",
+                    $time_range_text);
+            return 1;
+          }
+
+        $time_start_text = substr($time_range_text, 0, $dash_pos);
+        $time_start = strptime($time_start_text, '%I:%M %p');
+        $time_end_text = substr($time_range_text, $dash_pos + 1);
+        /*
+         * Make sure that _only_ one date range is specified to ensure
+         * data integrity. I.e., make sure that the college doesn't
+         * suddenly support multiple meeting times without our
+         * anticipating that and then cause us to have invalid
+         * data. ;-). --binki
+         */
+        if (strpos($time_end_text, '-') !== FALSE)
+          {
+            fprintf(STDERR, "College seems to support multiple meeting times per semester which we don't know how to parse (even though slate_permutate itself can handle this situation): ``%s'' time_end_text: ``%s''.\n",
+                    $time_range_text, $time_end_text);
+            return 1;
+          }
+        $time_end = strptime($time_end_text, '%I:%M %p');
+        if ($time_end === FALSE || $time_start === FALSE)
+          {
+            fprintf(STDERR, "Error parsing start or end time: start: ``%s'' end: ``%s''.\n",
+                    $time_start_text, $time_end_text);
+            return 1;
+          }
+
+        $days = school_crawl_days_str_format($children->item($section_offsets['days'])->textContent);
+
+        $section_meetings[] = new SectionMeeting($days, school_crawl_time_format($time_start), school_crawl_time_format($time_end),
+                                                 $children->item($section_offsets['location'])->textContent);
+      }
+
+      $semester->section_add($section_id_parts['department'], $section_id_parts['course'],
+                             new Section($section_id_parts['section'], $section_meetings, $registration_number, $instructor));
+    }
+
+  return 0;
+}
+
+/**
+ * \brief
+ *   curl setup hook for CCBCMD: pins the SSL protocol version used
+ *   by the given curl handle.
+ *
+ * \param $curl
+ *   The curl handle to configure.
+ */
+function ccbcmd_crawl_curlhook(&$curl)
+{
+  /*
+   * The server's SSL setup is broken enough that the protocol has to
+   * be forced to SSLv2 or SSLv3. SSLv2 is not an option when curl is
+   * built against gnutls (too old a protocol for it), and the server
+   * chokes on gnutls's attempt at TLS. Even openssl's s_client
+   * fails unless --ssl2 or --ssl3 is given by hand. So version 3 it
+   * is, since that works with both.
+   */
+  $ssl_version = 3;
+  curl_setopt($curl, CURLOPT_SSLVERSION, $ssl_version);
+}
diff --git a/school.d/ccbcmd.inc b/school.d/ccbcmd.inc
new file mode 100644
--- /dev/null
+++ b/school.d/ccbcmd.inc
@@ -0,0 +1,31 @@
+<?php
+/*
+ *
+ * This file is a part of slate_permutate.
+ *
+ * slate_permutate is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * slate_permutate is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with slate_permutate. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \brief
+ *   Describe the Community College of Baltimore County.
+ *
+ * \return
+ *   An associative array with the keys name, url, domains (an array
+ *   of domain names), student_address and example_course_id.
+ */
+function ccbcmd_info()
+{
+  $info = array();
+  $info['name'] = 'Community College of Baltimore County';
+  $info['url'] = 'http://ccbcmd.edu/';
+  $info['domains'] = array('ccbcmd.edu');
+  $info['student_address'] = 'student';
+  $info['example_course_id'] = 'ENGL 101';
+  return $info;
+}