-- copyright_begin
-- IBM Confidential
-- OCO Source Materials
--
-- 5724-R21
-- © Copyright IBM Corp.  2006.   All Rights Reserved.
-- 
-- The source code for this program is not published or otherwise divested of its trade secrets, 
-- irrespective of what has been deposited with the U.S. Copyright Office.
-- copyright_end

--
-- The V8.3 web-crawler database schema
--
-- Create the crawler schema and make it the current schema, so that the
-- unqualified table and index names used by the statements that follow
-- resolve inside it.
-- NOTE(review): the brace-delimited token below looks like a template
-- placeholder for the schema name, presumably substituted before the script
-- is executed - confirm with the installer code.
CREATE SCHEMA {0};
SET SCHEMA {0};

--
-- Crawl data for one web server.
--
-- hosthash, protonum, hostname and port are the only mandatory columns.
-- Every other column is nullable. No primary key or unique constraint is
-- declared, and both indexes are non-unique, so the database does not itself
-- enforce one row per server.
-- NOTE(review): the *date columns are INTEGER rather than DATE or TIMESTAMP,
-- presumably epoch-based values - confirm the unit with the crawler code.
-- NOTE(review): buckethidx covers a leading prefix of bucketidx and may be
-- redundant - confirm nothing refers to it by name before removing it.
--
CREATE TABLE bucket
(
    hosthash          BIGINT        NOT NULL,
    protonum          SMALLINT      NOT NULL,
    hostname          VARCHAR(255)  NOT NULL,
    port              INTEGER       NOT NULL,
    dnsdate           INTEGER,
    dnsfailcount      INTEGER,
    robotsdate        INTEGER,
    robotsfailcount   INTEGER,
    servertype        VARCHAR(255),
    volatile          INTEGER,
    activatedate      INTEGER,
    scandate          INTEGER,
    scancount         INTEGER,
    crawldate         INTEGER,
    uncrawledcount    INTEGER,
    deactivatedate    INTEGER
);

CREATE INDEX buckethidx ON bucket (hosthash);
CREATE INDEX bucketidx  ON bucket (hosthash, dnsdate, robotsdate);

--
-- "robots.txt" content, one bucket per row.
--
-- hosthash is the only mandatory column. robotstxt is nullable and declared
-- with a maximum length of 31744. bothostidx is a non-unique index, so the
-- database does not itself prevent several rows for the same hosthash.
-- NOTE(review): hosthash presumably matches bucket.hosthash, but no foreign
-- key is declared - confirm.
--
CREATE TABLE robots
(
    hosthash    BIGINT          NOT NULL,
    robotstxt   VARCHAR(31744)
);
CREATE INDEX bothostidx ON robots (hosthash);

-- Binary addresses recorded per hosthash.
--
-- address is declared FOR BIT DATA, so it holds raw bytes (at most 16) rather
-- than character data. iphostidx is non-unique, so several address rows may
-- share one hosthash.
-- NOTE(review): 16 bytes would fit a raw IPv6 address and 4 bytes a raw IPv4
-- one, which is presumably the encoding used - confirm with the crawler code.
--
CREATE TABLE ip
(
    hosthash   BIGINT                    NOT NULL,
    address    VARCHAR(16) FOR BIT DATA  NOT NULL
);
CREATE INDEX iphostidx ON ip (hosthash);


--
-- Crawl data for one URL.
--
-- hosthash, urlhash and urlstr are mandatory. All other columns are nullable.
-- httpcodecount defaults to 0 when no value is supplied.
-- urlhashidx is UNIQUE, so urlhash acts as the key of this table even though
-- no primary key is declared. urlidx supports lookups by hosthash together
-- with recrawldate.
-- NOTE(review): the *date columns are INTEGER rather than DATE or TIMESTAMP,
-- presumably epoch-based values - confirm the unit with the crawler code.
-- NOTE(review): hosthash presumably matches bucket.hosthash, but no foreign
-- key is declared - confirm.
--
CREATE TABLE url
(
    httpcode        SMALLINT,
    httpcodecount   SMALLINT       WITH DEFAULT 0,
    hosthash        BIGINT         NOT NULL,
    urlhash         BIGINT         NOT NULL,
    discoverer      BIGINT,
    creationdate    INTEGER,
    crawldate       INTEGER,
    recrawldate     INTEGER,
    serverdate      INTEGER,
    expiredate      INTEGER,
    modifieddate    INTEGER,
    fingerprint     INTEGER,
    flags1          INTEGER,
    urlstr          VARCHAR(4096)  NOT NULL
);
CREATE UNIQUE INDEX urlhashidx ON url (urlhash);
CREATE INDEX urlidx ON url (hosthash, recrawldate);
-- The fingerprint index below is commented out and therefore not created:
-- CREATE INDEX fingeridx ON url(fingerprint);


--
-- Crawl data for one URL belonging to an archive file.
--
-- URIHASH, PARENTURIHASH and URISTRING are mandatory. FINGERPRINT is nullable.
-- arcuriidx is a non-unique index on URIHASH, so the database does not itself
-- enforce one row per URIHASH.
-- NOTE(review): PARENTURIHASH presumably identifies the containing archive,
-- possibly via url.urlhash, but no foreign key is declared - confirm.
--
CREATE TABLE ARCHIVEDATA
(
    URIHASH         BIGINT         NOT NULL,
    PARENTURIHASH   BIGINT         NOT NULL,
    FINGERPRINT     INTEGER,
    URISTRING       VARCHAR(4096)  NOT NULL
);
CREATE INDEX arcuriidx ON ARCHIVEDATA (URIHASH);

-- Stored cookies, addressed by HOSTHASH and NAMEHASH.
--
-- PATH and DOMAIN are the only nullable columns. No primary key or unique
-- constraint is declared, and both indexes are non-unique.
-- ckhostidx covers HOSTHASH with EXPIRES. ckhostpathidx, despite its name,
-- covers HOSTHASH with NAMEHASH and does not include PATH.
-- NOTE(review): SECURE is a single character with no CHECK constraint, so the
-- set of accepted flag values is defined by the application - confirm.
-- NOTE(review): EXPIRES is INTEGER, presumably an epoch-based value - confirm
-- the unit with the crawler code.
--
CREATE TABLE COOKIE
(
    HOSTHASH   BIGINT         NOT NULL,
    NAMEHASH   BIGINT         NOT NULL,
    EXPIRES    INTEGER        NOT NULL,
    SECURE     CHAR(1)        NOT NULL,
    VERSION    SMALLINT       NOT NULL,
    NAME       VARCHAR(1024)  NOT NULL,
    VALUE      VARCHAR(4096)  NOT NULL,
    PATH       VARCHAR(1024),
    DOMAIN     VARCHAR(1024)
);

CREATE INDEX ckhostidx     ON COOKIE (HOSTHASH, EXPIRES);
CREATE INDEX ckhostpathidx ON COOKIE (HOSTHASH, NAMEHASH);
-- The expires-only index below is commented out and therefore not created:
-- CREATE INDEX ckexipresidx on COOKIE(expires);

-- Configuration entries: a mandatory integer name paired with a nullable
-- binary value of up to 1 gigabyte.
-- No key or index is declared on name in this part of the script, so
-- duplicate names are not rejected by the database.
-- NOTE(review): the meaning of each integer name and the format of the BLOB
-- payload are defined by the application - confirm with the crawler code.
--
CREATE TABLE CONFIG
(
    name    INTEGER   NOT NULL,
    value   BLOB(1G)
);

   