-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathUploadToDatabase
More file actions
executable file
·125 lines (103 loc) · 3.76 KB
/
UploadToDatabase
File metadata and controls
executable file
·125 lines (103 loc) · 3.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/bin/bash
#**************************************************************
# EPPIC Pre-Computation Script
# Author : Kumaran Baskaran
# Date : 05/07/2013
#***************************************************************
# Abort the script as soon as any command exits with a non-zero status.
set -e
# Usage text shown for -h and on argument errors.
# It is printed with an unquoted `echo -e $help`: the shell collapses the
# literal line breaks below into single spaces and the "\n" / "\t" escapes
# provide the actual layout. For the same reason the text must not contain
# glob patterns (e.g. "*", "?" or a complete "[...]" within one word).
help="\t\tUsage : $0 \n
\t\t [-d <dir-path> : Directory containing the downloaded files\n
\t\t -j <file-path> : eppic jar file path\n
\t\t -n <name> : database name (usually the uniprot release name)\n
\t\t -h : print this help]\n
\t\tExample : $0 -d ~/Downloads -j ~/eppic.jar -n uniprot_2013_05 "
# Invoked without any argument: report the problem on stderr, print the
# usage text on stdout and stop with exit status 1.
case $# in
  0)
    echo "$(date +%d/%m/%y-%H:%M:%S) ERROR: Script requires arguments" >&2
    echo -e $help
    exit 1
    ;;
esac
#######################################
# Parse the command-line options into global variables.
# Globals:
#   DOWNLOAD       (written) value given with -d
#   EPPIC_JAR      (written) value given with -j
#   DATABASE_NAME  (written) value given with -n
#   help           (read)    usage text printed for -h
# Arguments:
#   The script's command-line arguments ("$@").
# Outputs:
#   Usage text on stdout for -h; error messages on stderr.
# Returns:
#   Exits 0 after -h; exits 1 on an unknown option or a missing option value.
#######################################
parse_options() {
  # A local OPTIND makes getopts start at the first argument on every call.
  local opt OPTIND=1
  # The leading ':' in the option string selects silent error reporting, so
  # the '\?' and ':' arms below print the messages instead of getopts.
  while getopts :d:j:n:h opt; do
    case "$opt" in
      d) DOWNLOAD=$OPTARG ;;
      j) EPPIC_JAR=$OPTARG ;;
      n) DATABASE_NAME=$OPTARG ;;
      h)
        # Asking for help is not an error: print the usage and stop here
        # instead of falling through to the "options not specified" failure.
        # $help is deliberately unquoted; its layout relies on word splitting.
        echo -e $help
        exit 0
        ;;
      \?)
        echo "$(date +%d/%m/%y-%H:%M:%S) ERROR: Invalid option: -$OPTARG" >&2
        exit 1
        ;;
      :)
        echo "$(date +%d/%m/%y-%H:%M:%S) ERROR: Option -$OPTARG requires an argument." >&2
        exit 1
        ;;
    esac
  done
}
parse_options "$@"
# All three options (-d, -j, -n) are mandatory: stop with the usage text if
# any of them is missing or empty. The expansions are quoted inside [[ ]] so
# a value containing whitespace is tested as one word instead of making the
# test itself fail with a syntax error.
if [[ -z "$DOWNLOAD" || -z "$EPPIC_JAR" || -z "$DATABASE_NAME" ]]; then
  echo -e "\n$(date +%d/%m/%y-%H:%M:%S) ERROR: ---------Some Options not specified correctly -------" >&2
  # $help is deliberately unquoted; its layout relies on word splitting.
  echo -e $help
  exit 1
fi
# Verify that every input exists before any work is started.
# The paths are quoted: unquoted, a path containing whitespace makes the
# test fail with a syntax error, the condition then counts as false and the
# existence check is silently skipped.
if [[ ! -e "$EPPIC_JAR" ]]; then
  echo -e "\t $(date +%d/%m/%y-%H:%M:%S) ERROR: Couldn't find $EPPIC_JAR ..........." >&2
  exit 1
fi
# Files expected inside the download directory given with -d.
# NOTE(review): taxonomy-all.tab is checked here but not read anywhere else
# in this script - presumably a later step needs it; confirm.
for required_file in "$DOWNLOAD/uniref100.xml.gz" "$DOWNLOAD/taxonomy-all.tab"; do
  if [[ ! -e "$required_file" ]]; then
    echo "$(date +%d/%m/%y-%H:%M:%S) ERROR: Couldn't find $required_file" >&2
    exit 1
  fi
done
echo "$(date +%d/%m/%y-%H:%M:%S) INFO: Creating output tab and cluster members tab file.."
# Run the UnirefXMLParser class from the eppic jar on uniref100.xml.gz. The
# other two arguments are the tab files that the load step imports later.
# Every argument is quoted so that paths containing whitespace reach java as
# single words (building the command in a string and expanding it unquoted
# would split them).
java -cp "$EPPIC_JAR" owl.core.connections.UnirefXMLParser \
  "$DOWNLOAD/uniref100.xml.gz" \
  "$DOWNLOAD/uniref100.tab" \
  "$DOWNLOAD/uniref100.clustermembers.tab" || exit 1
# (Re)create the target database: an existing database with the same name is
# dropped first, so the load always starts from an empty database.
# The message uses DATABASE_NAME, the variable set by the -n option.
echo "$(date +%d/%m/%y-%H:%M:%S) INFO: Creating a new database named $DATABASE_NAME .. "
mysql <<EOF
DROP DATABASE IF EXISTS $DATABASE_NAME;
CREATE DATABASE $DATABASE_NAME;
EOF
#---------------------------------------------------------------
# Create the uniprot tables: "uniprot" and "uniprot_clusters".
#
# Notes from the original author:
#  1. uniprot ids are 6 characters long, but isoform ids can be 8 or 9
#     characters (an additional hyphen and number).
#  2. Early uniprot versions contain duplicate uniprot ids (e.g. Q9WHW0 in
#     version 1.0). The uniprot table is therefore created with a primary
#     key on uniparc_id, and uniprot_clusters with a primary key on member,
#     so that duplicates are eliminated while loading.
#  3. All other indexes are created in a separate step after the load, which
#     keeps the load itself a lot quicker.
#
# Both tables are dropped first if they already exist.
table=uniprot
printf '%s\n' "$(date +%d/%m/%y-%H:%M:%S) INFO: Creating $table tables.."
mysql $DATABASE_NAME <<EOF
DROP TABLE IF EXISTS $table;
DROP TABLE IF EXISTS ${table}_clusters;
CREATE TABLE $table (
id varchar(23),
uniprot_id varchar(15),
uniparc_id char(13) PRIMARY KEY,
tax_id int,
sequence text
);
CREATE TABLE ${table}_clusters (
representative varchar(15),
member varchar(15) PRIMARY KEY,
tax_id int
);
EOF
#------------------------------------------------------
# Load the two tab files from the download directory into the uniprot
# tables: uniref100.tab into $table and uniref100.clustermembers.tab into
# ${table}_clusters. SHOW WARNINGS after each load prints whatever the load
# reported.
#
# LOAD DATA LOCAL has to be enabled on the client. The option is spelled out
# in full as --local-infile=1: the abbreviated form "--enable-local" relies
# on option-prefix matching, which newer MySQL clients reject.
echo "$(date +%d/%m/%y-%H:%M:%S) INFO: Loading Data in to $table table "
mysql --local-infile=1 "$DATABASE_NAME" <<EOF
LOAD DATA LOCAL INFILE '$DOWNLOAD/uniref100.tab' INTO TABLE $table;
SHOW WARNINGS;
LOAD DATA LOCAL INFILE '$DOWNLOAD/uniref100.clustermembers.tab' INTO TABLE ${table}_clusters;
SHOW WARNINGS;
EOF
#--------------------------------------------------------------
# Uniprot database indices
#
# Runs after the data load: adds the index UNIPROTID_IDX on the uniprot_id
# column of the uniprot table, then logs that the upload has finished.
#--------------------------------------------------------------
printf '%s\n' "Creating indices.."
mysql $DATABASE_NAME <<EOF
CREATE INDEX UNIPROTID_IDX ON uniprot (uniprot_id);
EOF
printf '%s\n' "$(date +%d/%m/%y-%H:%M:%S) INFO: Finished loading data into database."