1
2
3
4
5
6
7 """
8 This module provides code to work with the standalone version of
9 BLAST, either blastall or blastpgp, provided by the NCBI.
10 http://www.ncbi.nlm.nih.gov/BLAST/
11
12 Classes:
13 LowQualityBlastError Except that indicates low quality query sequences.
14 BlastParser Parses output from blast.
15 BlastErrorParser Parses output and tries to diagnose possible errors.
16 PSIBlastParser Parses output from psi-blast.
17 Iterator Iterates over a file of blast results.
18
19 _Scanner Scans output from standalone BLAST.
20 _BlastConsumer Consumes output from blast.
21 _PSIBlastConsumer Consumes output from psi-blast.
22 _HeaderConsumer Consumes header information.
23 _DescriptionConsumer Consumes description information.
24 _AlignmentConsumer Consumes alignment information.
25 _HSPConsumer Consumes hsp information.
26 _DatabaseReportConsumer Consumes database report information.
27 _ParametersConsumer Consumes parameters information.
28
29 Functions:
30 blastall Execute blastall.
31 blastpgp Execute blastpgp.
32 rpsblast Execute rpsblast.
33
34 """
35
36 from __future__ import generators
37 import os
38 import re
39
40 from Bio import File
41 from Bio.ParserSupport import *
42 from Bio.Blast import Record
43
44
46 """Error caused by running a low quality sequence through BLAST.
47
48 When low quality sequences (like GenBank entries containing only
49 stretches of a single nucleotide) are BLASTed, they will result in
50 BLAST generating an error and not being able to perform the BLAST.
51 search. This error should be raised for the BLAST reports produced
52 in this case.
53 """
54 pass
55
57 """Error caused by running a short query sequence through BLAST.
58
59 If the query sequence is too short, BLAST outputs warnings and errors:
60 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed.
61 [blastall] ERROR: [000.000] AT1G08320: Blast:
62 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize
63 done
64
65 This exception is raised when that condition is detected.
66
67 """
68 pass
69
70
72 """Scan BLAST output from blastall or blastpgp.
73
74 Tested with blastall and blastpgp v2.0.10, v2.0.11
75
76 Methods:
77 feed Feed data into the scanner.
78
79 """
80 - def feed(self, handle, consumer):
81 """S.feed(handle, consumer)
82
83 Feed in a BLAST report for scanning. handle is a file-like
84 object that contains the BLAST report. consumer is a Consumer
85 object that will receive events as the report is scanned.
86
87 """
88 if isinstance(handle, File.UndoHandle):
89 uhandle = handle
90 else:
91 uhandle = File.UndoHandle(handle)
92
93
94 read_and_call_until(uhandle, consumer.noevent, contains='BLAST')
95
96 self._scan_header(uhandle, consumer)
97 self._scan_rounds(uhandle, consumer)
98 self._scan_database_report(uhandle, consumer)
99 self._scan_parameters(uhandle, consumer)
100
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117 consumer.start_header()
118
119 read_and_call(uhandle, consumer.version, contains='BLAST')
120 read_and_call_while(uhandle, consumer.noevent, blank=1)
121
122
123
124 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
125 read_and_call(uhandle, consumer.reference, start='Reference')
126 while 1:
127 line = uhandle.readline()
128 if is_blank_line(line) or line.startswith("RID"):
129 consumer.noevent(line)
130 read_and_call_while(uhandle, consumer.noevent, blank=1)
131 break
132 consumer.reference(line)
133
134
135 if attempt_read_and_call(
136 uhandle, consumer.reference, start="Reference"):
137 read_and_call_until(uhandle, consumer.reference, blank=1)
138 read_and_call_while(uhandle, consumer.noevent, blank=1)
139
140
141 read_and_call(uhandle, consumer.query_info, start='Query=')
142 read_and_call_until(uhandle, consumer.query_info, blank=1)
143 read_and_call_while(uhandle, consumer.noevent, blank=1)
144
145
146 read_and_call_until(uhandle, consumer.database_info, end='total letters')
147 read_and_call(uhandle, consumer.database_info, contains='sequences')
148 read_and_call_while(uhandle, consumer.noevent, blank=1)
149
150 consumer.end_header()
151
153
154
155
156
157
158
159
160 while 1:
161 line = safe_peekline(uhandle)
162 if (not line.startswith('Searching') and
163 not line.startswith('Results from round') and
164 re.search(r"Score +E", line) is None and
165 line.find('No hits found') == -1):
166 break
167
168 self._scan_descriptions(uhandle, consumer)
169 self._scan_alignments(uhandle, consumer)
170
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194 consumer.start_descriptions()
195
196
197
198 attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
199
200
201
202 if not uhandle.peekline():
203 raise SyntaxError, "Unexpected end of blast report. " + \
204 "Looks suspiciously like a PSI-BLAST crash."
205
206
207
208
209
210
211
212
213
214 line = uhandle.peekline()
215 if line.find("ERROR:") != -1 or line.startswith("done"):
216 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
217 read_and_call(uhandle, consumer.noevent, start="done")
218
219
220
221
222
223
224
225
226
227
228
229
230
231 read_and_call_while(uhandle, consumer.noevent, blank=1)
232
233 if attempt_read_and_call(uhandle, consumer.round, start='Results'):
234 read_and_call_while(uhandle, consumer.noevent, blank=1)
235
236
237
238
239
240
241
242
243 if not attempt_read_and_call(
244 uhandle, consumer.description_header,
245 has_re=re.compile(r'Score +E')):
246
247 attempt_read_and_call(uhandle, consumer.no_hits,
248 contains='No hits found')
249 read_and_call_while(uhandle, consumer.noevent, blank=1)
250
251 consumer.end_descriptions()
252
253 return
254
255
256 read_and_call(uhandle, consumer.description_header,
257 start='Sequences producing')
258
259
260 attempt_read_and_call(uhandle, consumer.model_sequences,
261 start='Sequences used in model')
262 read_and_call_while(uhandle, consumer.noevent, blank=1)
263
264
265
266 if not uhandle.peekline().startswith('Sequences not found'):
267 read_and_call_until(uhandle, consumer.description, blank=1)
268 read_and_call_while(uhandle, consumer.noevent, blank=1)
269
270
271
272
273
274 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
275 start='Sequences not found'):
276
277 read_and_call_while(uhandle, consumer.noevent, blank=1)
278 l = safe_peekline(uhandle)
279
280
281
282
283 if not l.startswith('CONVERGED') and l[0] != '>' \
284 and not l.startswith('QUERY'):
285 read_and_call_until(uhandle, consumer.description, blank=1)
286 read_and_call_while(uhandle, consumer.noevent, blank=1)
287
288 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
289 read_and_call_while(uhandle, consumer.noevent, blank=1)
290
291 consumer.end_descriptions()
292
307
314
327
329
330
331
332
333 read_and_call(uhandle, consumer.title, start='>')
334 while 1:
335 line = safe_readline(uhandle)
336 if line.lstrip().startswith('Length ='):
337 consumer.length(line)
338 break
339 elif is_blank_line(line):
340
341 raise SyntaxError, "I missed the Length in an alignment header"
342 consumer.title(line)
343
344
345
346 if not attempt_read_and_call(uhandle, consumer.noevent,
347 start=' '):
348 read_and_call(uhandle, consumer.noevent, blank=1)
349
355
357
358
359
360
361
362
363 read_and_call(uhandle, consumer.score, start=' Score')
364 read_and_call(uhandle, consumer.identities, start=' Identities')
365
366 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
367
368 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
369 read_and_call(uhandle, consumer.noevent, blank=1)
370
372
373
374
375
376
377
378
379
380
381 while 1:
382
383 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
384 read_and_call(uhandle, consumer.query, start='Query')
385 read_and_call(uhandle, consumer.align, start=' ')
386 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
387 read_and_call_while(uhandle, consumer.noevent, blank=1)
388 line = safe_peekline(uhandle)
389
390 if not (line.startswith('Query') or line.startswith(' ')):
391 break
392
394 consumer.start_alignment()
395 while 1:
396 line = safe_readline(uhandle)
397
398
399
400
401
402 if line.startswith('Searching') or \
403 line.startswith('Results from round'):
404 uhandle.saveline(line)
405 break
406 elif line.startswith(' Database'):
407 uhandle.saveline(line)
408 break
409 elif is_blank_line(line):
410 consumer.noevent(line)
411 else:
412 consumer.multalign(line)
413 read_and_call_while(uhandle, consumer.noevent, blank=1)
414 consumer.end_alignment()
415
417
418
419
420
421
422
423
424
425
426
427
428
429
430 consumer.start_database_report()
431
432
433
434
435 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"):
436 read_and_call(uhandle, consumer.noevent, contains="letters")
437 read_and_call(uhandle, consumer.noevent, contains="sequences")
438 read_and_call(uhandle, consumer.noevent, start=" ")
439
440
441
442 while attempt_read_and_call(uhandle, consumer.database,
443 start=' Database'):
444
445
446
447 if not uhandle.peekline():
448 consumer.end_database_report()
449 return
450
451
452 read_and_call_until(uhandle, consumer.database, start=' Posted')
453 read_and_call(uhandle, consumer.posted_date, start=' Posted')
454 read_and_call(uhandle, consumer.num_letters_in_database,
455 start=' Number of letters')
456 read_and_call(uhandle, consumer.num_sequences_in_database,
457 start=' Number of sequences')
458 read_and_call(uhandle, consumer.noevent, start=' ')
459
460 line = safe_readline(uhandle)
461 uhandle.saveline(line)
462 if line.find('Lambda') != -1:
463 break
464
465 read_and_call(uhandle, consumer.noevent, start='Lambda')
466 read_and_call(uhandle, consumer.ka_params)
467 read_and_call(uhandle, consumer.noevent, blank=1)
468
469
470 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
471
472 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
473 read_and_call(uhandle, consumer.ka_params_gap)
474
475
476
477
478 try:
479 read_and_call_while(uhandle, consumer.noevent, blank=1)
480 except SyntaxError, x:
481 if str(x) != "Unexpected end of stream.":
482 raise
483 consumer.end_database_report()
484
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515 if not uhandle.peekline():
516 return
517
518
519
520 consumer.start_parameters()
521
522
523 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
524
525 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
526
527 attempt_read_and_call(uhandle, consumer.num_sequences,
528 start='Number of Sequences')
529 read_and_call(uhandle, consumer.num_hits,
530 start='Number of Hits')
531 attempt_read_and_call(uhandle, consumer.num_sequences,
532 start='Number of Sequences')
533 read_and_call(uhandle, consumer.num_extends,
534 start='Number of extensions')
535 read_and_call(uhandle, consumer.num_good_extends,
536 start='Number of successful')
537
538 read_and_call(uhandle, consumer.num_seqs_better_e,
539 start='Number of sequences')
540
541
542 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
543 start="Number of HSP's better"):
544
545
546 if attempt_read_and_call(uhandle, consumer.noevent,
547 start="Number of HSP's gapped:"):
548 read_and_call(uhandle, consumer.noevent,
549 start="Number of HSP's successfully")
550 read_and_call(uhandle, consumer.noevent,
551 start="Number of extra gapped extensions")
552 else:
553 read_and_call(uhandle, consumer.hsps_prelim_gapped,
554 start="Number of HSP's successfully")
555 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
556 start="Number of HSP's that")
557 read_and_call(uhandle, consumer.hsps_gapped,
558 start="Number of HSP's gapped")
559
560
561 attempt_read_and_call(uhandle, consumer.noevent,
562 start="Number of HSP's gapped:")
563
564 attempt_read_and_call(uhandle, consumer.noevent,
565 start="Number of HSP's successfully")
566
567
568 attempt_read_and_call(uhandle, consumer.query_length,
569 has_re=re.compile(r"[Ll]ength of query"))
570 read_and_call(uhandle, consumer.database_length,
571 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
572
573
574 attempt_read_and_call(uhandle, consumer.noevent,
575 start="Length adjustment")
576 attempt_read_and_call(uhandle, consumer.effective_hsp_length,
577 start='effective HSP')
578
579 attempt_read_and_call(
580 uhandle, consumer.effective_query_length,
581 has_re=re.compile(r'[Ee]ffective length of query'))
582 read_and_call(
583 uhandle, consumer.effective_database_length,
584 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
585
586
587 attempt_read_and_call(
588 uhandle, consumer.effective_search_space,
589 has_re=re.compile(r'[Ee]ffective search space:'))
590
591 attempt_read_and_call(
592 uhandle, consumer.effective_search_space_used,
593 has_re=re.compile(r'[Ee]ffective search space used'))
594
595
596 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
597
598 attempt_read_and_call(uhandle, consumer.threshold, start='T')
599 attempt_read_and_call(uhandle, consumer.window_size, start='A')
600
601 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring')
602 attempt_read_and_call(uhandle, consumer.window_size, start='Window')
603
604 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
605 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
606
607 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
608 start='X3')
609 read_and_call(uhandle, consumer.gap_trigger, start='S1')
610
611
612
613 if not is_blank_line(uhandle.peekline(), allow_spaces=1):
614 read_and_call(uhandle, consumer.blast_cutoff, start='S2')
615
616 consumer.end_parameters()
617
619 """Parses BLAST data into a Record.Blast object.
620
621 """
626
627 - def parse(self, handle):
628 """parse(self, handle)"""
629 self._scanner.feed(handle, self._consumer)
630 return self._consumer.data
631
633 """Parses BLAST data into a Record.PSIBlast object.
634
635 """
640
641 - def parse(self, handle):
642 """parse(self, handle)"""
643 self._scanner.feed(handle, self._consumer)
644 return self._consumer.data
645
648 self._header = Record.Header()
649
651 c = line.split()
652 self._header.application = c[0]
653 self._header.version = c[1]
654 self._header.date = c[2][1:-1]
655
657 if line.startswith('Reference: '):
658 self._header.reference = line[11:]
659 else:
660 self._header.reference = self._header.reference + line
661
663 if line.startswith('Query= '):
664 self._header.query = line[7:]
665 elif not line.startswith(' '):
666 self._header.query = "%s%s" % (self._header.query, line)
667 else:
668 letters, = _re_search(
669 r"([0-9,]+) letters", line,
670 "I could not find the number of letters in line\n%s" % line)
671 self._header.query_letters = _safe_int(letters)
672
674 line = line.rstrip()
675 if line.startswith('Database: '):
676 self._header.database = line[10:]
677 elif not line.endswith('total letters'):
678 self._header.database = self._header.database + line.strip()
679 else:
680 sequences, letters =_re_search(
681 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line,
682 "I could not find the sequences and letters in line\n%s" %line)
683 self._header.database_sequences = _safe_int(sequences)
684 self._header.database_letters = _safe_int(letters)
685
690
693 self._descriptions = []
694 self._model_sequences = []
695 self._nonmodel_sequences = []
696 self._converged = 0
697 self._type = None
698 self._roundnum = None
699
700 self.__has_n = 0
701
703 if line.startswith('Sequences producing'):
704 cols = line.split()
705 if cols[-1] == 'N':
706 self.__has_n = 1
707
709 dh = self._parse(line)
710 if self._type == 'model':
711 self._model_sequences.append(dh)
712 elif self._type == 'nonmodel':
713 self._nonmodel_sequences.append(dh)
714 else:
715 self._descriptions.append(dh)
716
718 self._type = 'model'
719
721 self._type = 'nonmodel'
722
724 self._converged = 1
725
727 pass
728
730 if not line.startswith('Results from round'):
731 raise SyntaxError, "I didn't understand the round line\n%s" % line
732 self._roundnum = _safe_int(line[18:].strip())
733
735 pass
736
737 - def _parse(self, description_line):
738 line = description_line
739 dh = Record.Description()
740
741
742
743
744
745
746
747
748 cols = line.split()
749 if len(cols) < 3:
750 raise SyntaxError, \
751 "Line does not appear to contain description:\n%s" % line
752 if self.__has_n:
753 i = line.rfind(cols[-1])
754 i = line.rfind(cols[-2], 0, i)
755 i = line.rfind(cols[-3], 0, i)
756 else:
757 i = line.rfind(cols[-1])
758 i = line.rfind(cols[-2], 0, i)
759 if self.__has_n:
760 dh.title, dh.score, dh.e, dh.num_alignments = \
761 line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
762 else:
763 dh.title, dh.score, dh.e, dh.num_alignments = \
764 line[:i].rstrip(), cols[-2], cols[-1], 1
765 dh.num_alignments = _safe_int(dh.num_alignments)
766 dh.score = _safe_int(dh.score)
767 dh.e = _safe_float(dh.e)
768 return dh
769
771
772
773
774
776 self._alignment = Record.Alignment()
777 self._multiple_alignment = Record.MultipleAlignment()
778
780 self._alignment.title = "%s%s" % (self._alignment.title,
781 line.lstrip())
782
786
788
789 if line.startswith('QUERY') or line.startswith('blast_tmp'):
790
791
792
793
794
795 try:
796 name, start, seq, end = line.split()
797 except ValueError:
798 raise SyntaxError, "I do not understand the line\n%s" \
799 % line
800 self._start_index = line.index(start, len(name))
801 self._seq_index = line.index(seq,
802 self._start_index+len(start))
803
804 self._name_length = self._start_index - 1
805 self._start_length = self._seq_index - self._start_index - 1
806 self._seq_length = line.rfind(end) - self._seq_index - 1
807
808
809
810
811
812
813
814
815
816 name = line[:self._name_length]
817 name = name.rstrip()
818 start = line[self._start_index:self._start_index+self._start_length]
819 start = start.rstrip()
820 if start:
821 start = _safe_int(start)
822 end = line[self._seq_index+self._seq_length:].rstrip()
823 if end:
824 end = _safe_int(end)
825 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
826
827 if len(seq) < self._seq_length:
828 seq = seq + ' '*(self._seq_length-len(seq))
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847 align = self._multiple_alignment.alignment
848 align.append((name, start, seq, end))
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
898
899 if self._alignment:
900 self._alignment.title = self._alignment.title.rstrip()
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922 try:
923 del self._seq_index
924 del self._seq_length
925 del self._start_index
926 del self._start_length
927 del self._name_length
928 except AttributeError:
929 pass
930
933 self._hsp = Record.HSP()
934
936 self._hsp.bits, self._hsp.score = _re_search(
937 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
938 "I could not find the score in line\n%s" % line)
939 self._hsp.score = _safe_float(self._hsp.score)
940 self._hsp.bits = _safe_float(self._hsp.bits)
941
942 x, y = _re_search(
943 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
944 "I could not find the expect in line\n%s" % line)
945 if x:
946 self._hsp.num_alignments = _safe_int(x)
947 else:
948 self._hsp.num_alignments = 1
949 self._hsp.expect = _safe_float(y)
950
952 x, y = _re_search(
953 r"Identities = (\d+)\/(\d+)", line,
954 "I could not find the identities in line\n%s" % line)
955 self._hsp.identities = _safe_int(x), _safe_int(y)
956
957 if line.find('Positives') != -1:
958 x, y = _re_search(
959 r"Positives = (\d+)\/(\d+)", line,
960 "I could not find the positives in line\n%s" % line)
961 self._hsp.positives = _safe_int(x), _safe_int(y)
962
963 if line.find('Gaps') != -1:
964 x, y = _re_search(
965 r"Gaps = (\d+)\/(\d+)", line,
966 "I could not find the gaps in line\n%s" % line)
967 self._hsp.gaps = _safe_int(x), _safe_int(y)
968
969
971 self._hsp.strand = _re_search(
972 r"Strand = (\w+) / (\w+)", line,
973 "I could not find the strand in line\n%s" % line)
974
976
977
978
979 if line.find('/') != -1:
980 self._hsp.frame = _re_search(
981 r"Frame = ([-+][123]) / ([-+][123])", line,
982 "I could not find the frame in line\n%s" % line)
983 else:
984 self._hsp.frame = _re_search(
985 r"Frame = ([-+][123])", line,
986 "I could not find the frame in line\n%s" % line)
987
988
989
990
991
992 _query_re = re.compile(r"Query: \s*(\d+)\s*(.+) (\d+)")
994 m = self._query_re.search(line)
995 if m is None:
996 raise SyntaxError, "I could not find the query in line\n%s" % line
997
998
999
1000 start, seq, end = m.groups()
1001 self._hsp.query = self._hsp.query + seq
1002 if self._hsp.query_start is None:
1003 self._hsp.query_start = _safe_int(start)
1004
1005
1006
1007 self._hsp.query_end = _safe_int(end)
1008 self._query_start_index = m.start(2)
1009 self._query_len = len(seq)
1010
1012 seq = line[self._query_start_index:].rstrip()
1013 if len(seq) < self._query_len:
1014
1015 seq = seq + ' ' * (self._query_len-len(seq))
1016 elif len(seq) < self._query_len:
1017 raise SyntaxError, "Match is longer than the query in line\n%s" % \
1018 line
1019 self._hsp.match = self._hsp.match + seq
1020
1022
1023
1024
1025 start, seq, end = _re_search(
1026 r"Sbjct:\s*(-?\d+)\s*(.+) (-?\d+)", line,
1027 "I could not find the sbjct in line\n%s" % line)
1028
1029
1030
1031
1032 if not seq.strip():
1033 seq = ' ' * self._query_len
1034 self._hsp.sbjct = self._hsp.sbjct + seq
1035 if self._hsp.sbjct_start is None:
1036 self._hsp.sbjct_start = _safe_int(start)
1037
1038 self._hsp.sbjct_end = _safe_int(end)
1039 if len(seq) != self._query_len:
1040 raise SyntaxError, \
1041 "QUERY and SBJCT sequence lengths don't match in line\n%s" \
1042 % line
1043
1044 del self._query_start_index
1045 del self._query_len
1046
1048 pass
1049
1051
1053 self._dr = Record.DatabaseReport()
1054
1056 m = re.search(r"Database: (.+)$", line)
1057 if m:
1058 self._dr.database_name.append(m.group(1))
1059 elif self._dr.database_name:
1060
1061 self._dr.database_name[-1] = "%s%s" % (self._dr.database_name[-1],
1062 line.strip())
1063
1064 - def posted_date(self, line):
1065 self._dr.posted_date.append(_re_search(
1066 r"Posted date:\s*(.+)$", line,
1067 "I could not find the posted date in line\n%s" % line))
1068
1073
1078
1082
1085
1089
1091 pass
1092
1095 self._params = Record.Parameters()
1096
1098 self._params.matrix = line[8:].rstrip()
1099
1104
1106 if line.find('1st pass') != -1:
1107 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
1108 self._params.num_hits = _safe_int(x)
1109 else:
1110 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
1111 self._params.num_hits = _safe_int(x)
1112
1114 if line.find('1st pass') != -1:
1115 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
1116 self._params.num_sequences = _safe_int(x)
1117 else:
1118 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
1119 self._params.num_sequences = _safe_int(x)
1120
1122 if line.find('1st pass') != -1:
1123 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
1124 self._params.num_extends = _safe_int(x)
1125 else:
1126 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"})
1127 self._params.num_extends = _safe_int(x)
1128
1130 if line.find('1st pass') != -1:
1131 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
1132 self._params.num_good_extends = _safe_int(x)
1133 else:
1134 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"})
1135 self._params.num_good_extends = _safe_int(x)
1136
1142
1144 self._params.hsps_no_gap, = _get_cols(
1145 line, (-1,), ncols=9, expected={3:"better", 7:"gapping:"})
1146 self._params.hsps_no_gap = _safe_int(self._params.hsps_no_gap)
1147
1153
1159
1164
1169
1174
1180
1186
1192
1198
1204
1206 self._params.frameshift = _get_cols(
1207 line, (4, 5), ncols=6, expected={0:"frameshift", 2:"decay"})
1208
1213
1218
1224
1230
1236
1242
1248
1250 pass
1251
1252
1253 -class _BlastConsumer(AbstractConsumer,
1254 _HeaderConsumer,
1255 _DescriptionConsumer,
1256 _AlignmentConsumer,
1257 _HSPConsumer,
1258 _DatabaseReportConsumer,
1259 _ParametersConsumer
1260 ):
1261
1262
1263
1264
1265
1266
1267
1268
1269
1271 self.data = None
1272
1274
1275 raise ValueError, \
1276 "This consumer doesn't handle PSI-BLAST data"
1277
1281
1285
1287 self.data.descriptions = self._descriptions
1288
1290 _AlignmentConsumer.end_alignment(self)
1291 if self._alignment.hsps:
1292 self.data.alignments.append(self._alignment)
1293 if self._multiple_alignment.alignment:
1294 self.data.multiple_alignment = self._multiple_alignment
1295
1297 _HSPConsumer.end_hsp(self)
1298 try:
1299 self._alignment.hsps.append(self._hsp)
1300 except AttributeError:
1301 raise SyntaxError, "Found an HSP before an alignment"
1302
1306
1310
1311 -class _PSIBlastConsumer(AbstractConsumer,
1312 _HeaderConsumer,
1313 _DescriptionConsumer,
1314 _AlignmentConsumer,
1315 _HSPConsumer,
1316 _DatabaseReportConsumer,
1317 _ParametersConsumer
1318 ):
1320 self.data = None
1321
1325
1329
1334
1336 _DescriptionConsumer.end_descriptions(self)
1337 self._round.number = self._roundnum
1338 if self._descriptions:
1339 self._round.new_seqs.extend(self._descriptions)
1340 self._round.reused_seqs.extend(self._model_sequences)
1341 self._round.new_seqs.extend(self._nonmodel_sequences)
1342 if self._converged:
1343 self.data.converged = 1
1344
1346 _AlignmentConsumer.end_alignment(self)
1347 if self._alignment.hsps:
1348 self._round.alignments.append(self._alignment)
1349 if self._multiple_alignment:
1350 self._round.multiple_alignment = self._multiple_alignment
1351
1353 _HSPConsumer.end_hsp(self)
1354 try:
1355 self._alignment.hsps.append(self._hsp)
1356 except AttributeError:
1357 raise SyntaxError, "Found an HSP before an alignment"
1358
1362
1366
1368 """Iterates over a file of multiple BLAST results.
1369
1370 Methods:
1371 next Return the next record from the stream, or None.
1372
1373 """
1374 - def __init__(self, handle, parser=None):
1375 """__init__(self, handle, parser=None)
1376
1377 Create a new iterator. handle is a file-like object. parser
1378 is an optional Parser object to change the results into another form.
1379 If set to None, then the raw contents of the file will be returned.
1380
1381 """
1382 try:
1383 handle.readline
1384 except AttributeError:
1385 raise ValueError(
1386 "I expected a file handle or file-like object, got %s"
1387 % type(handle))
1388 self._uhandle = File.UndoHandle(handle)
1389 self._parser = parser
1390
1392 """next(self) -> object
1393
1394 Return the next Blast record from the file. If no more records,
1395 return None.
1396
1397 """
1398 lines = []
1399 while 1:
1400 line = self._uhandle.readline()
1401 if not line:
1402 break
1403
1404 if lines and (line.startswith('BLAST')
1405 or line.startswith('BLAST', 1)
1406 or line.startswith('<?xml ')):
1407 self._uhandle.saveline(line)
1408 break
1409 lines.append(line)
1410
1411 if not lines:
1412 return None
1413
1414 data = ''.join(lines)
1415 if self._parser is not None:
1416 return self._parser.parse(File.StringHandle(data))
1417 return data
1418
1420 return iter(self.next, None)
1421
1422 -def blastall(blastcmd, program, database, infile, **keywds):
1423 """blastall(blastcmd, program, database, infile, **keywds) ->
1424 read, error Undohandles
1425
1426 Execute and retrieve data from blastall. blastcmd is the command
1427 used to launch the 'blastall' executable. program is the blast program
1428 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database
1429 to search against. infile is the path to the file containing
1430 the sequence to search with.
1431
1432 You may pass more parameters to **keywds to change the behavior of
1433 the search. Otherwise, optional values will be chosen by blastall.
1434
1435 Scoring
1436 matrix Matrix to use.
1437 gap_open Gap open penalty.
1438 gap_extend Gap extension penalty.
1439 nuc_match Nucleotide match reward. (BLASTN)
1440 nuc_mismatch Nucleotide mismatch penalty. (BLASTN)
1441 query_genetic_code Genetic code for Query.
1442 db_genetic_code Genetic code for database. (TBLAST[NX])
1443
1444 Algorithm
1445 gapped Whether to do a gapped alignment. T/F (not for TBLASTX)
1446 expectation Expectation value cutoff.
1447 wordsize Word size.
1448 strands Query strands to search against database.([T]BLAST[NX])
1449 keep_hits Number of best hits from a region to keep.
1450 xdrop Dropoff value (bits) for gapped alignments.
1451 hit_extend Threshold for extending hits.
1452 region_length Length of region used to judge hits.
1453 db_length Effective database length.
1454 search_length Effective length of search space.
1455
1456 Processing
1457 filter Filter query sequence? T/F
1458 believe_query Believe the query defline. T/F
1459 restrict_gi Restrict search to these GI's.
1460 nprocessors Number of processors to use.
1461
1462 Formatting
1463 html Produce HTML output? T/F
1464 descriptions Number of one-line descriptions.
1465 alignments Number of alignments.
1466 align_view Alignment view. Integer 0-6.
1467 show_gi Show GI's in deflines? T/F
1468 seqalign_file seqalign file to output.
1469
1470 """
1471 att2param = {
1472 'matrix' : '-M',
1473 'gap_open' : '-G',
1474 'gap_extend' : '-E',
1475 'nuc_match' : '-r',
1476 'nuc_mismatch' : '-q',
1477 'query_genetic_code' : '-Q',
1478 'db_genetic_code' : '-D',
1479
1480 'gapped' : '-g',
1481 'expectation' : '-e',
1482 'wordsize' : '-W',
1483 'strands' : '-S',
1484 'keep_hits' : '-K',
1485 'xdrop' : '-X',
1486 'hit_extend' : '-f',
1487 'region_length' : '-L',
1488 'db_length' : '-z',
1489 'search_length' : '-Y',
1490
1491 'program' : '-p',
1492 'database' : '-d',
1493 'infile' : '-i',
1494 'filter' : '-F',
1495 'believe_query' : '-J',
1496 'restrict_gi' : '-l',
1497 'nprocessors' : '-a',
1498
1499 'html' : '-T',
1500 'descriptions' : '-v',
1501 'alignments' : '-b',
1502 'align_view' : '-m',
1503 'show_gi' : '-I',
1504 'seqalign_file' : '-O'
1505 }
1506
1507 if not os.path.exists(blastcmd):
1508 raise ValueError, "blastall does not exist at %s" % blastcmd
1509
1510 params = []
1511
1512 params.extend([att2param['program'], program])
1513 params.extend([att2param['database'], database])
1514 params.extend([att2param['infile'], infile])
1515
1516 for attr in keywds.keys():
1517 params.extend([att2param[attr], str(keywds[attr])])
1518
1519 w, r, e = os.popen3(" ".join([blastcmd] + params))
1520 w.close()
1521 return File.UndoHandle(r), File.UndoHandle(e)
1522
1523
1524 -def blastpgp(blastcmd, database, infile, **keywds):
1525 """blastpgp(blastcmd, database, infile, **keywds) ->
1526 read, error Undohandles
1527
1528 Execute and retrieve data from blastpgp. blastcmd is the command
1529 used to launch the 'blastpgp' executable. database is the path to the
1530 database to search against. infile is the path to the file containing
1531 the sequence to search with.
1532
1533 You may pass more parameters to **keywds to change the behavior of
1534 the search. Otherwise, optional values will be chosen by blastpgp.
1535
1536 Scoring
1537 matrix Matrix to use.
1538 gap_open Gap open penalty.
1539 gap_extend Gap extension penalty.
1540 window_size Multiple hits window size.
1541 npasses Number of passes.
1542 passes Hits/passes. Integer 0-2.
1543
1544 Algorithm
1545 gapped Whether to do a gapped alignment. T/F
1546 expectation Expectation value cutoff.
1547 wordsize Word size.
1548 keep_hits Number of beset hits from a region to keep.
1549 xdrop Dropoff value (bits) for gapped alignments.
1550 hit_extend Threshold for extending hits.
1551 region_length Length of region used to judge hits.
1552 db_length Effective database length.
1553 search_length Effective length of search space.
1554 nbits_gapping Number of bits to trigger gapping.
1555 pseudocounts Pseudocounts constants for multiple passes.
1556 xdrop_final X dropoff for final gapped alignment.
1557 xdrop_extension Dropoff for blast extensions.
1558 model_threshold E-value threshold to include in multipass model.
1559 required_start Start of required region in query.
1560 required_end End of required region in query.
1561
1562 Processing
1563 XXX should document default values
1564 program The blast program to use. (PHI-BLAST)
1565 filter Filter query sequence with SEG? T/F
1566 believe_query Believe the query defline? T/F
1567 nprocessors Number of processors to use.
1568
1569 Formatting
1570 html Produce HTML output? T/F
1571 descriptions Number of one-line descriptions.
1572 alignments Number of alignments.
1573 align_view Alignment view. Integer 0-6.
1574 show_gi Show GI's in deflines? T/F
1575 seqalign_file seqalign file to output.
1576 align_outfile Output file for alignment.
1577 checkpoint_outfile Output file for PSI-BLAST checkpointing.
1578 restart_infile Input file for PSI-BLAST restart.
1579 hit_infile Hit file for PHI-BLAST.
1580 matrix_outfile Output file for PSI-BLAST matrix in ASCII.
1581 align_infile Input alignment file for PSI-BLAST restart.
1582
1583 """
1584 att2param = {
1585 'matrix' : '-M',
1586 'gap_open' : '-G',
1587 'gap_extend' : '-E',
1588 'window_size' : '-A',
1589 'npasses' : '-j',
1590 'passes' : '-P',
1591
1592 'gapped' : '-g',
1593 'expectation' : '-e',
1594 'wordsize' : '-W',
1595 'keep_hits' : '-K',
1596 'xdrop' : '-X',
1597 'hit_extend' : '-f',
1598 'region_length' : '-L',
1599 'db_length' : '-Z',
1600 'search_length' : '-Y',
1601 'nbits_gapping' : '-N',
1602 'pseudocounts' : '-c',
1603 'xdrop_final' : '-Z',
1604 'xdrop_extension' : '-y',
1605 'model_threshold' : '-h',
1606 'required_start' : '-S',
1607 'required_end' : '-H',
1608
1609 'program' : '-p',
1610 'database' : '-d',
1611 'infile' : '-i',
1612 'filter' : '-F',
1613 'believe_query' : '-J',
1614 'nprocessors' : '-a',
1615
1616 'html' : '-T',
1617 'descriptions' : '-v',
1618 'alignments' : '-b',
1619 'align_view' : '-m',
1620 'show_gi' : '-I',
1621 'seqalign_file' : '-O',
1622 'align_outfile' : '-o',
1623 'checkpoint_outfile' : '-C',
1624 'restart_infile' : '-R',
1625 'hit_infile' : '-k',
1626 'matrix_outfile' : '-Q',
1627 'align_infile' : '-B'
1628 }
1629
1630 if not os.path.exists(blastcmd):
1631 raise ValueError, "blastpgp does not exist at %s" % blastcmd
1632
1633 params = []
1634
1635 params.extend([att2param['database'], database])
1636 params.extend([att2param['infile'], infile])
1637
1638 for attr in keywds.keys():
1639 params.extend([att2param[attr], str(keywds[attr])])
1640
1641 w, r, e = os.popen3(" ".join([blastcmd] + params))
1642 w.close()
1643 return File.UndoHandle(r), File.UndoHandle(e)
1644
1645
1646 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1647 """rpsblast(blastcmd, database, infile, **keywds) ->
1648 read, error Undohandles
1649
1650 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the
1651 command used to launch the 'rpsblast' executable. database is the path
1652 to the database to search against. infile is the path to the file
1653 containing the sequence to search with.
1654
1655 You may pass more parameters to **keywds to change the behavior of
1656 the search. Otherwise, optional values will be chosen by rpsblast.
1657
1658 Please note that this function will give XML output by default, by
1659 setting align_view to seven (i.e. command line option -m 7).
1660 You should use the NCBIXML.BlastParser() to read the resulting output.
1661 This is because NCBIStandalone.BlastParser() does not understand the
1662 plain text output format from rpsblast.
1663
1664 WARNING - The following text and associated parameter handling has not
1665 received extensive testing. Please report any errors we might have made...
1666
1667 Algorithm/Scoring
1668 gapped Whether to do a gapped alignment. T/F
1669 multihit 0 for multiple hit (default), 1 for single hit
1670 expectation Expectation value cutoff.
1671 range_restriction Range restriction on query sequence (Format: start,stop) blastp only
1672 0 in 'start' refers to the beginning of the sequence
1673 0 in 'stop' refers to the end of the sequence
1674 Default = 0,0
1675 xdrop Dropoff value (bits) for gapped alignments.
1676 xdrop_final X dropoff for final gapped alignment (in bits).
1677 xdrop_extension Dropoff for blast extensions (in bits).
1678 search_length Effective length of search space.
1679 nbits_gapping Number of bits to trigger gapping.
1680 protein Query sequence is protein. T/F
1681 db_length Effective database length.
1682
1683 Processing
1684 filter Filter query sequence with SEG? T/F
1685 case_filter Use lower case filtering of FASTA sequence T/F, default F
1686 believe_query Believe the query defline. T/F
1687 nprocessors Number of processors to use.
1688 logfile Name of log file to use, default rpsblast.log
1689
1690 Formatting
1691 html Produce HTML output? T/F
1692 descriptions Number of one-line descriptions.
1693 alignments Number of alignments.
1694 align_view Alignment view. Integer 0-9.
1695 show_gi Show GI's in deflines? T/F
1696 seqalign_file seqalign file to output.
1697 align_outfile Output file for alignment.
1698
1699 """
1700 att2param = {
1701 'multihit' : '-P',
1702 'gapped' : '-g',
1703 'expectation' : '-e',
1704 'range_restriction' : '-L',
1705 'xdrop' : '-X',
1706 'xdrop_final' : '-Z',
1707 'xdrop_extension' : '-y',
1708 'search_length' : '-Y',
1709 'nbits_gapping' : '-N',
1710 'protein' : '-p',
1711 'db_length' : '-z',
1712
1713 'database' : '-d',
1714 'infile' : '-i',
1715 'filter' : '-F',
1716 'case_filter' : '-U',
1717 'believe_query' : '-J',
1718 'nprocessors' : '-a',
1719 'logfile' : '-l',
1720
1721 'html' : '-T',
1722 'descriptions' : '-v',
1723 'alignments' : '-b',
1724 'align_view' : '-m',
1725 'show_gi' : '-I',
1726 'seqalign_file' : '-O',
1727 'align_outfile' : '-o'
1728 }
1729
1730 if not os.path.exists(blastcmd):
1731 raise ValueError, "rpsblast does not exist at %s" % blastcmd
1732
1733 params = []
1734
1735 params.extend([att2param['database'], database])
1736 params.extend([att2param['infile'], infile])
1737 params.extend([att2param['align_view'], align_view])
1738
1739 for attr in keywds.keys():
1740 params.extend([att2param[attr], str(keywds[attr])])
1741
1742 w, r, e = os.popen3(" ".join([blastcmd] + params))
1743 w.close()
1744 return File.UndoHandle(r), File.UndoHandle(e)
1745
1747 m = re.search(regex, line)
1748 if not m:
1749 raise SyntaxError, error_msg
1750 return m.groups()
1751
1752 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1753 cols = line.split()
1754
1755
1756 if ncols is not None and len(cols) != ncols:
1757 raise SyntaxError, "I expected %d columns (got %d) in line\n%s" % \
1758 (ncols, len(cols), line)
1759
1760
1761 for k in expected.keys():
1762 if cols[k] != expected[k]:
1763 raise SyntaxError, "I expected '%s' in column %d in line\n%s" % (
1764 expected[k], k, line)
1765
1766
1767 results = []
1768 for c in cols_to_get:
1769 results.append(cols[c])
1770 return tuple(results)
1771
1773 try:
1774 return int(str)
1775 except ValueError:
1776
1777
1778 str = str.replace(',', '')
1779 try:
1780
1781 return int(str)
1782 except ValueError:
1783 pass
1784
1785
1786 return long(float(str))
1787
1789
1790
1791
1792
1793
1794 if str and str[0] in ['E', 'e']:
1795 str = '1' + str
1796 try:
1797 return float(str)
1798 except ValueError:
1799
1800 str = str.replace(',', '')
1801
1802 return float(str)
1803
1808 if line.find("Query must be at least wordsize") != -1:
1809 raise ShortQueryBlastError, "Query must be at least wordsize"
1810
1811 method = getattr(_BlastConsumer, 'noevent',
1812 _BlastConsumer.__getattr__(self, 'noevent'))
1813 method(line)
1814
1816 """Attempt to catch and diagnose BLAST errors while parsing.
1817
1818 This utilizes the BlastParser module but adds an additional layer
1819 of complexity on top of it by attempting to diagnose SyntaxError's
1820 that may actually indicate problems during BLAST parsing.
1821
1822 Current BLAST problems this detects are:
1823 o LowQualityBlastError - When BLASTing really low quality sequences
1824 (ie. some GenBank entries which are just short streches of a single
1825 nucleotide), BLAST will report an error with the sequence and be
1826 unable to search with this. This will lead to a badly formatted
1827 BLAST report that the parsers choke on. The parser will convert the
1828 SyntaxError to a LowQualityBlastError and attempt to provide useful
1829 information.
1830
1831 """
1832 - def __init__(self, bad_report_handle = None):
1833 """Initialize a parser that tries to catch BlastErrors.
1834
1835 Arguments:
1836 o bad_report_handle - An optional argument specifying a handle
1837 where bad reports should be sent. This would allow you to save
1838 all of the bad reports to a file, for instance. If no handle
1839 is specified, the bad reports will not be saved.
1840 """
1841 self._bad_report_handle = bad_report_handle
1842
1843
1844 self._scanner = _Scanner()
1845 self._consumer = _BlastErrorConsumer()
1846
1847 - def parse(self, handle):
1848 """Parse a handle, attempting to diagnose errors.
1849 """
1850 results = handle.read()
1851
1852 try:
1853 self._scanner.feed(File.StringHandle(results), self._consumer)
1854 except SyntaxError, msg:
1855
1856 if self._bad_report_handle:
1857
1858 self._bad_report_handle.write(results)
1859
1860
1861 self._diagnose_error(
1862 File.StringHandle(results), self._consumer.data)
1863
1864
1865
1866 raise
1867 return self._consumer.data
1868
1870 """Attempt to diagnose an error in the passed handle.
1871
1872 Arguments:
1873 o handle - The handle potentially containing the error
1874 o data_record - The data record partially created by the consumer.
1875 """
1876 line = handle.readline()
1877
1878 while line:
1879
1880
1881
1882 if line.startswith('Searchingdone'):
1883 raise LowQualityBlastError("Blast failure occured on query: ",
1884 data_record.query)
1885 line = handle.readline()
1886