1
2 """ This script is used to evalulate the mapping results """
3
4 import HTSeq
5 import cPickle as pickle
6 import numpy as np
7 import time
8 import sys
9 import re
10 import subprocess
11 import random
12 import os
13
14
15
16 alngts = 0
17 AlignedReadsdic = {}
18 algnedToRef = 0
19 algnedToArt = 0
20
21 """ Helper debug functions.... """
23 pickle.dump(dictionaary, open(outputname + ".p", "wb"))
24 print("Saved .pickle!")
25
27 dictionary = pickle.load(open(inputname + ".p"))
28 print("Loaded " + inputname + ".pickle!")
29 return (dictionary)
30
31
33 """
34 Class for exhaustive reading of the sam alignments.
35
36 """
37
38 - def __init__(self, readname, mr, nm, subs, score, mq, start, end, gaps, mism):
39 self.id = readname
40 self.mr = [mr]
41 self.subs =[subs]
42 self.nm = [nm]
43 self.rq = score
44 self.mq = [mq]
45 self.start = [start]
46 self.end = [end]
47
48 self.gaps = [gaps]
49 self.mism = [mism]
50
52 self.mr.append(read.mr[0])
53 self.subs.append(read.subs[0])
54 self.nm.append(read.nm[0])
55 self.mq.append(read.mq[0])
56 self.start.append(read.start[0])
57 self.end.append(read.end[0])
58 self.gaps.append(read.gaps[0])
59 self.mism.append(read.mism[0])
60
61 - def toStr(self, readname):
62 """Converts the object to a string """
63 str = ""
64
65 for i, read in enumerate(self.mr):
66
67 str += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, self.mr[i], self.subs[i], self.nm[i], self.rq, self.mq[i], self.start[i], self.end[i], self.gaps[i], self.mism[i])
68 return(str)
69
71 """Converts the object to a string """
72
73 if identifier == "art":
74 str = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (self.id, 0, self.subs[0], self.nm[0], self.rq, self.mq[0], self.start[0], self.end[0], self.gaps[0], self.mism[0])
75 else:
76 str = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (self.id, 1, 0, self.nm[0], self.rq, self.mq[0], self.start[0], self.end[0], self.gaps[0], self.mism[0])
77 return(str)
78
79
80
82 """ Class for exhaustive reading of the sam alignments. Only for TP """
83 - def __init__(self, nm, score, mq, start, end, gaps, mism):
84 self.nm = [nm]
85 self.rq = score
86 self.mq = [mq]
87 self.start = [start]
88 self.end = [end]
89 self.gaps = [gaps]
90 self.mism = [mism]
91
93 self.nm.append( read.nm[0])
94 self.mq.append(read.mq[0])
95 self.start.append( read.start[0])
96 self.end.append( read.end[0])
97 self.gaps.append( read.gaps[0])
98 self.mism.append(read.mism[0])
99
100 - def toStr(self, readname):
101 str = ""
102 for i, read in enumerate(self.mr):
103 str += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, self.nm[i], self.rq, self.mq[i], self.start[i], self.end[i], self.gaps[i], self.mism[i])
104 return(str)
105
106 """ returns 1 if an equal alignment is contained in the array """
108 for i, position in enumerate(self.start):
109
110 if position == read.start and self.end[i] == read.end:
111 return(1)
112 return(0)
113
114
116 """ Class for efficient addressing for np. arrays! """
117 - def __init__(self, internalID, quality):
118 self.internalID = internalID
119 self.quality = quality
120
121
122
123
124
126 """
127 Funtion which transforms an HTSeq alignment to a TPRead class.
128
129 @type alngt: alignment
130 @param alngt: alignment from the sam file.
131 @type compareList: list
132 @param compareList: list which indicates differences between reference / artificial
133 @type readdic: dictionary
134 @param readdic: Contains ranks and qualities for every entry in the fastq file
135
136 @rtype: TPRead
137 @return: Transformed ReadObject.
138 """
139 cigar = alngt.cigar
140 gaps = sum([i.size for i in cigar if i.type=="I" or i.type=="D"])
141 mism = sum([i.size for i in cigar if i.type=="M"]) - sum([int(s) for s in alngt.optional_field("MD") if s.isdigit()])
142 nm = alngt.optional_field("NM")
143
144
145 return (TPRead(nm, readdic[alngt.read.name], alngt.aQual, alngt.iv.start, alngt.iv.end, gaps, mism))
146
148 """
149 Funtion which transforms an HTSeq alignment to a CustomRead class.
150
151 @type alngt: alignment
152 @param alngt: alignment from the sam file.
153 @type compareList: list
154 @param compareList: list which indicates differences between reference / artificial
155 @type readdic: dictionary
156 @param readdic: Contains ranks and qualities for every entry in the fastq file
157
158 @rtype: CustomRead
159 @return: Transformed ReadObject.
160
161 """
162 cigar = alngt.cigar
163
164 gaps = sum([i.size for i in cigar if i.type=="I" or i.type=="D"])
165
166
167
168 mism = sum([i.size for i in cigar if i.type=="M"]) - getsum(re.findall("(\d+)", alngt.optional_field("MD")))
169
170
171 nm = alngt.optional_field("NM")
172
173
174 if len(alngt.read.qualstr) <= 1:
175
176 return (CustomRead(alngt.read.name,0,nm,getHammingdistance(compareList, alngt.iv.start, alngt.iv.end),readdic[alngt.read.name],alngt.aQual,alngt.iv.start,alngt.iv.end,gaps,mism))
177 else:
178 return (CustomRead(alngt.read.name,0,nm,getHammingdistance(compareList, alngt.iv.start, alngt.iv.end),(alngt.read.qual.mean() / (alngt.read.qual.max() - alngt.read.qual.min() + 1)),alngt.aQual,alngt.iv.start,alngt.iv.end,gaps,mism))
179
181 """
182 Function to get a dictionary containing the rank of a read (given a sorted samfile by readname, samtools).
183
184 @type referenceSAM: string
185 @param referenceSAM: Inputfile name for reference SAM file.
186 @rtype: dictionary
187 @return: Internalnaming according to the sorting of samtools. Key = ReadID, Value = rank
188 """
189 internalDic = {}
190
191
192 readnames = str(subprocess.check_output("""awk < %s -F "\t" '{print $1 }'""" % referenceSAM, shell=True))
193
194
195 readnames = readnames.split("\n")
196
197 i = 0
198
199 while readnames[i][0] == ("@"):
200 i += 1
201
202 index = 0
203 for k in xrange(i, len(readnames)):
204 if readnames[k] in internalDic:
205 pass
206 else:
207 internalDic[readnames[k]] = index
208
209 index += 1
210
211
212
213 return(internalDic)
214
215
217 """
218 Function which returns the next line from a SAMFile.
219
220 @type textfile: fileobject stream
221 @param textfile: SAMfile
222 @type compareList: list
223 @param compareList: AA sequence of the reference genome.
224 @type readdic: Dictionary
225 @param readdic: Boolean which decides if unbalanced mutations are allowed (only initial mutation is performed)
226 @rtype: Readobj
227 @return: Parsed read from text line.
228 """
229
230
231
232 line = textfile.readline()
233 if not line:
234 return(0)
235 CRead, dummy = readSAMline(line, "noMem", compareList, readdic)
236 return(CRead)
237
238
239
241 """
242 Reads the alignment tag given, the text and a tag to search for
243
244 @type mdtag: string
245 @param mdtag: MDTag from alignment
246 @type cigar: string
247 @param cigar: Cigar from alignment
248 @rtype: gaps,mismatches
249 @return: Parsed gaps and mismatches
250 """
251 deletions = getsum(re.findall("(\d+)D", cigar))
252 insertions = getsum(re.findall("(\d+)I", cigar))
253 gaps = int(insertions + deletions)
254
255
256
257
258
259 mismatch = getsum(re.findall("(\d+)M", cigar)) - getsum(re.findall("(\d+)", mdtag))
260
261
262
263 return(gaps, mismatch)
264
265 -def getAllID(textfile, read, compareList, readdic):
266 """
267 Reads all alignments for the current read which are in the SAM file (sorted). If a new read ID is scanned the results are returned.
268
269 @type textfile: fileobject stream
270 @param textfile: SAM file for reading.
271 @type read: read obj
272 @param read: the last read obj which defines the current read id
273 @type readdic: dictionary
274 @param readdic: Look up for quality values.
275 @rtype: (bool,list,bool)
276 @return: a "triple", where 2 bools are defined as indicator variables and a list with all alignments for one read.
277 """
278
279 if read == 0:
280
281 CRead = getNextLine(textfile, compareList, readdic)
282 return(getAllID(textfile, CRead, compareList, readdic))
283 else:
284
285 templist = []
286 templist.append(read)
287
288 while 1:
289 line = textfile.readline()
290
291 if not line:
292 return(0, templist, 0)
293
294 try:
295 CRead = getCustomRead(HTSeq.SAM_Alignment.from_SAM_line(line), compareList, readdic)
296
297 except:
298 CRead, dummy = readSAMline(line, "noMem", compareList, readdic)
299
300 if CRead == 0:
301
302 return(1, templist, CRead)
303
304 if CRead.id == read.id:
305 templist.append(CRead)
306 else:
307 return(1, templist, CRead)
308
309
311 """
312 Compares alignments for the reference and artificial reference for a specific read id.
313 The goal is to identify false positives.
314
315 @type reflist: read obj list
316 @param reflist: list containing alignments witht the same ID (reference)
317 @type artlist: read obj list
318 @param artlist: list containing alignments witht the same ID (artificial)
319 @rtype: np.array
320 @return: indices of unique alignments
321 """
322
323
324 artlen = len(artlist)
325 reflen = len(reflist)
326
327 TPcount = reflen
328 FPcount = 0
329 for RefRead in reflist:
330 file.write(RefRead.toStrNoMem("ref"))
331
332
333 for i in xrange(0, artlen):
334
335 ArtInRef = 0
336 for j in xrange(0,reflen):
337 if artlist[i].start == reflist[j].start and artlist[i].end == reflist[j].end:
338 ArtInRef = 1
339 break
340
341 if (ArtInRef == 0):
342 FPcount+=1
343 file.write(artlist[i].toStrNoMem("art"))
344
345
346 return(TPcount,FPcount)
347
348 """def WriteToFile(file, list, identifier):
349
350 Writes read obj. to a file.
351
352
353 @type file: read obj list
354 @param file: list containing alignments witht the same ID (reference)
355 @type list: read obj. list
356 @param list: list containing alignments witht the same ID (artificial)
357 @type identifier: string
358 @param identifier: indicator if the results are from the reference or artificial
359
360 file.write(getBestAlignment(list).toStrNoMem(identifier))"""
361
363 """
364 Skips the header from a SAM file, but reads the first line of the alignment section.
365
366 @type file: read obj list
367 @param file: list containing alignments witht the same ID (reference)
368 @type compareList: list
369 @param compareList: list for accumulation of the same read id
370 @type readdic: dictionary
371 @param readdic: dictionary containing read ID - read quality mappings.
372 @rtype: read obj.
373 @return: Returns a read obj. from the SAM file.
374 """
375 while 1:
376 temp = file.readline()
377 if temp.startswith("@"):
378 pass
379 else:
380 return(readSAMline(temp, "noMem", compareList, readdic))
381 break
382
383
385 """
386 Function which returns the ranks of 2 given readIDs (read from reference,read from artificial).
387
388 @type RefRead: read obj
389 @param RefRead: read obj (reference)
390 @type ArtRead: read obj
391 @param ArtRead: read obj (artificial)
392 @type rankdic: dictionary
393 @param rankdic: dictionary containing ranks of the read IDs (according to the sorted SAM files).
394 @rtype: int,int
395 @return: returns the false positives and true positives for a SAM file pair (reference, artificial)
396 """
397
398 RefReadRank = rankdic[RefRead[0].id]
399 if ArtRead[0].id in rankdic:
400 ArtReadRank = rankdic[ArtRead[0].id]
401 else:
402
403 ArtReadRank = RefReadRank - 1
404
405 return(RefReadRank,ArtReadRank)
406
407
408 -def ReadSAMnoMem(ref, art, output, compareList, readdic, rankdic):
409 """
410 Main function for comparing to mappings. This functions takes the complete alignments for artificial and reference genome and goes through them in parallel.
411 Since the mappings are sorted the function alternates the parsing of the samfiles in such a way that no memory is used for comparing these functions.
412
413 @type ref: string
414 @param ref: path to reference alignments (SAM file)
415 @type art: string
416 @param art: path to artificial alignments (SAM file)
417 @type output: read obj
418 @param output: read obj (artificial)
419 @type compareList: read obj
420 @param compareList: read obj (artificial)
421 @type readdic: dictionary
422 @param readdic: dictionary containing read ID - read quality mappings.
423 @type rankdic: dictionary
424 @param rankdic: dictionary containing ranks of the read IDs (according to the sorted SAM files).
425 @rtype: ranks
426 @return: Returns the ranks for the 2 read ids.
427 """
428
429 fobj = open(output, "w")
430 fobj.write("#ReadID\tMatchedReference\tSubstitutions\tNMtag\tReadQuality\tMappingQuality\tStart\tEnd\tGaps\tMisM\r\n")
431 fileref = open(ref, "r")
432 fileart = open(art, "r")
433
434
435 CurrentReadRef, dummy = SkipHeader(fileref, compareList, readdic)
436 CurrentReadArt, dummy = SkipHeader(fileart, compareList, readdic)
437
438
439 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic)
440 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic)
441
442
443 tp = 0
444 fp = 0
445 MappedReads = 0
446 i = 0
447 start = time.time()
448
449 ArtBool = 0
450 RefBool = 0
451 while 1:
452 i += 1
453 if i == 100000:
454 print "\t\t%f" % (time.time() - start),
455
456 RefReadRank,ArtReadRank = getRanks(RefRead,ArtRead,rankdic)
457
458
459
460
461
462 while (RefReadRank > ArtReadRank and hasNextLine != 0):
463
464 for l in xrange(0, len(ArtRead)):
465 ArtRead[l].mr = [0]
466
467
468 for hits in ArtRead:
469 fobj.write(hits.toStrNoMem("art"))
470 fp += 1
471
472
473
474 CurrentReadArt = NextReadArt
475
476 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic)
477
478
479 ArtReadRank = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id]
480 MappedReads+=1
481
482
483
484
485
486 while (RefReadRank < ArtReadRank and hasNextLine != 0):
487
488 for hits in RefRead:
489 fobj.write(hits.toStrNoMem("ref"))
490 tp += 1
491
492 CurrentReadRef = NextReadRef
493 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic)
494 RefReadRank = rankdic[RefRead[0].id]
495 MappedReads+=1
496
497
498
499
500 if (RefReadRank == ArtReadRank):
501 MappedReads+=1
502
503
504
505 tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj)
506 tp +=tempTP
507 fp +=tempFP
508
509 CurrentReadRef = NextReadRef
510 CurrentReadArt = NextReadArt
511
512
513
514 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic)
515 if hasNextLine == 0:
516
517 RefReadRank = rankdic[RefRead[0].id]
518 RefBool = 1
519 break
520
521 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic)
522 if hasNextLine == 0:
523
524 ArtReadRank = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id]
525 ArtBool = 1
526 break
527
528
529 hasNextLine = 1
530
531 if RefBool == 1:
532
533
534 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic)
535 ArtReadRank = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id]
536
537 while (RefReadRank > ArtReadRank and hasNextLine != 0):
538 MappedReads+=1
539
540 for l in xrange(0, len(ArtRead)):
541 ArtRead[l].mr = [0]
542
543
544
545 for hits in ArtRead:
546 fobj.write(hits.toStrNoMem("art"))
547 fp += 1
548
549 CurrentReadArt = NextReadArt
550
551 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic)
552
553
554 ArtReadRank = RefReadRank-1 if ArtRead[0].id not in rankdic else rankdic[ArtRead[0].id]
555
556
557
558 if (RefReadRank == ArtReadRank):
559 MappedReads+=1
560
561
562 tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj)
563 tp +=tempTP
564 fp +=tempFP
565
566
567 while 1:
568
569 if hasNextLine == 0:
570 break
571 else:
572
573 CurrentReadArt = NextReadArt
574
575 hasNextLine, ArtRead, NextReadArt = getAllID(fileart, CurrentReadArt, compareList, readdic)
576
577
578 for hits in ArtRead:
579 fobj.write(hits.toStrNoMem("art"))
580 fp += 1
581 MappedReads+=1
582
583 else:
584
585 while (RefReadRank < ArtReadRank and hasNextLine != 0):
586 MappedReads+=1
587
588 for hits in RefRead:
589 fobj.write(hits.toStrNoMem("ref"))
590 tp += 1
591 CurrentReadRef = NextReadRef
592 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic)
593 RefReadRank = rankdic[RefRead[0].id]
594
595
596 if (RefReadRank == ArtReadRank):
597 MappedReads+=1
598
599
600 tempTP,tempFP = CompareAlignments(RefRead, ArtRead,fobj)
601 tp +=tempTP
602 fp +=tempFP
603
604
605 else:
606 MappedReads+=1
607 for hits in ArtRead:
608 fobj.write(hits.toStrNoMem("art"))
609 fp += 1
610
611 while 1:
612 if hasNextLine == 0:
613 MappedReads+=1
614 for hits in RefRead:
615 fobj.write(hits.toStrNoMem("ref"))
616 tp += 1
617
618 break
619 else:
620 CurrentReadRef = NextReadRef
621 MappedReads+=1
622 for hits in RefRead:
623 fobj.write(hits.toStrNoMem("ref"))
624 tp += 1
625 hasNextLine, RefRead, NextReadRef = getAllID(fileref, CurrentReadRef, compareList, readdic)
626
627 end = time.time()
628 print ("\t\t%f\t\t" %(end - start)),
629 fobj.close()
630 return(tp, fp,tp+fp,MappedReads)
631
632
633
635 """
636 Reads the alignment tag, given the text and a tag to search for.
637
638 @type line: string
639 @param line: SAM line
640 @type tag: string
641 @param tag: SAM tag. i.e: NM,MD
642 @rtype: int
643 @return: number x behind desired tag tag:i:x
644 """
645
646
647
648 m = re.search(tag + ":i:(\d+)", line)
649 besthits = m.group(1)
650
651 return(besthits)
652
654 """
655 Sum of strings (as ints)
656
657 @type strlist: list(str)
658 @param strlist: SAM line
659
660 @rtype: int
661 @return: MD tag calculation.
662 """
663
664 return(sum([int(i) for i in strlist]))
665
667 """
668 Mean of strings.
669
670 @type strlist: list(str)
671 @param strlist: SAM line
672
673 @rtype: float
674 @return: mean
675 """
676 sum = getsum(strlist)
677 return(sum/float(len(strlist)))
678
679
681 """
682 Computes the ReadQualityScore given a quality string
683
684 @type quality: string
685 @param quality: quality string of a read.
686
687 @rtype: float
688 @return: ReadQualityScore (RQS)
689 """
690
691 qualarray = [ord(i)-33 for i in quality]
692
693 score =( sum(qualarray)/float(len(qualarray))) / (max(qualarray) - min(qualarray) + 1)
694
695 return (score)
696
697
699 """
700 Computes the alignment length given a cigar string. Needed for Start + End calculation.
701
702 @type cigar: string
703 @param cigar: Cigar string (SAM)
704
705 @rtype: int
706 @return: alignmentlength
707 """
708
709
710
711
712 match = re.findall("(\d+)[MDN]", cigar)
713 sumOfMatches = 0
714 for item in match:
715 sumOfMatches += int(item)
716 return (sumOfMatches)
717
718
720 """
721 Checks alignment line for unnecessary informations to skip
722
723 @type alignment: string
724 @param alignment: Line from SAM
725 @type identifier: string
726 @param identifier: read id
727 @type compareList: list
728 @param compareList: list of alignments with same read id.
729 @type readdic: dictionary
730 @param readdic: Dictionary containg a read id; read quality mapping
731 @rtype: readobj, readname
732 @return: Returns the read and it's identifier.
733 """
734
735 if alignment.startswith("@"):
736 return(0, "")
737 else:
738 read, readname = readSAMline(alignment, identifier, compareList, readdic)
739 if read == 0:
740 return(0, "")
741 else:
742 return (read, readname)
743
744
746 """
747 Function for comparison of artificial alignments and reference alignments. FP are defined such that start and end position must be unique to the artificial reference
748 returns 0 if no same read is found (FP found)
749 returns 1 if an equal alignment is found
750
751 @type readref: read obj.
752 @param readref: reference
753 @type readart: read obj.
754 @param readart: artificial
755 @rtype: bool
756 @return: Indicator if alignment is the same (start & end equal)
757 """
758 for i, item in enumerate(readref.start):
759 if (readref.start[i] == readart.start[0] and readref.end[i] == readart.end[0]):
760 return(1)
761 return(0)
762
763
764
766 """
767 Computes the number of subsitutions in the artificial reference using the CompareString.
768
769 @type CompareString: string
770 @param CompareString: string of 0 and 1s. 1 = hamming 1 between reference and artificial.
771 @type start: int
772 @param start: start of alignment
773 @type end: int
774 @param end: end ofalignment
775 @rtype: int
776 @return: hamming distance
777 """
778 return (sum(CompareString[start:end]))
779
780
782 """
783
784 Reads a .fastqfile and calculates a defined readscore
785 input: fastq file
786 output: fastq dictionary key = readid; value = qualstr
787
788
789 @type fastqfile: string
790 @param fastqfile: path to fastq file
791
792 @rtype: dictionary
793 @return: dictionary containing read ids and read qualities.
794 """
795 fastq_file = HTSeq.FastqReader(fastqfile , "phred")
796 readdictionary = {}
797
798 for read in fastq_file:
799 readdictionary[read.name] = ComputeRQScore(read.qualstr)
800 print("\tReading Fastq file done!")
801 return readdictionary
802
803
804
805
807 """
808 Extends a given dictionary with KEY = readid and VALUE = qualstr such that an internal naming is generated which can be used to efficiently create an numpy array
809
810
811 @type readdic: dictionary
812 @param readdic: dictionary containing read ids and read qualities.
813
814 @rtype: dictionary
815 @return: extended readdic with KEY = ID, VALUE = READID object with READID.internalid and READID.qulstr = qualstr
816 """
817 internalnaming = 0
818 reverseReadDic = np.zeros(len(readdic), dtype='S50')
819 for id in readdic.iterkeys():
820
821 readdic[id] = ReadID(internalnaming, readdic[id])
822 reverseReadDic[internalnaming] = id
823 internalnaming += 1
824 return(readdic, reverseReadDic)
825
826
827
829 """
830 Returns a sequence string from a fasta file.
831
832
833 @type fasta: string
834 @param fasta: path to fasta file.
835
836 @rtype: string
837 @return: sequence
838 """
839 fastafile = HTSeq.FastaReader(fasta)
840 for sequence in fastafile:
841 return(sequence.seq)
842
843
844
846 """
847 Creates a list which is used for comparisons between aligned reads (exact number of mismatches)
848
849
850
851 @type Reference: string
852 @param Reference: reference genome
853 @type ARG: string
854 @param ARG: artificial reference genome.
855 @rtype: list
856 @return: list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal.
857 """
858
859 reference = returnSequence(Reference)
860 artificialreference = returnSequence(ARG)
861 complist = []
862
863
864 if (len(reference) != len(artificialreference)):
865 print "first 10 letter:\t\t" + reference[0:10] + "..." + reference[-10:]
866 print "last 10 letter:\t\t" + artificialreference[0:10] + "..." + artificialreference[-10:]
867 print ("Error! Two Sequences have different length! Try to add a line break after the last nucleotide.")
868 sys.exit(1)
869 else:
870 return([ 0 if x == artificialreference[i] else 1 for i, x in enumerate(reference)])
871
872
873
874 -def readSAMline(alignment, identifier, compareList, readdic):
875 """
876 Function for reading SAM alignment text file (one line)
877
878
879
880 @type alignment: string
881 @param alignment: SAM alignment
882 @type identifier: string
883 @param identifier: read id
884 @type compareList: list
885 @param compareList: list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal.
886 @type readdic: dictionary
887 @param readdic: dictionary containing read ids and read qualities.
888 @rtype: read obj.
889 @return: returns a customRead object
890 """
891 k = 0
892
893
894 columns = alignment.split("\t")
895 flag = int(columns[1].strip())
896
897
898 if flag == 4:
899 return(0, "")
900
901 else:
902
903
904 readname = columns[0].strip()
905 reference = columns[2].strip()
906 start = int(columns[3].strip())-1
907 mappingquality = columns[4].strip()
908 cigar = columns[5].strip()
909 qualstr = columns[10].strip()
910 tags = " ".join(columns[11:])
911
912 mdtag = re.search("MD:Z:([^\s]+)", tags).group(1)
913 try:
914 mdtag = re.search("MD:Z:([^\s]+)", tags).group(1)
915 gaps, mism = getMisGap(mdtag, cigar)
916 except:
917 gaps = 0
918 mism = 0
919 print "Error Reading MDtag of %s.Setting gaps = 0,mism = 0" % readname
920 try:
921 nm = int(getNumberOf(tags, "NM"))
922 except:
923 nm = 0
924
925 leng = getAlignmentLength(cigar)
926
927 if len(qualstr) == 1 or (len(qualstr.strip()) == 0):
928 score = readdic[readname].quality
929 else:
930 score = ComputeRQScore(qualstr)
931
932 if identifier == "art":
933
934 tempobj = CustomRead(readname, 0, nm, getHammingdistance(compareList, start, start + leng), score, mappingquality, start, start + leng, gaps, mism)
935 elif identifier == "noMem":
936 tempobj = CustomRead(readname, 1, nm, getHammingdistance(compareList, start, start + leng), score, mappingquality, start, start + leng, gaps, mism)
937
938 else:
939
940 tempobj = TPRead(nm, score, mappingquality, start, start + leng, gaps, mism)
941
942
943
944 return(tempobj, readname)
945
946
947
948
949
951 """
952 Returns the index of a read. The index is prescribed by the ordering in the sam file.
953 @type readname: string
954 @param readname: read id
955 @type readdic: dictionary
956 @param readdic: dictionary containing read ids and read qualities.ArtRead
957 @rtype: int
958 @return: index
959 """
960 return(readdic[readname].internalID)
961
962
964 """
965 Function for reading the artificial reference genome using HTSeq.This function is mainly used. Only if no quality string is in the SAM line. The custom
966 SAM reading function is used.
967
968 @type art: string
969 @param art: artificial file.
970 @type RefArray: array
971 @param RefArray: Results from reading the reference SAM file.
972 @type compareList: list
973 @param compareList: list containt 1s, where there is a difference in the genomes and 0s where the nucleotides are equal.
974 @type readdic: dictionary
975 @param readdic: dictionary containing read ids and read qualities.
976 @rtype: array
977 @return: aligned read objects in an array.
978 """
979 start = time.time()
980
981 fobj = open(art, "r")
982 artdic = {}
983 k = 0
984
985 read = SkipHeader(fobj,compareList,readdic)
986
987
988 for alignment in fobj:
989 k += 1
990 if k % 1000000 == 0:
991 print ("%d.." %(k/1000000)),
992 read, readname = isSaneAlignment(alignment, "art", compareList, readdic)
993 if read == 0:
994 pass
995 else:
996
997 index = returnIndex(readdic, readname)
998 if RefArray[index] != 0:
999
1000 if (RefArray[index].isContained(read) == 0):
1001 if readname in artdic:
1002 artdic[readname].toObjself(read)
1003
1004 else:
1005 artdic[readname] = read
1006 else:
1007 pass
1008
1009 else:
1010
1011 artdic[readname] = read
1012
1013
1014 fobj.close()
1015 end = time.time()
1016
1017 print ("\t %f " % (end - start)),
1018
1019 return(artdic)
1020
1021
1022
1024 """
1025 Write function for hits ont he reference genome. Only the best alignment (least mismatches)
1026 Header :
1027 ReadID MatchedReference Substitutions NumberOfMismatches ReadQuality MappingQuality
1028
1029
1030 @type outfile: string
1031 @param outfile: Path to outfile.
1032 @type ReadDic: dictionary
1033 @param ReadDic: dictionary containing read ids and read qualities.
1034 """
1035
1036 fobj = open(outfile, "a")
1037 for read in ReadDic.keys():
1038 BestReadIndex = np.array(ReadDic[read].nm).argmin()
1039 fobj.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (read, ReadDic[read].mr[BestReadIndex], ReadDic[read].subs[BestReadIndex], ReadDic[read].nm[BestReadIndex], ReadDic[read].rq, ReadDic[read].mq[BestReadIndex], ReadDic[read].start[BestReadIndex], ReadDic[read].end[BestReadIndex], ReadDic[read].gaps[BestReadIndex], ReadDic[read].mism[BestReadIndex]))
1040 fobj.close()
1041
1042
1043
1044
1045 -def writeToTabRef(RefArray, outfile, reverseReadArray, ReadDic):
1046 """
1047 Write function for hits ont he reference genome. Only the best alignment (least mismatches)
1048
1049 Header :
1050 ReadID MatchedReference Substitutions NumberOfMismatches ReadQuality MappingQuality
1051 rows in the infile
1052 @type RefArray: string
1053 @param RefArray: path to inputfile.
1054 @type outfile: int
1055 @param outfile: rows in the infile
1056 @type reverseReadArray: dictionary
1057 @param reverseReadArray: Contains reads = values and ranks = keys
1058 @type ReadDic: dictionary
1059 @param ReadDic: dictionary containing read ids and read qualities.
1060 """
1061
1062 fobj = open(outfile, "a")
1063 for i in xrange(0, len(RefArray)):
1064 read = RefArray[i]
1065
1066 if read == 0:
1067 continue
1068 else:
1069 readname = reverseReadArray[i]
1070 index = returnIndex(ReadDic, readname)
1071 BestReadIndex = np.array(RefArray[index].nm,dtype="int").argmin()
1072 fobj.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\r\n" % (readname, 1, 0, RefArray[index].nm[BestReadIndex], RefArray[index].rq, RefArray[index].mq[BestReadIndex], RefArray[index].start[BestReadIndex], RefArray[index].end[BestReadIndex], RefArray[index].gaps[BestReadIndex], RefArray[index].mism[BestReadIndex]))
1073 fobj.close()
1074
1075
1076
1078 """
1079 Reads the results from writeToTab in for plotting and analysis
1080
1081 Header :
1082 ReadID MatchedReference Substitutions NumberOfMismatches ReadQuality MappingQuality
1083
1084 @type infile: string
1085 @param infile: path to inputfile.
1086 @type arraysize: int
1087 @param arraysize: rows in the infile
1088
1089 @rtype: np.array(obj)
1090 @return: array for unique classified reads.
1091 """
1092
1093
1094 fobj = open(infile, "r")
1095 dataArray = np.arange(arraysize, dtype=object)
1096 for i, line in enumerate(fobj):
1097 if line.startswith("#"):
1098 continue
1099 else:
1100 zeile = line.split("\t")
1101 id = zeile[0]
1102 mr = int(zeile[1])
1103 subs = int(zeile[2])
1104 nm = int(zeile[3])
1105 rq = float(zeile[4])
1106 mq = int(zeile[5])
1107 gaps = int(zeile[8])
1108 mism = int(zeile[9])
1109
1110
1111 dataArray[i - 1] = CustomRead(id, mr, nm, subs, rq, mq, 0, 0, gaps, mism)
1112
1113 return (dataArray)
1114
1115
1116
1165
1167 return(path.replace("~",os.path.expanduser("~")))
1168
1170 """
1171 Since we append in the program, we need to make sure no old files remain...
1172
1173 @type controlDic: dictionary
1174 @param controlDic: dictionary containing future filenames.
1175 @type mapper: string
1176 @param mapper: current identifier mapper for which the results are written
1177 @type outpath: string
1178 @param outpath: existing path, where the outfiles will be written.
1179 """
1180 for artificial in controlDic["mapper"][mapper][1]:
1181 filename = artificial.split("/")[-1]
1182 outfile = mapper + "_" + artificial.split("/")[-1][:-4] + ".esam"
1183
1184 fobj = open(outpath + outfile, "w")
1185 fobj.write("#ReadID\tMatchedReference\tSubstitutions\tNumberOfMismatches\tReadQuality\tMappingQuality\tStart\tEnd\tGaps\tMisM\r\n")
1186 fobj.close()
1187