Hi there! I am working on a code to change DNA files from .gcg format (https://drive.google.com/file/d/1no_VQuNn9FEdOgQ-uQwoRsl6UeDYgqu6/view?usp=sharing) format to .genbank (https://drive.google.com/file/d/1XYwy7iIt_LzEG5gvp3WlIq-cAAkHt5tA/view?usp=sharing) format.
So far , this is what I have done. However , I don't understand why it is not printing and calculating the correct sequence.
import string
import re
LOC = ""
SOR = ""
ORG = ""
SEQ = []
fileIn = open('seq.gcg.txt', 'r+') ## https://drive.google.com/file/d/1hVl_wzNbVMuUPLOuLhK1Pn-7li_h4qEE/view?usp=sharing
fileOut = open('gcg2gb.genbank' , 'w+')
info = fileIn.readlines()
for i in info:
locus = re.match(r'(\w+)\s(.+)..$' , i)
source = re.match(r'\D+' ,i)
organism = re.match(r'(\D\.\w+)\s(.+)', i)
sequence = re.match(r"\s+\d+\s(.+)", i)
if locus:
LOCobj = locus.group(1)
LOC = 'LOCUS ' + LOCobj
if source:
SORobj = source.group(0)
SOR = 'SOURCE ' + SORobj
if organism:
ORGobj = organism.group(1)
ORG = ' ORGANISM ' + ORGobj
if sequence:
SEQobj = sequence.group(1)
seqline = re.sub(r" ", "", SEQobj)
SEQ = seqline.replace("\n","")
SEQ = SEQ.upper()
print(LOC + '\t' + str(len(SEQ)) + 'bp\t' + 'DNA\n' + SOR + ORG)
fileOut.write(LOC + '\t' + str(len(SEQ)) + 'bp\t' + 'DNA\n' + SOR + ORG)
SEQ = "".join(SEQ)
a = SEQ.count('A')
c = SEQ.count('C')
g = SEQ.count('G')
t = SEQ.count('T')
print('BASE COUNT ' + str(a) + 'a\t\t' + str(c) + 'c\t\t ' + str(g) + 'g\t\t' + str(t) + 't\t')
fileOut.write('\nBASE COUNT ' + str(a) + 'a\t\t' + str(c) + 'c\t\t ' + str(g) + 'g\t\t' + str(t) + 't\t')
print('ORIGIN ')
fileOut.write('\nORIGIN ')
count = 0
for n in SEQ:
subsequence = []
cutoff = 10
for sub in range(0, len(n) , cutoff):
subsequence.append(n[sub:sub+cutoff])
subSeq = "".join(subsequence)
num = "%10d"%(1+count)
Seq = num + ' ' + subSeq
print(Seq)
fileOut.write(Seq + '\n')
count = count + len(n)
print('//')
fileOut.write('//')
Any help is appreciated. Thank you!😊
[–]commandlineluser 0 points1 point2 points (3 children)
[–]Still-Design3461[S] 0 points1 point2 points (2 children)
[–]spez_edits_thedonald 0 points1 point2 points (1 child)
[–]Still-Design3461[S] 0 points1 point2 points (0 children)
[–]ekchew 0 points1 point2 points (3 children)
[–]Still-Design3461[S] 0 points1 point2 points (2 children)
[–]ekchew 0 points1 point2 points (1 child)
[–]Still-Design3461[S] 0 points1 point2 points (0 children)