<- "MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKALVLIAFAQYLQQCPFEDHVKLVNEV
seq TEFAKTCVADESAENCDKSLHTLFGDKLCTVATLRETYGEMADCCAKQEPERNECFLQHKDDNPNLPRLV
RPEVDVMCTAFHDNEETFLKKYLYEIARRHPYFYAPELLFFAKRYKAAFTECCQAADKAACLLPKLDELR
DEGKASSAKQRLKCASLQKFGERAFKAWAVARLSQRFPKAEFAEVSKLVTDLTKVHTECCHGDLLECADD
RADLAKYICENQDSISSKLKECCEKPLL EKSHCIAEVENDEMPADLPSLAADFVESKDVCKNYAEAKDVF
LGMFLYEYARRHPDYSVVLLLRLAKTYETTLEKCCAAADPHECYAKVFDEFKPLVEEPQNLIKQNCELFE
QLGEYKFQNALLVRYTKKVPQVSTPTLVEVSRNLGKVGSKCCKHPEAKRMPCAEDYLSVVLNQLCVLHEK
TPVSDRVTKCCTESLVNRRPCFSALEVDETYVPKEFNAETFTFHADICTLSEKERQIKKQTALVELVKHK
PKATKEQLKAVMDDFAAFVEKCCKADDKETCFAEEGKKLVAASQAALGL"
Proteins
Amino acid composition
<- seq[seq != "\n"]
seq # boolean operation != means "no" seq
Next, you get the quantity of a specific letter in the sequence.
library(stringr)
<- str_count(seq, pattern = "A")
aa aa
[1] 63
str_count
command only counts the letter “A” in the sequence. This way, you can get all 20 amino acids by repeating this command.library(stringr)
<- str_count(seq, pattern = "A")
ala <- str_count(seq, pattern = "R")
arg <- str_count(seq, pattern = "N")
asn <- str_count(seq, pattern = "D")
asp <- str_count(seq, pattern = "C")
cys <- str_count(seq, pattern = "E")
glu <- str_count(seq, pattern = "Q")
gln <- str_count(seq, pattern = "G")
gly <- str_count(seq, pattern = "H")
his <- str_count(seq, pattern = "I")
ile <- str_count(seq, pattern = "L")
leu <- str_count(seq, pattern = "K")
lys <- str_count(seq, pattern = "M")
met <- str_count(seq, pattern = "F")
phe <- str_count(seq, pattern = "P")
pro <- str_count(seq, pattern = "S")
ser <- str_count(seq, pattern = "T")
thr <- str_count(seq, pattern = "W")
trp <- str_count(seq, pattern = "Y")
tyr <- str_count(seq, pattern = "V") val
And, to visualize the result in a table:
<- c("Ala", "Arg", "Asn", "Asp", "Cys", "Glu",
aa_3abrev "Gln", "Gly", "His", "Ile", "Leu", "Lys", "Met",
"Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val")
<- c(ala, arg, asn, asp, cys, glu, gln, gly,
aa_quant
his, ile, leu, lys, met, phe, pro, ser, thr, trp,# vector with the quantity of amino acids in the protein
tyr, val) <- data.frame(aa_3abrev, aa_quant) # dataframe with the results
aa_seq colnames(aa_seq) <- c("Tipo", "Qtde") # rename the columns
# Amino acid composition in human serum albumin
# displays the table aa_seq
Tipo Qtde
1 Ala 63
2 Arg 27
3 Asn 17
4 Asp 36
5 Cys 35
6 Glu 62
7 Gln 20
8 Gly 13
9 His 16
10 Ile 9
11 Leu 64
12 Lys 60
13 Met 7
14 Phe 35
15 Pro 24
16 Ser 28
17 Thr 29
18 Trp 2
19 Tyr 19
20 Val 43
library(knitr) # to generate the table
::kable(aa_seq, caption = "Composition of amino acids in human serum albumin.", "pipe") # table knitr
Tipo | Qtde |
---|---|
Ala | 63 |
Arg | 27 |
Asn | 17 |
Asp | 36 |
Cys | 35 |
Glu | 62 |
Gln | 20 |
Gly | 13 |
His | 16 |
Ile | 9 |
Leu | 64 |
Lys | 60 |
Met | 7 |
Phe | 35 |
Pro | 24 |
Ser | 28 |
Thr | 29 |
Trp | 2 |
Tyr | 19 |
Val | 43 |
<- c("A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M",
aa_1abrev "F", "P", "S", "T", "W", "Y", "V")
for (i in aa_1abrev) {
<- str_count(seq, pattern = aa_1abrev)
aa_quant2 return(aa_quant2) # optional syntax for function with only one output
}<- data.frame(aa_3abrev, aa_quant2) # dataframe with results
aa_seq colnames(aa_seq) <- c("Type", "Quantity") # rename columns
::kable(aa_seq, caption = "Amino acid composition in human serum albumin
knitr(using loop).", "pipe") # table
Type | Quantity |
---|---|
Ala | 63 |
Arg | 27 |
Asn | 17 |
Asp | 36 |
Cys | 35 |
Glu | 62 |
Gln | 20 |
Gly | 13 |
His | 16 |
Ile | 9 |
Leu | 64 |
Lys | 60 |
Met | 7 |
Phe | 35 |
Pro | 24 |
Ser | 28 |
Thr | 29 |
Trp | 2 |
Tyr | 19 |
Val | 43 |
str_count
function retains an internal loop, since an element counting function is applied to a sequence, based on a predefined pattern (the vector aa_1abrev, in this case). This way, the script can be further simplified, without the need for the external loop.str_count(seq, pattern = aa_1abrev)
[1] 63 27 17 36 35 62 20 13 16 9 64 60 7 35 24 28 29 2 19 43
<- c(1, 2, 4, 8, 16, 32)
y mean(y)
[1] 10.5
sum(y)
[1] 63
# Estimated average size of a protein from the number of
# amino acid residues
<- function(x) {
prot.size <- x * 110 # 'x' represents the number of amino acids in the protein
MM return(MM)
}prot.size(575) # number of amino acid residues in human albumin
[1] 63250
apply
family of functions, consisting of the apply
, sapply
, tapply
, lapply
, and mapply
commands. Although they have faster processing than external loop functions for using very complex matrices, each one is aimed at a distinct object or specific R situation (returning a list, vector or matrix), allows the use of subset
(data subsets), uses R functions or functions previously defined by the user, and runs in just one command line. These advantages contrast with the use of for loops applied to vectors. However, vectorization works very well when you want to apply or map a function to a vector/matrix/list. When, on the other hand, you want to apply a function whose result depends on more than one vector/matrix/list, the for loop becomes indispensable, as in the titration of weak acids in the Amino Acids chapter.<- aa_seq[4, 2] + aa_seq[6, 2] # AA acids
aa_ac <- aa_seq[2, 2] + aa_seq[9, 2] + aa_seq[12, 2] # AA basics
aa_bas <- aa_seq[14, 2] + aa_seq[18, 2] + aa_seq[19, 2] # AA aromatics
aa_arom <- aa_seq[10, 2] + aa_seq[11, 2] + aa_seq[15, 2] + aa_seq[1, 2] +
aa_alif 20, 2] # Aliphatic AA
aa_seq[<- aa_seq[3, 2] + aa_seq[5, 2] + aa_seq[7, 2] + aa_seq[8, 2] +
aa_pol 13, 2] + aa_seq[16, 2] + aa_seq[17, 2] ## Neutral polar AA aa_seq[
<- str_count(seq, pattern = "") # length of the sequence
aa_tot <- round(c(aa_ac, aa_bas, aa_arom, aa_alif, aa_pol) / aa_tot * 100) class_perc
And now, yes, we build the table.
<- c("acidic", "basic", "aromatic", "aliphatic", "polar")
aa_class <- data.frame(aa_class, class_perc) # dataframe with results
aa_perc colnames(aa_perc) <- c("Class", "%") # rename columns
::kable(aa_perc, caption = "Distribution of amino acid classes
knitrin human albumin.", "pipe") # table
Class | % |
---|---|
acidic | 16 |
basic | 17 |
aromatic | 9 |
aliphatic | 33 |
polar | 24 |
Protein Purification Table & R as a spreadsheet
The procedures for protein purification (or isolation, fractionation) involve techniques such as chemical treatment (ammonium sulfate precipitation, acetone), acid treatment, thermal treatment, dialysis, chromatography (molecular filtration, ion exchange, affinity, reverse phase), among others. To measure the degree of purity of the sample obtained, simple electrophoresis, isoelectric focusing, 2D electrophoresis, use of monoclonal antibodies, and specific activity assays, among others, are normally used.
For the purification table, only the sample mass and sample enzymatic activity vectors obtained at each purification step are required. A simple spreadsheet could be constructed as:
# Preparation of a simple enzyme purification spreadsheet
# (each element of the vector represents a purification step)
# 1. Definition of the main vectors:
<- c(6344, 302, 145, 34, 10, 3.8) # protein, mg
prot.total <- c(200, 122, 106, 70, 53, 24) * 1000 # activity, U
ativ.tot
# 2. Construction of the spreadsheet:
<- data.frame(prot.total, ativ.tot)
purif.plan purif.plan
prot.total ativ.tot
1 6344.0 200000
2 302.0 122000
3 145.0 106000
4 34.0 70000
5 10.0 53000
6 3.8 24000
<- cbind(prot.total, ativ.tot)
purif.plan2 purif.plan2
prot.total ativ.tot
[1,] 6344.0 200000
[2,] 302.0 122000
[3,] 145.0 106000
[4,] 34.0 70000
[5,] 10.0 53000
[6,] 3.8 24000
# Editing column names
colnames(purif.plan2) <- c("totalProt", "enzAtiv")
purif.plan2
totalProt enzAtiv
[1,] 6344.0 200000
[2,] 302.0 122000
[3,] 145.0 106000
[4,] 34.0 70000
[5,] 10.0 53000
[6,] 3.8 24000
<- data.frame(prot.total, ativ.tot, ativ.tot / prot.total)
purif.plan3 options(digits = 1) # option for no. of decimal places
colnames(purif.plan3) <- c("prot.total", "ativ.tot", "ativ.specif")
rownames(purif.plan3) <- c("extr.bruto", "NH4SO2", "acetone",
"Sephadex G-100", "DEAE-cellulose", "C8-phase rev")
purif.plan3
prot.total ativ.tot ativ.specif
extr.bruto 6344 2e+05 32
NH4SO2 302 1e+05 404
acetone 145 1e+05 731
Sephadex G-100 34 7e+04 2059
DEAE-cellulose 10 5e+04 5300
C8-phase rev 4 2e+04 6316
# Simple spreadsheet editing (changes in values and column names)
<- edit(purif.plan3) # or data.entry( ) purif.plan4
# Importing data from another spreadsheet (CSV):
# 1. Import with the name of the desired spreadsheet:
<- read.table("planilha.csv", header = T, sep = ",")
purif.plan5
# 2. Import with the search screen for the desired spreadsheet:
<- frame <- read.csv(file.choose()) purif.plan5
library(tibble)
<- as_tibble(purif.plan3)
purif.plan6 purif.plan6
# A tibble: 6 × 3
prot.total ativ.tot ativ.specif
<dbl> <dbl> <dbl>
1 6344 200000 31.5
2 302 122000 404.
3 145 106000 731.
4 34 70000 2059.
5 10 53000 5300
6 3.8 24000 6316.
# Enzyme purification table with 'dplyr' package:
library(dplyr)
<- mutate(purif.plan, ativ.esp = ativ.tot / prot.total)
purif.plan7 purif.plan7
prot.total ativ.tot ativ.esp
1 6344 2e+05 32
2 302 1e+05 404
3 145 1e+05 731
4 34 7e+04 2059
5 10 5e+04 5300
6 4 2e+04 6316
<- transmute(purif.plan7, ativ.tot = ativ.tot / 1e3)
ativ.tot.kU # specific activity vector in U x 10^3 ativ.tot.kU
ativ.tot
1 200
2 122
3 106
4 70
5 53
6 24
<- mutate(purif.plan7,
purif.plan8 purif = ativ.esp / ativ.esp[1], # purification level
rend.perc = 100 * ativ.tot / ativ.tot[1]
# percentage yield
)
# Converting to the table...
library(knitr)
::kable(purif.plan8, caption = "Purification table for an enzyme", "pipe") knitr
prot.total | ativ.tot | ativ.esp | purif | rend.perc |
---|---|---|---|---|
6344 | 2e+05 | 32 | 1 | 100 |
302 | 1e+05 | 404 | 13 | 61 |
145 | 1e+05 | 731 | 23 | 53 |
34 | 7e+04 | 2059 | 65 | 35 |
10 | 5e+04 | 5300 | 168 | 26 |
4 | 2e+04 | 6316 | 200 | 12 |
library(DT)
<- as.data.frame(purif.plan8)
purif.plan9 rownames(purif.plan9) <- c("extr.bruto", "NH4SO2", "acetone",
"Sephadex G-100", "DEAE-cellulose",
"C8-phase rev") # converts the purification table
# into a spreadsheet for use by the DT package
datatable(purif.plan9) %>% formatRound(1:5, 1) # columns with 1 decimal place
::datatable(purif.plan9, editable = "cell") DT
Interaction of oxygen with myoglobin and hemoglobin
\[ y=\frac{pO_2}{K_{50}+pO_2} \tag{1}\]
On the other hand, the value of \(K_{50}\) for hemoglobin is 26 mmHg, but its function is expressed differently from that of myoglobin:
\[ y=\frac{pO_2^{nH}} {K_{50}^{nH}+pO_2^{nH}} \tag{2}\]
In this Equation 2, nH represents the Hill cooperativity coefficient, which summarizes the energy distributed among the four microscopic dissociation constants of O\(_{2}\) to the four porphyrin centers of hemoglobin (heme groups). Simulating both curves:
<- 2.8
K50 curve(x / (K50 + x),
xlim = c(0, 100),
xlab = "pO2 (mmHg)", ylab = "y", lty = "dotted"
)
<- 26
K50 <- 2.8
nH curve(x^nH / (K50^nH + x^nH),
xlim = c(0, 100),
xlab = "pO2 (mmHg)", ylab = "y", col = "red",
add = TRUE
# "add" allows you to add curves to the graph
) abline(0.5, 0, lty = 2) # add baseline at half saturation
\[ y=\frac{K1*L+2*K2*K1*L^2+3*K3*K2*K1*L^3+4*K4*K3*K2*K1*L^4} {4*(1+K1*L+2*K2*K1*L^2+3*K3*K2*K1*L^3+4*K4*K3*K2*K1*L^4)} \tag{3}\]
\[ Ki_{corr} = \frac{i}{N-1+i}*Ki \tag{4}\]
<- c(0.011, 0.016, 0.118, 0.400) # vector of microscopic constants for
K # dissociation of Hb to O2
<- seq(1, 201, 2) # vector of O2 contents
L
<- c() # initialize an empty vector to output the corrected Ki vector
Kcorr <- 4 # declare the number of sites in Hb
N for (i in 1:N) Kcorr[i] <- i / (N - i + 1) * K[i]
# returns the vector of Ki values corrected for the statistical effect Kcorr
[1] 0.003 0.011 0.177 1.600
<- K[1] * L + 2 * K[2] * K[1] * L^2 + 3 * K[3] * K[2] * K[1] * L^3 +
numer 4 * K[4] * K[3] * K[2] * K[1] * L^4
<- 1 + numer
denom <- numer / denom
y plot(L, y, xlab = "pO2", type = "l", col = 2)
# Calculation of y in each L
<- function(L, Kcorr) {
Yi <- length(Kcorr)
N <- c()
conc 1] <- L * Kcorr[1]
conc[for (i in 2:N) conc[i] <- conc[i - 1] * L * Kcorr[i]
<- sum((1:N) * conc) / N
numer2 <- 1 + sum(conc)
denom2 return(numer2 / denom2)
}# Calculation of y to vector L
<- function(L, Kcorr) {
Y <- c()
YY for (j in 1:length(L)) YY[j] <- Yi(L[j], Kcorr)
return(YY)
}# Application of the function of y to L and graph
<- Y(L, Kcorr)
Yfinal plot(L, Yfinal, type = "l", col = 2, xlab = "pO2", ylab = "y")
Some R packages for studying proteins
seqinr
library seen in the Amino Acids chapter, which computes various values and information for protein sequences, such as pI, hydroxypathy index, residue distribution, among others. The project website [^seqinrSitio-1] contains detailed information for its use. Using the same procedure to obtain the FASTA sequence for lysozyme from the Amino Acids chapter (code CAA32175 on the NCBI site), one can obtain an extensive set of protein information, as exemplified below:library(seqinr)
<- c("KVFERCELARTLKRLGMDGYRGISLANWMCLAKWESGYNTRATNYNAGDRSTDYGIFQ
lysozyme INSRYWCNDGKTPGAVNACHLSCSALLQDNIADAVACAKRVVRDPQGIRAWVAWRNRCQNRDVRQYVQGCGV")
<- s2c(lysozyme) # convert amino acid string sequence to
seq_liso # the seqinr pattern (character vector)
<- seq_liso[seq_liso != "\n"] # elimination of spaces required by
seq_liso2 # seqinr from copy/paste procedure.
seq_liso2
[1] "K" "V" "F" "E" "R" "C" "E" "L" "A" "R" "T" "L" "K" "R" "L" "G" "M" "D"
[19] "G" "Y" "R" "G" "I" "S" "L" "A" "N" "W" "M" "C" "L" "A" "K" "W" "E" "S"
[37] "G" "Y" "N" "T" "R" "A" "T" "N" "Y" "N" "A" "G" "D" "R" "S" "T" "D" "Y"
[55] "G" "I" "F" "Q" "I" "N" "S" "R" "Y" "W" "C" "N" "D" "G" "K" "T" "P" "G"
[73] "A" "V" "N" "A" "C" "H" "L" "S" "C" "S" "A" "L" "L" "Q" "D" "N" "I" "A"
[91] "D" "A" "V" "A" "C" "A" "K" "R" "V" "V" "R" "D" "P" "Q" "G" "I" "R" "A"
[109] "W" "V" "A" "W" "R" "N" "R" "C" "Q" "N" "R" "D" "V" "R" "Q" "Y" "V" "Q"
[127] "G" "C" "G" "V"
pmw(seq_liso2) # molecular weight of the protein
[1] 14700.53
aaa(seq_liso2) # residue distribution
[1] "Lys" "Val" "Phe" "Glu" "Arg" "Cys" "Glu" "Leu" "Ala" "Arg" "Thr" "Leu"
[13] "Lys" "Arg" "Leu" "Gly" "Met" "Asp" "Gly" "Tyr" "Arg" "Gly" "Ile" "Ser"
[25] "Leu" "Ala" "Asn" "Trp" "Met" "Cys" "Leu" "Ala" "Lys" "Trp" "Glu" "Ser"
[37] "Gly" "Tyr" "Asn" "Thr" "Arg" "Ala" "Thr" "Asn" "Tyr" "Asn" "Ala" "Gly"
[49] "Asp" "Arg" "Ser" "Thr" "Asp" "Tyr" "Gly" "Ile" "Phe" "Gln" "Ile" "Asn"
[61] "Ser" "Arg" "Tyr" "Trp" "Cys" "Asn" "Asp" "Gly" "Lys" "Thr" "Pro" "Gly"
[73] "Ala" "Val" "Asn" "Ala" "Cys" "His" "Leu" "Ser" "Cys" "Ser" "Ala" "Leu"
[85] "Leu" "Gln" "Asp" "Asn" "Ile" "Ala" "Asp" "Ala" "Val" "Ala" "Cys" "Ala"
[97] "Lys" "Arg" "Val" "Val" "Arg" "Asp" "Pro" "Gln" "Gly" "Ile" "Arg" "Ala"
[109] "Trp" "Val" "Ala" "Trp" "Arg" "Asn" "Arg" "Cys" "Gln" "Asn" "Arg" "Asp"
[121] "Val" "Arg" "Gln" "Tyr" "Val" "Gln" "Gly" "Cys" "Gly" "Val"
AAstat(seq_liso2, plot = TRUE) # distribution graph, composition
$Compo
* A C D E F G H I K L M N P Q R S T V W Y
0 14 8 8 3 2 11 1 5 5 8 2 10 2 6 14 6 5 9 5 6
$Prop
$Prop$Tiny
[1] 0.3384615
$Prop$Small
[1] 0.5615385
$Prop$Aliphatic
[1] 0.1692308
$Prop$Aromatic
[1] 0.1076923
$Prop$Non.polar
[1] 0.5538462
$Prop$Polar
[1] 0.4461538
$Prop$Charged
[1] 0.2384615
$Prop$Basic
[1] 0.1538462
$Prop$Acidic
[1] 0.08461538
$Pi
[1] 9.2778
# and proportion of residues, pI value
seqinr
include conversion of amino acids to 1- and 3-letter abbreviations (a
and aaa
, respectively), listing of 544 physicochemical properties of the 20 protein amino acids (aaindex
), pK
(self-explanatory, and seen previously), and isolated computation of pI (computePI
) and molecular mass (pmw
), in addition to several others, both for proteomics and genomics.Another interesting R package for studying proteins is Peptides 2, which also computes several physicochemical properties for amino acid sequences, in addition to enabling plotting integration with the GROMACS molecular dynamics package. As for
seqinr
, Peptides
requires conversion of the sequence into string for the recognized vector pattern. The package’s functions include the computation of 66 descriptors for each amino acid in a sequence (aaDescriptors
), the composition of the sequence by residue classification (aaComp
), the computation of the aliphatic index (aIndex
), the hydrophobicity index (hydrophobicity
), the instability index (instalIndex
), the mass/charge ratio (mz
), molecular mass (mw
), and pI (pI
), among others.Among the packages more focused on comparative studies and visualization of structures, as well as for bioinformatics and chemogenomics descriptors, it is worth mentioning
Bio3d
, Autoplotprotein
, protr
, BioMedR
, and UniprotR
, among many others.References
Footnotes
Some programming practices (Best Codes): 1) organize a project into folders (e.g.: data, figures, scripts) or create an R package as an option; 2) create sections in a code to make it easier to find; 3) name the code chunks (pieces of code); 4) place the libraries used, sources, and data calls at the beginning of the code (avoids searching for something necessary for the script to run throughout the code); 5) indent, preferably with 1 or 2 commands per line; 6) always use function parameters inside the function; 7) avoid global parameters; 8) do not use ‘attach’; 8) use parameters with intuitive names (and not x and y; e.g.: function_name); 9) assign names to objects with one of the three naming conventions (e.g.: KiCompet, ki_compet, ki.compet).↩︎
Peptides package: https://cran.r-project.org/web/packages/Peptides/index.html↩︎