Reshape a long table to wide
reshape_long2wide.Rd
Transform a long-format table to wide format, in which each row is a feature and the columns hold expression data or identification information. The output table can be used as input for DEP2.
Usage
reshape_long2wide(
long_table,
feature_col,
expression_col,
sample_col,
remove_sample_prefix = T,
remove_sample_suffix = T,
shrink_ident_cols = NULL,
extend_ident_cols = NULL
)
Arguments
- long_table
data.frame, a long-format table
- feature_col
character(1), the name of the column holding the unique identifier of each feature, such as "protein.group" for proteins, "peptide.sequence" for peptides, "precursor.Id" for precursors.
- expression_col
character(1), the expression column.
- sample_col
character(1), the sample column. The samples in this column will become the column names of the transformed output table.
- remove_sample_prefix
logical(1), whether to remove the prefix of samples.
- remove_sample_suffix
logical(1), whether to remove the suffix of samples.
- shrink_ident_cols
NULL or a character vector, the variables (identification information, like score, protein.names) to store in the transformed table. If a variable has multiple values for a feature, they are pasted into a single character string. Otherwise, the unique value is stored.
- extend_ident_cols
NULL or a character vector, the variables (identification information, like score, protein.names) to store in the transformed table. These variables are extended into the wide table in the same way as expression_col: the values for different samples are stored in different columns of the output table.
Examples
# Read in an example long export table
long_table <- read.csv(system.file("extdata/DIA-NN_Export.tsv.gz",package = "DEP2"),sep = "\t")
head(long_table)
#> File.Name
#> 1 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_5fmol_inj1.mzML
#> 2 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_5fmol_inj2.mzML
#> 3 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_5fmol_inj3.mzML
#> 4 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_25fmol_inj1.mzML
#> 5 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_25fmol_inj2.mzML
#> 6 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_25fmol_inj3.mzML
#> Protein.Group Protein.Ids Protein.Names Genes PG.Quantity PG.Normalised
#> 1 P33898 P33898 G3P2_ECOLI gapC 1618640 1550550
#> 2 P33898 P33898 G3P2_ECOLI gapC 1669670 1580930
#> 3 P33898 P33898 G3P2_ECOLI gapC 1693210 1611420
#> 4 P33898 P33898 G3P2_ECOLI gapC 1538390 1466180
#> 5 P33898 P33898 G3P2_ECOLI gapC 1710590 1516660
#> 6 P33898 P33898 G3P2_ECOLI gapC 1487440 1512390
#> Genes.Quantity Genes.Normalised Genes.MaxLFQ Genes.MaxLFQ.Unique
#> 1 1618640 1550550 1541640 1541640
#> 2 1669670 1580930 1542750 1542750
#> 3 1693210 1611420 1527090 1527090
#> 4 1538390 1466180 1535660 1535660
#> 5 1710590 1516660 1506910 1506910
#> 6 1487440 1512390 1486210 1486210
#> Modified.Sequence Stripped.Sequence Precursor.Id Precursor.Charge
#> 1 AAAENIIPHTTGAAK AAAENIIPHTTGAAK AAAENIIPHTTGAAK2 2
#> 2 AAAENIIPHTTGAAK AAAENIIPHTTGAAK AAAENIIPHTTGAAK2 2
#> 3 AAAENIIPHTTGAAK AAAENIIPHTTGAAK AAAENIIPHTTGAAK2 2
#> 4 AAAENIIPHTTGAAK AAAENIIPHTTGAAK AAAENIIPHTTGAAK2 2
#> 5 AAAENIIPHTTGAAK AAAENIIPHTTGAAK AAAENIIPHTTGAAK2 2
#> 6 AAAENIIPHTTGAAK AAAENIIPHTTGAAK AAAENIIPHTTGAAK2 2
#> Q.Value Protein.Q.Value PG.Q.Value GG.Q.Value Proteotypic
#> 1 8.61030e-05 0.000603500 0.000593472 0.000594884 1
#> 2 1.04118e-04 0.000610128 0.000599520 0.000600601 1
#> 3 1.32547e-04 0.000695894 0.000685401 0.000686813 1
#> 4 8.23452e-05 0.000610128 0.000598802 0.000600601 1
#> 5 8.86211e-05 0.000684932 0.000672947 0.000674764 1
#> 6 1.15447e-04 0.000687285 0.000673401 0.000675676 1
#> Precursor.Quantity Precursor.Normalised Label.Ratio RT RT.Start RT.Stop
#> 1 585552 575050 0 26.2347 26.0551 26.4140
#> 2 714354 691530 0 26.2847 26.0457 26.5238
#> 3 553888 537228 0 26.2243 26.0449 26.4035
#> 4 582536 565425 0 27.1925 27.0128 27.3718
#> 5 605009 529203 0 27.0657 26.8864 27.3044
#> 6 483906 488140 0 26.8906 26.7110 27.0699
#> iRT Predicted.RT Predicted.iRT Lib.Q.Value Ms1.Profile.Corr Ms1.Area
#> 1 26.8898 26.2833 26.8305 8.61772e-05 0.828587 331771
#> 2 26.8898 26.3814 26.8007 8.61772e-05 0.000000 0
#> 3 26.8898 26.2363 26.8867 8.61772e-05 0.792160 941024
#> 4 26.8898 27.1476 26.9223 8.61772e-05 0.938746 599337
#> 5 26.8898 27.0569 26.8871 8.61772e-05 0.969395 1190120
#> 6 26.8898 26.8503 26.9224 8.61772e-05 0.959186 828001
#> Evidence CScore Decoy.Evidence MS2.Scan
#> 1 4.84916 0.998191 0 26297
#> 2 3.72305 0.998172 0 26372
#> 3 4.80073 0.998183 0 26297
#> 4 4.54677 0.998704 0 27497
#> 5 4.23394 0.998795 0 27347
#> 6 4.21429 0.998653 0 27122
# Reshape to a wide expression table
wide_table <- reshape_long2wide(long_table,sample_col = "File.Name",
feature_col = "Precursor.Id", expression_col = "Precursor.Normalised")
head(wide_table)
#> Precursor.Id X5fmol_inj1 X5fmol_inj2 X5fmol_inj3
#> 1 AAAENIIPHTTGAAK2 575050 691530 537228
#> 2 AAAENIIPHTTGAAK3 606721 648603 516066
#> 3 AADGQMVPFSAFSSSR2 1321990 1210640 1340600
#> 4 AADILRDDLANRGPVR3 508771 506306 488182
#> 5 AADILRDDLANRGPVR4 198130 161177 149399
#> 6 AAEELEKEGINC(UniMod:4)NLTLLFSFAQAR3 1968710 1853150 2008230
#> X25fmol_inj1 X25fmol_inj2 X25fmol_inj3
#> 1 565425 529203 488140
#> 2 718276 787171 662301
#> 3 1454250 1394310 1067680
#> 4 496371 146527 495867
#> 5 168853 147297 118770
#> 6 1766760 1757760 1509600
# Keep the prefix and suffix of the sample names
wide_table2 <- reshape_long2wide(long_table,sample_col = "File.Name",
feature_col = "Precursor.Id", expression_col = "Precursor.Normalised",
remove_sample_prefix = FALSE, remove_sample_suffix = FALSE)
colnames(wide_table2)
#> [1] "Precursor.Id"
#> [2] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_5fmol_inj1.mzML"
#> [3] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_5fmol_inj2.mzML"
#> [4] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_5fmol_inj3.mzML"
#> [5] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_25fmol_inj1.mzML"
#> [6] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_25fmol_inj2.mzML"
#> [7] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_25fmol_inj3.mzML"
# Keep some identification information
wide_table2 <- reshape_long2wide(long_table,sample_col = "File.Name",
feature_col = "Precursor.Id", expression_col = "Precursor.Normalised",
shrink_ident_cols = c("Protein.Names","Protein.Group",
"Stripped.Sequence","Modified.Sequence",
"Precursor.Charge","Evidence"),
extend_ident_cols = "Q.Value"
)
str(wide_table2)
#> 'data.frame': 5720 obs. of 19 variables:
#> $ Precursor.Id : chr "AAAENIIPHTTGAAK2" "AAAENIIPHTTGAAK3" "AADGQMVPFSAFSSSR2" "AADILRDDLANR3" ...
#> $ X5fmol_inj1 : num 575050 606721 1321990 NA 508771 ...
#> $ X5fmol_inj2 : num 691530 648603 1210640 NA 506306 ...
#> $ X5fmol_inj3 : num 537228 516066 1340600 NA 488182 ...
#> $ X25fmol_inj1 : num 565425 718276 1454250 79901 496371 ...
#> $ X25fmol_inj2 : num 529203 787171 1394310 94191 146527 ...
#> $ X25fmol_inj3 : num 488140 662301 1067680 43877 495867 ...
#> $ Protein.Names : chr "G3P2_ECOLI" "G3P2_ECOLI" "ACRB_ECOLI" "FLIG_ECOLI" ...
#> $ Protein.Group : chr "P33898" "P33898" "P31224" "P0ABZ1" ...
#> $ Stripped.Sequence : chr "AAAENIIPHTTGAAK" "AAAENIIPHTTGAAK" "AADGQMVPFSAFSSSR" "AADILRDDLANR" ...
#> $ Modified.Sequence : chr "AAAENIIPHTTGAAK" "AAAENIIPHTTGAAK" "AADGQMVPFSAFSSSR" "AADILRDDLANR" ...
#> $ Precursor.Charge : int 2 3 2 3 3 4 3 2 3 3 ...
#> $ Evidence : chr "4.84916;3.72305;4.80073;4.54677;4.23394;4.21429" "3.34231;4.35038;3.88619;5.35532;4.93782;4.49777" "3.49415;3.19957;3.47656;4.03331;4.03771;3.75486" "NA;NA;NA;3.09877;4.047;2.88724" ...
#> $ Q.Value.5fmol_inj1 : num 8.61e-05 2.47e-04 1.61e-04 NA 1.25e-04 ...
#> $ Q.Value.5fmol_inj2 : num 0.000104 0.000104 0.000104 NA 0.000104 ...
#> $ Q.Value.5fmol_inj3 : num 0.000133 0.000192 0.000168 NA 0.000133 ...
#> $ Q.Value.25fmol_inj1: num 8.23e-05 8.23e-05 9.49e-05 7.03e-04 9.49e-05 ...
#> $ Q.Value.25fmol_inj2: num 8.86e-05 8.86e-05 8.86e-05 8.86e-05 4.90e-04 ...
#> $ Q.Value.25fmol_inj3: num 0.000115 0.000115 0.000184 0.000384 0.000115 ...