Reshape a long table to wide — reshape

Transform a long-format table to wide format, in which each row is a feature and the columns hold expression data or identification information. The output table can be used as input to DEP2.

Usage

reshape_long2wide(
  long_table,
  feature_col,
  expression_col,
  sample_col,
  remove_sample_prefix = T,
  remove_sample_suffix = T,
  shrink_ident_cols = NULL,
  extend_ident_cols = NULL
)

Arguments

long_table: data.frame, a long-format table
feature_col: character(1), the unique identifier of each feature, such as "protein.group" for proteins, "peptide.sequence" for peptides, "precursor.Id" for precursors.
expression_col: character(1), the expression column.
sample_col: character(1), the sample column. The samples in this column become the column names of the transformed output table.
remove_sample_prefix: logical(1), whether to remove the prefix of samples.
remove_sample_suffix: logical(1), whether to remove the suffix of samples.
shrink_ident_cols: NULL or characters, the variables (identification information, like score, protein.names) to store in the transformed table. If a variable has multiple values for a feature, they are pasted into a single character string. Otherwise, it is stored as the unique value.
extend_ident_cols: NULL or characters, the variables (identification information, like score, protein.names) to store in the transformed table. These variables are extended across the wide table in the same way as expression_col: the value for each sample is stored in a separate column of the output table.

Value

A data.frame in a wide format

Examples

# Read in an example long-format export table
long_table <- read.csv(system.file("extdata/DIA-NN_Export.tsv.gz",package = "DEP2"),sep = "\t")
head(long_table)
#>                                                                             File.Name
#> 1  C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_5fmol_inj1.mzML
#> 2  C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_5fmol_inj2.mzML
#> 3  C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_5fmol_inj3.mzML
#> 4 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_25fmol_inj1.mzML
#> 5 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_25fmol_inj2.mzML
#> 6 C:\\Users\\proteo\\Desktop\\RD139_Feb2021\\mzML\\RD139_Narrow_UPS1_25fmol_inj3.mzML
#>   Protein.Group Protein.Ids Protein.Names Genes PG.Quantity PG.Normalised
#> 1        P33898      P33898    G3P2_ECOLI  gapC     1618640       1550550
#> 2        P33898      P33898    G3P2_ECOLI  gapC     1669670       1580930
#> 3        P33898      P33898    G3P2_ECOLI  gapC     1693210       1611420
#> 4        P33898      P33898    G3P2_ECOLI  gapC     1538390       1466180
#> 5        P33898      P33898    G3P2_ECOLI  gapC     1710590       1516660
#> 6        P33898      P33898    G3P2_ECOLI  gapC     1487440       1512390
#>   Genes.Quantity Genes.Normalised Genes.MaxLFQ Genes.MaxLFQ.Unique
#> 1        1618640          1550550      1541640             1541640
#> 2        1669670          1580930      1542750             1542750
#> 3        1693210          1611420      1527090             1527090
#> 4        1538390          1466180      1535660             1535660
#> 5        1710590          1516660      1506910             1506910
#> 6        1487440          1512390      1486210             1486210
#>   Modified.Sequence Stripped.Sequence     Precursor.Id Precursor.Charge
#> 1   AAAENIIPHTTGAAK   AAAENIIPHTTGAAK AAAENIIPHTTGAAK2                2
#> 2   AAAENIIPHTTGAAK   AAAENIIPHTTGAAK AAAENIIPHTTGAAK2                2
#> 3   AAAENIIPHTTGAAK   AAAENIIPHTTGAAK AAAENIIPHTTGAAK2                2
#> 4   AAAENIIPHTTGAAK   AAAENIIPHTTGAAK AAAENIIPHTTGAAK2                2
#> 5   AAAENIIPHTTGAAK   AAAENIIPHTTGAAK AAAENIIPHTTGAAK2                2
#> 6   AAAENIIPHTTGAAK   AAAENIIPHTTGAAK AAAENIIPHTTGAAK2                2
#>       Q.Value Protein.Q.Value  PG.Q.Value  GG.Q.Value Proteotypic
#> 1 8.61030e-05     0.000603500 0.000593472 0.000594884           1
#> 2 1.04118e-04     0.000610128 0.000599520 0.000600601           1
#> 3 1.32547e-04     0.000695894 0.000685401 0.000686813           1
#> 4 8.23452e-05     0.000610128 0.000598802 0.000600601           1
#> 5 8.86211e-05     0.000684932 0.000672947 0.000674764           1
#> 6 1.15447e-04     0.000687285 0.000673401 0.000675676           1
#>   Precursor.Quantity Precursor.Normalised Label.Ratio      RT RT.Start RT.Stop
#> 1             585552               575050           0 26.2347  26.0551 26.4140
#> 2             714354               691530           0 26.2847  26.0457 26.5238
#> 3             553888               537228           0 26.2243  26.0449 26.4035
#> 4             582536               565425           0 27.1925  27.0128 27.3718
#> 5             605009               529203           0 27.0657  26.8864 27.3044
#> 6             483906               488140           0 26.8906  26.7110 27.0699
#>       iRT Predicted.RT Predicted.iRT Lib.Q.Value Ms1.Profile.Corr Ms1.Area
#> 1 26.8898      26.2833       26.8305 8.61772e-05         0.828587   331771
#> 2 26.8898      26.3814       26.8007 8.61772e-05         0.000000        0
#> 3 26.8898      26.2363       26.8867 8.61772e-05         0.792160   941024
#> 4 26.8898      27.1476       26.9223 8.61772e-05         0.938746   599337
#> 5 26.8898      27.0569       26.8871 8.61772e-05         0.969395  1190120
#> 6 26.8898      26.8503       26.9224 8.61772e-05         0.959186   828001
#>   Evidence   CScore Decoy.Evidence MS2.Scan
#> 1  4.84916 0.998191              0    26297
#> 2  3.72305 0.998172              0    26372
#> 3  4.80073 0.998183              0    26297
#> 4  4.54677 0.998704              0    27497
#> 5  4.23394 0.998795              0    27347
#> 6  4.21429 0.998653              0    27122

# Reshape to a wide expression table
wide_table <- reshape_long2wide(long_table,sample_col = "File.Name",
                                feature_col = "Precursor.Id", expression_col = "Precursor.Normalised")
head(wide_table)
#>                          Precursor.Id X5fmol_inj1 X5fmol_inj2 X5fmol_inj3
#> 1                    AAAENIIPHTTGAAK2      575050      691530      537228
#> 2                    AAAENIIPHTTGAAK3      606721      648603      516066
#> 3                   AADGQMVPFSAFSSSR2     1321990     1210640     1340600
#> 4                   AADILRDDLANRGPVR3      508771      506306      488182
#> 5                   AADILRDDLANRGPVR4      198130      161177      149399
#> 6 AAEELEKEGINC(UniMod:4)NLTLLFSFAQAR3     1968710     1853150     2008230
#>   X25fmol_inj1 X25fmol_inj2 X25fmol_inj3
#> 1       565425       529203       488140
#> 2       718276       787171       662301
#> 3      1454250      1394310      1067680
#> 4       496371       146527       495867
#> 5       168853       147297       118770
#> 6      1766760      1757760      1509600

# Without removing the prefix or suffix of the sample names
wide_table2 <- reshape_long2wide(long_table,sample_col = "File.Name",
                                 feature_col = "Precursor.Id", expression_col = "Precursor.Normalised",
                                 remove_sample_prefix = FALSE, remove_sample_suffix = FALSE)
colnames(wide_table2)
#> [1] "Precursor.Id"                                                                 
#> [2] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_5fmol_inj1.mzML" 
#> [3] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_5fmol_inj2.mzML" 
#> [4] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_5fmol_inj3.mzML" 
#> [5] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_25fmol_inj1.mzML"
#> [6] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_25fmol_inj2.mzML"
#> [7] "C..Users.proteo.Desktop.RD139_Feb2021.mzML.RD139_Narrow_UPS1_25fmol_inj3.mzML"

# Keep some identification information
wide_table2 <- reshape_long2wide(long_table,sample_col = "File.Name",
                                 feature_col = "Precursor.Id", expression_col = "Precursor.Normalised",
                                 shrink_ident_cols = c("Protein.Names","Protein.Group",
                                                       "Stripped.Sequence","Modified.Sequence",
                                                       "Precursor.Charge","Evidence"),
                                 extend_ident_cols = "Q.Value"
)
str(wide_table2)
#> 'data.frame':	5720 obs. of  19 variables:
#>  $ Precursor.Id       : chr  "AAAENIIPHTTGAAK2" "AAAENIIPHTTGAAK3" "AADGQMVPFSAFSSSR2" "AADILRDDLANR3" ...
#>  $ X5fmol_inj1        : num  575050 606721 1321990 NA 508771 ...
#>  $ X5fmol_inj2        : num  691530 648603 1210640 NA 506306 ...
#>  $ X5fmol_inj3        : num  537228 516066 1340600 NA 488182 ...
#>  $ X25fmol_inj1       : num  565425 718276 1454250 79901 496371 ...
#>  $ X25fmol_inj2       : num  529203 787171 1394310 94191 146527 ...
#>  $ X25fmol_inj3       : num  488140 662301 1067680 43877 495867 ...
#>  $ Protein.Names      : chr  "G3P2_ECOLI" "G3P2_ECOLI" "ACRB_ECOLI" "FLIG_ECOLI" ...
#>  $ Protein.Group      : chr  "P33898" "P33898" "P31224" "P0ABZ1" ...
#>  $ Stripped.Sequence  : chr  "AAAENIIPHTTGAAK" "AAAENIIPHTTGAAK" "AADGQMVPFSAFSSSR" "AADILRDDLANR" ...
#>  $ Modified.Sequence  : chr  "AAAENIIPHTTGAAK" "AAAENIIPHTTGAAK" "AADGQMVPFSAFSSSR" "AADILRDDLANR" ...
#>  $ Precursor.Charge   : int  2 3 2 3 3 4 3 2 3 3 ...
#>  $ Evidence           : chr  "4.84916;3.72305;4.80073;4.54677;4.23394;4.21429" "3.34231;4.35038;3.88619;5.35532;4.93782;4.49777" "3.49415;3.19957;3.47656;4.03331;4.03771;3.75486" "NA;NA;NA;3.09877;4.047;2.88724" ...
#>  $ Q.Value.5fmol_inj1 : num  8.61e-05 2.47e-04 1.61e-04 NA 1.25e-04 ...
#>  $ Q.Value.5fmol_inj2 : num  0.000104 0.000104 0.000104 NA 0.000104 ...
#>  $ Q.Value.5fmol_inj3 : num  0.000133 0.000192 0.000168 NA 0.000133 ...
#>  $ Q.Value.25fmol_inj1: num  8.23e-05 8.23e-05 9.49e-05 7.03e-04 9.49e-05 ...
#>  $ Q.Value.25fmol_inj2: num  8.86e-05 8.86e-05 8.86e-05 8.86e-05 4.90e-04 ...
#>  $ Q.Value.25fmol_inj3: num  0.000115 0.000115 0.000184 0.000384 0.000115 ...