Extract data from RegulonDB — get_dataset

This function retrieves data from RegulonDB. Attributes from datasets can be selected and filtered.

get_dataset(
  regulondb,
  dataset = NULL,
  attributes = NULL,
  filters = NULL,
  and = TRUE,
  interval = NULL,
  partialmatch = NULL,
  output_format = "regulondb_result"
)

Arguments

regulondb: A regulondb() object.
dataset: Dataset of interest. Use the function list_datasets for an overview of valid datasets.
attributes: Vector of attributes to be retrieved.
filters: List of filters to be used. The names should correspond to the attribute and the values correspond to the condition for selection.
and: Logical. If TRUE (the default), the filters are combined with the "AND" operator; if FALSE, they are combined with the "OR" operator.
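interval: Name(s) of the filter(s) whose values are to be treated as an interval, i.e. a lower and an upper bound (e.g. interval = "posright" with posright = c("2000", "40000")).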
partialmatch: Name(s) of the filter(s) whose values are string patterns to be matched fully or partially in the query (e.g. partialmatch = "name" with name = "ara" also matches "carA").
output_format: A string specifying the output format. Possible options are "regulondb_result", "GRanges", "DNAStringSet" or "BStringSet".

Value

By default, a regulondb_result object. If specified in the parameter output_format, it can also return either a GRanges object or a Biostrings object (DNAStringSet or BStringSet).

Author

Carmina Barberena Jonas, Jesús Emiliano Sotelo Fonseca, José Alquicira Hernández, Joselyn Chávez

Examples

## Connect to the RegulonDB database if necessary
if (!exists("regulondb_conn")) regulondb_conn <- connect_database()

## Build the regulon db object
e_coli_regulondb <-
    regulondb(
        database_conn = regulondb_conn,
        organism = "E.coli",
        database_version = "1",
        genome_version = "1"
    )

## Obtain all the information from the "GENE" dataset
get_dataset(e_coli_regulondb, dataset = "GENE")
#> regulondb_result with 4602 rows and 33 columns
#>                id        name     bnumber          gi               synonyms
#>       <character> <character> <character> <character>            <character>
#> 1    ECK120000001         alr       b4053          NA ECK4045,EG10001,alr5..
#> 2    ECK120000002        modB       b0764          NA ECK0753,EG10002,b076..
#> 3    ECK120000003        cysZ       b2413          NA  ECK2408,EG10003,b2413
#> 4    ECK120000004         dfp       b3639          NA ECK3629,EG10004,b363..
#> 5    ECK120000005        dcuB       b4123          NA ECK4116,EG10006,b412..
#> ...           ...         ...         ...         ...                    ...
#> 4598 ECK125276531        yqfH       b4753          NA ECK4610,G0-16747,b4753
#> 4599 ECK125276532        yliM       b4736          NA ECK4593,G0-16731,b4736
#> 4600 ECK125276533        ynfS       b4750          NA ECK4607,G0-16744,b4750
#> 4601 ECK125276534        ylcJ       b4733          NA ECK4590,G0-16728,b4733
#> 4602 ECK125276535        sdhX       b4764          NA ECK4621,G0-17009,b47..
#>        posleft  posright      strand           dna_sequence
#>      <integer> <integer> <character>            <character>
#> 1      4265782   4266861     forward ATGCAAGCGGCAACTGTTGT..
#> 2       795862    796551     forward ATGATACTGACCGATCCAGA..
#> 3      2531463   2532224     forward ATGGTTTCATCATTCACATC..
#> 4      3812731   3813951     forward ATGAGCCTGGCCGGTAAAAA..
#> 5      4347404   4348744     reverse ATGTTATTTACTATCCAACT..
#> ...        ...       ...         ...                    ...
#> 4598   3032939   3033010     reverse ATGATTAACCAAGTGAGCGT..
#> 4599    850332    850397     forward ATGGAAACGTTCTGTTACAT..
#> 4600   1642122   1642211     forward ATGAATAACCCCGTCTGTCT..
#> 4601    568695    568844     forward ATGAGCCTCGTTTTATGCTT..
#> 4602    765050    765150     forward ATATCTGTAATAAGAAATAG..
#>            external_db_link      evidence_reference   product_id
#>                 <character>             <character>  <character>
#> 1    ASAP,http://asap.aha..                      NA ECK120004477
#> 2    ASAP,http://asap.aha..                      NA ECK120004478
#> 3    ASAP,http://asap.aha..                      NA ECK120004479
#> 4    ASAP,http://asap.aha..                      NA ECK120004480
#> 5    ASAP,http://asap.aha..                      NA ECK120004481
#> ...                     ...                     ...          ...
#> 4598                     NA                      NA ECK125276571
#> 4599                     NA                      NA ECK125276556
#> 4600                     NA                      NA ECK125276575
#> 4601                     NA                      NA ECK125276577
#> 4602                     NA \t\t\t30591570,30541135 ECK125276565
#>                product_name        product_synonym       product_sequence
#>                 <character>            <character>            <character>
#> 1        alanine racemase 1 Alr,alanine racemase.. MQAATVVINRRALRHNLQRL..
#> 2    molybdate ABC transp..         ChlJ,ModB,TslJ MILTDPEWQAVLLSLKVSSL..
#> 3    sulfate:H<sup>+</sup..                   CysZ MVSSFTSAPRSGFYYFAQGW..
#> 4    fused 4'-phosphopant..              CoaBC,Dfp MSLAGKKIVLGVSGGIAAYK..
#> 5    anaerobic C4-dicarbo..              DcuB,GenF MLFTIQLIIILICLFYGARK..
#> ...                     ...                    ...                    ...
#> 4598           protein YqfH                   YqfH MINQVSVYRQPPVLSGCRQV..
#> 4599           protein YliM                   YliM  METFCYMKWPVRHHKSRRVSH
#> 4600 Qin prophage; protei..                   YnfS MNNPVCLDDWLIGFKSLCCT..
#> 4601           protein YlcJ                   YlcJ MSLVLCFLLMSLFFMYSFVL..
#> 4602 small regulatory RNA..              RybD,SdhX AUAUCUGUAAUAAGAAAUAG..
#>      molecular_weigth isoelectric_point celullar_location
#>             <numeric>         <numeric>       <character>
#> 1              39.153             7.059           cytosol
#> 2              24.939            10.716    inner membrane
#> 3              29.305             9.809    inner membrane
#> 4              43.438             7.585           cytosol
#> 5              47.935             7.878    inner membrane
#> ...               ...               ...               ...
#> 4598            2.617            10.452                NA
#> 4599            2.716            10.877                NA
#> 4600            3.193             4.113    inner membrane
#> 4601            5.943            10.891                NA
#> 4602               NA                NA                NA
#>                product_note product_type             go_index_bp
#>                 <character>  <character>             <character>
#> 1                        NA           NA GO:0006522\tGO:000836..
#> 2    ModB is the predicte..           NA  GO:0015689\tGO:0055085
#> 3    CysZ is a high affin..           NA GO:0000103\tGO:000827..
#> 4    The <i>dfp</i> (<i>c..           NA GO:0008152\tGO:001593..
#> 5    DcuB is a C4-dicarbo..           NA GO:0009061\tGO:001574..
#> ...                     ...          ...                     ...
#> 4598 YqfH was identified ..           NA                      NA
#> 4599 YliM was identified ..           NA                      NA
#> 4600 YnfS was identified ..           NA                      NA
#> 4601 YlcJ was identified ..           NA                      NA
#> 4602 The small regulatory..    small RNA              GO:0040033
#>                   go_desc_bp             go_index_cc             go_desc_cc
#>                  <character>             <character>            <character>
#> 1     alanine metabolic pr..              GO:0005829                cytosol
#> 2     molybdate ion transp.. GO:0005886\tGO:000588.. plasma membrane,inte..
#> 3     sulfate assimilation.. GO:0005886\tGO:000588.. plasma membrane,inte..
#> 4    metabolic process\tco..  GO:0005737\tGO:0005829      cytoplasm,cytosol
#> 5     anaerobic respiratio.. GO:0005886\tGO:000588.. plasma membrane,inte..
#> ...                      ...                     ...                    ...
#> 4598                      NA                      NA                     NA
#> 4599                      NA                      NA                     NA
#> 4600                      NA GO:0005886\tGO:001602.. plasma membrane,memb..
#> 4601                      NA                      NA                     NA
#> 4602  negative regulation ..                      NA                     NA
#>                  go_index_mf              go_desc_mf product_external_db_links
#>                  <character>             <character>               <character>
#> 1    GO:0003824\tGO:000878.. catalytic activity\ta..   ECOCYC\thttp://biocyc..
#> 2                 GO:0015098  molybdate ion transm..   ECOCYC\thttp://biocyc..
#> 3     GO:0009675\tGO:0015116  high-affinity sulfat..   ECOCYC\thttp://biocyc..
#> 4    GO:0003824\tGO:000463.. catalytic activity\tp..   DIP\thttp://dip.doe-m..
#> 5     GO:0005469\tGO:0015556  succinate:fumarate a..   ECOCYC\thttp://biocyc..
#> ...                      ...                     ...                       ...
#> 4598                      NA                      NA   ECOCYC\thttp://biocyc..
#> 4599                      NA                      NA   ECOCYC\thttp://biocyc..
#> 4600                      NA                      NA   ECOCYC\thttp://biocyc..
#> 4601                      NA                      NA   ECOCYC\thttp://biocyc..
#> 4602  GO:0005515\tGO:0048027 protein binding\tmRNA..   ECOCYC\thttp://biocyc..
#>                product_ev_ref    operon_id          operon_name
#>                   <character>  <character>          <character>
#> 1                          NA ECK120029421                  alr
#> 2                          NA ECK120014822               modABC
#> 3      \t\t\t6341507,29792261 ECK120029422                 cysZ
#> 4                          NA ECK120030583                  dfp
#> 5      \t\t\t20860483,1512189 ECK120014710            dcuB-fumB
#> ...                       ...          ...                  ...
#> 4598                       NA           NA                   NA
#> 4599                       NA ECK125285808            yliM-ompX
#> 4600                       NA           NA                   NA
#> 4601                       NA           NA                   NA
#> 4602 IMP\t\tInferred from m.. ECK125285815 sdhCDAB-sucABCD-sdhX
#>                   tu_promoter    sigma_factor                 gene_tf
#>                   <character>     <character>             <character>
#> 1    ECK120029337\t\tECK120..              NA                      NA
#> 2    ECK120009489\tmodABC\t.. Sigma28,Sigma70 ECK120011235\tCRP ECK..
#> 3                          NA              NA                      NA
#> 4    ECK120030182\tdfp\tECK..              NA                      NA
#> 5     ECK120009650\tdcuB-fu..         Sigma70 ECK120011235\tCRP ECK..
#> ...                       ...             ...                     ...
#> 4598                       NA              NA                      NA
#> 4599  ECK120034934\tyliM-om..         Sigma70                      NA
#> 4600                       NA              NA                      NA
#> 4601                       NA              NA                      NA
#> 4602  ECK120009705\tsdhCDAB..         Sigma70 ECK120011345\tArcA EC..

## Get the attributes posright and name from the "GENE" dataset
get_dataset(e_coli_regulondb,
    dataset = "GENE",
    attributes = c("posright", "name")
)
#> regulondb_result with 4602 rows and 2 columns
#>       posright        name
#>      <integer> <character>
#> 1      4266861         alr
#> 2       796551        modB
#> 3      2532224        cysZ
#> 4      3813951         dfp
#> 5      4348744        dcuB
#> ...        ...         ...
#> 4598   3033010        yqfH
#> 4599    850397        yliM
#> 4600   1642211        ynfS
#> 4601    568844        ylcJ
#> 4602    765150        sdhX

## From the "GENE" dataset, get the name, strand, posright, product name
## and id of all genes whose name partially matches "ara", whose strand is
## "forward" and whose right position (posright) is between 2000 and 40000
get_dataset(
    e_coli_regulondb,
    dataset = "GENE",
    attributes = c("name", "strand", "posright", "product_name", "id"),
    filters = list(
        name = c("ara"),
        strand = c("forward"),
        posright = c("2000", "40000")
    ),
    and = TRUE,
    partialmatch = "name",
    interval = "posright"
)
#> regulondb_result with 1 row and 5 columns
#>          name      strand  posright           product_name           id
#>   <character> <character> <integer>            <character>  <character>
#> 1        carA     forward     30799 carbamoyl phosphate .. ECK120000130