Compare commits

..

3 Commits

Author          SHA1        Message                                                Date
Nicholas Novak  8d8a1e0634  change: Finished almost my last draft on the paper     2023-12-14 02:23:51 -08:00
Nicholas Novak  c95b02bd7e  feat: Added start of paper to comps                    2023-12-13 23:54:33 -08:00
Nicholas Novak  6fa03619ee  chore: Changed project names for the GitHub release    2023-12-13 23:48:53 -08:00
22 changed files with 1042 additions and 25 deletions

View File

@@ -9,8 +9,8 @@ import (
 	log "github.com/sirupsen/logrus"
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 func populateStorageDir(

View File

@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"testing"
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/server"
 )
 var hash server.HashServer

View File

@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"testing"
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/server"
 )
 var inmemory server.InMemoryServer

View File

@@ -5,7 +5,7 @@ import (
 	"strconv"
 	"strings"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/world"
 	"github.com/Tnze/go-mc/save"
 	"github.com/Tnze/go-mc/save/region"
 )

View File

@@ -5,7 +5,7 @@ import (
 	"path/filepath"
 	"strings"
-	"git.nicholasnovak.io/nnovak/spatial-db/storage"
+	"github.com/NickyBoy89/spatial-db/storage"
 	log "github.com/sirupsen/logrus"
 	"github.com/spf13/cobra"

View File

@@ -1,9 +1,9 @@
 package main
 import (
-	"git.nicholasnovak.io/nnovak/spatial-db/connector"
-	"git.nicholasnovak.io/nnovak/spatial-db/loading"
-	"git.nicholasnovak.io/nnovak/spatial-db/visualization"
+	"github.com/NickyBoy89/spatial-db/connector"
+	"github.com/NickyBoy89/spatial-db/loading"
+	"github.com/NickyBoy89/spatial-db/visualization"
 	"github.com/spf13/cobra"
 )

View File

@@ -4,7 +4,7 @@ import (
 	"fmt"
 	"testing"
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/server"
 )
 var disk server.SimpleServer

(binary image, 22 KiB; not shown)

paper/document.tex (new file, 547 lines)

@@ -0,0 +1,547 @@
\documentclass[10pt,twocolumn]{article}
\usepackage{oxycomps}
\bibliography{references}
\pdfinfo{
/Title (SpatialDB: A Database for Storing Dense Three-Dimensional Voxel Structures)
/Author (Nicholas Novak)
}
\title{SpatialDB: A Database for Storing Dense Three-Dimensional Voxel Structures}
\author{Nicholas Novak}
\affiliation{Occidental College}
\email{nnovak@oxy.edu}
\begin{document}
\maketitle
\section{Introduction and Problem Context}
% What my project is
In my senior comprehensive project, I have designed and implemented a database
application built specifically to store complex shapes as ``voxels'', or
three-dimensional pixels.
% Applications of voxels
A voxel\cite{enwiki:1186283262} represents a single point or cube in a
three-dimensional grid, at a variable size. This allows voxels to
approximately model many three-dimensional structures and to reduce the
computational complexity of analyzing those shapes, which has led to many
data-related use cases outside of computer science. For example, to model the
inner workings of the brain, neuroscientists track oxygen concentration through
neural tissue on a voxel grid as part of fMRI studies\cite{norman2006beyond},
and movie studios such as DreamWorks use voxel data structures to model light
reflections for visual effects\cite{museth2013vdb}. The output of an MRI scan
in a hospital is a very high-resolution voxel grid. Most recently, machine
learning models are being trained on the LIDAR data from self-driving
cars\cite{li2020deep} in order to better process their environments. However,
voxels are not often thought of as a way to permanently store
three-dimensional shapes, and existing research focuses mainly on efficiently
representing and processing shapes. My approach takes this problem of voxel
storage and representation and turns it into a problem of database design.
\subsection{Using Minecraft as a Model for a Database}
% The problems with Minecraft
Minecraft\footnote{https://www.minecraft.net/en-us}, released in 2009, is a
sandbox game played in a world entirely composed of cubic voxels, where the
player has complete freedom to manipulate the world by building, destroying, or
exploring any part of it. I am focusing this database on the requirements of
Minecraft because the game poses challenges that traditional databases do not
consider. Primarily, the world of Minecraft is infinite in the horizontal $x$
and $z$ axes and fixed only in the $y$ axis, so the database can never hold
the entire world at once. The world also contains a denser voxel grid than
many other applications, meaning that far more of the blocks in the world are
filled than empty.
A game is also a real-time application, which means that any performance issues
will be immediately apparent to the user. Most databases are evaluated only on
their speed, but because the Minecraft server processes new information 20
times per second, the game has a time budget of 50ms to handle all game logic,
including the storing of data. Less time spent processing the world's data
means more time is freed up for the game to do other work, although finishing
earlier makes no difference to the end user as long as the work stays under
the 50ms budget. Most databases are not built around this requirement: even if
they are faster on average, their complexity means they cannot guarantee that
every operation finishes within this time limit.
These constraints also prevent Minecraft from taking advantage of a cache:
since the number of different operations that can be performed on the world is
effectively unbounded, remembering previous operations will rarely help the
system's performance. Minecraft also provides a good benchmark for the
database, because the unpredictability of players stresses the system's
ability to return results in a variety of settings.
\section{Technical Background}
\subsection{What is a database?}
When I refer to the concept of a database, I mean a program that sits more or
less as a ``black box'' between the user and a software application, storing
any data the application requires. In most existing applications, this role is
filled by a category of programs called ``relational databases'', which offer
a very general-purpose way to store user data that is highly connected.
For instance, a person stored in a relational database can be efficiently
linked to any of their associated information, such as a name or age.
% The model of a database
In database terms, any amount of data added to the database is called a
``write'', data retrieved from the database is called a ``read'', and any
questions asked, such as ``how many people have done this'', are called
``queries''. Developers ask these questions through computer languages, one such
example being Structured Query Language or SQL, which allow
the database to be queried efficiently.
\subsection{Challenges With Existing Databases}
% Software development and SQL
Most software engineering projects start with a simple front-end and back-end,
typically implemented with some sort of Model-View-Controller architecture and
connected to a relational SQL database \cite{sqliteOnlyDatabase}. This idea was
popularized by frameworks such as Ruby on Rails and Django, where the model was
most often backed by structures within the database. These frameworks allowed
software developers to focus on writing business logic without worrying about
the inner workings of the database. Many start-ups were built this way,
including GitHub \cite{githubSingleSQL}, which recently moved off its single
SQL database after 13 years, citing performance issues.
% Challenges with working with SQL: Performance
Using a single SQL-speaking database can be a significant advantage for
development speed, but the database can struggle to keep up with the demands
of the application as its performance requirements grow.
% Caching
As soon as this happens, companies typically put smaller caching applications in
front of their database, such as \verb|Redis|\footnote{https://redis.io/},
\verb|memcached|\cite{nishtala2013scaling}, or \verb|TAO| \cite{bronson2013tao},
to allow the application to remember some of the commonly asked questions and
reduce load on the database by not having to do the same work again.
\subsubsection{The Complexity of General-Purpose Databases}
% What is being done about this
Modern SQL databases are also very complex. Three of the most popular SQL
databases, PostgreSQL, MySQL, and SQLite, have 1.4 million
\footnote{https://wiki.postgresql.org/wiki/GSoC\_2018, in reference to the
text ``PostgreSQL is over 1.3M lines of code and some of the code paths can be
tricky to reach.''}, 2.9 million
\footnote{https://www.openhub.net/p/mysql}, and 150,000
\footnote{https://www.sqlite.org/testing.html} lines of code respectively.
% Why are databases inefficient?
Why are databases so complex? Most of the complexity stems from the fact that
because these systems are so general-purpose, they cannot assume anything
about the data stored in them. Finding an efficient plan to answer each query
is a known NP-hard problem\cite{chatterji2002complexity}, so to keep itself
fast, the database must construct this plan with a complex set of
approximations based on the few assumptions it can make, which leads to
ever-growing complexity.
% Impossible to maintain
With this complexity, it is impossible for a single person to understand the
complete inner workings of a database. Thus, looking after the company's
database often becomes the job of a dedicated person at companies that can
afford it, or of entire teams of engineers at larger organizations such as
Google\cite{googlePerfTeam}.
% Intro to special-purpose databases
What happens at larger companies that can afford more engineering time and
have a specific problem that a traditional database cannot solve? Typically,
this leads to the creation of special-purpose database solutions. For
instance, the global scale of iCloud and Apple's cloud services required them
to create FoundationDB\cite{zhou2021foundationdb}. A different set of
challenges in the Facebook inbox led to the creation of Apache
Cassandra\cite{lakshman2010cassandra}, which is optimized to receive many
messages at the expense of search speed, since searches happen far less
frequently.
\subsubsection{The Special-Purpose Database}
Limiting a database's design to a specific use-case can make the development
process much simpler, to the point where it can be done by a single person, and
can offer higher performance. The first question that needs to be asked is
whether the application is \textit{write-heavy} or \textit{read-heavy}.
Read-heavy applications occur often in web development: most social media
platforms have far more users reading content than writing new content for
the platform. In contrast, write-heavy applications are often seen in analytics
workloads, where data is written from many sources and analyzed infrequently by
users.
My application has a relatively even balance of writes and reads, and I
evaluated three different storage data structures before choosing to implement
my own.
% Special-purpose databases
Recently, companies such as Tigerbeetle\cite{tigerbeetleDesign} have taken this
domain-driven approach to database design even further, designing a database
from the ground up for financial accounting. It outperforms a reference MySQL
implementation, processing 1,757 accounting transactions per second to MySQL's
76 \cite{tigerbeetlePerf}. This highly specialized and domain-specific
approach to creating databases is what my project is based on: creating a
database around the challenges posed by the game Minecraft.
\subsubsection{Key-Value Databases}
One of the main architectures that I considered for my project is a design
called a key-value store\cite{kvdatabase}, which would store the relationship
of a single voxel to its value. Many other voxel databases use this method to
achieve constant-time operations on retrieving points, meaning that regardless
of the size of the dataset, the database will always be able to return a
result in the same amount of time. This structure is behind many of the
high-performance caches that are commonly used to speed up web applications,
such as Redis and RocksDB\cite{dong2021rocksdb}. In order to provide high
speeds for this data, the key-value mappings are usually stored in main
memory, which is far more expensive and limited than the system's disk drive,
but offers a speed advantage of several orders of magnitude\cite{latencyKnow}.
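As a concrete illustration, a minimal in-memory key-value voxel store can be
sketched in Go with a hash map. All names here are hypothetical rather than
taken from any of the systems cited above; the point is that a hash map gives
expected constant-time reads and writes regardless of dataset size.
\begin{lstlisting}
// Position identifies a single voxel in the world.
type Position struct{ X, Y, Z int }

// BlockState stands in for whatever per-voxel data the
// application stores; an integer id keeps the sketch simple.
type BlockState int

// KeyValueStore maps each voxel directly to its state.
type KeyValueStore struct {
	blocks map[Position]BlockState
}

func NewKeyValueStore() *KeyValueStore {
	return &KeyValueStore{blocks: make(map[Position]BlockState)}
}

// Read returns the state at p; missing keys yield the zero
// ("empty") state, matching an unbounded, mostly-empty world.
func (s *KeyValueStore) Read(p Position) BlockState {
	return s.blocks[p]
}

// Write stores b at p in expected constant time.
func (s *KeyValueStore) Write(p Position, b BlockState) {
	s.blocks[p] = b
}
\end{lstlisting}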
\section{Prior Work}
\subsection{Voxels for Efficient Computation}
Most existing literature on the topic of using voxels to store shapes focuses
on using the voxel grid for efficient computation. Since voxel points are
completely independent of each other, parallel processors, which are
increasingly common on consumer hardware, can take advantage of this for a
speedup. In VDB\cite{museth2013vdb}, Museth demonstrates that by modeling a
sparse voxel grid at different resolutions, a computer cluster can efficiently
approximate a physical structure such as a cloud, in order to calculate
expensive lighting operations.
% Parallel processing on voxels
Williams\cite{williams1992voxel} expands upon the uses of a voxel database to
model graph and mesh-based problems. By taking advantage of the parallelism in
the grid, many problems can be reframed in the representation of voxels and
solved far more efficiently. This model, however, assumes that every voxel is
stored in shared memory, making the approach viable only for problems that fit
on one machine and are computationally expensive rather than data-intensive.
\subsection{Storing Large Voxel Data Sets}
Another approach to the problem of storing voxel data is the distributed
approach of Gorte et al. \cite{gorte2023analysis}. Since memory is limited
within one computer, the workload can be split up between many servers, which
allows very large datasets to be worked on from a single workstation through
an API. This method keeps many of the same performance considerations, but
also assumes that the voxel data is not very dense, and uses a
three-dimensional data structure called an octree, which allows the user to
change the resolution of the data that they are working on. In the paper,
Gorte acknowledges the need to split large datasets up into smaller regions,
which is similar to the concept of ``chunks'' in my implementation.
\subsection{Chunk Systems in Other Games}
The decision to use chunks to represent game data has many justifications. As
\cite{gorte2023analysis} mentions, an infinite grid of voxels needs to be
broken up in a way that lets applications store data efficiently, and many
other games converge on this same implementation. Another voxel-based game,
Veloren\footnote{https://veloren.net}, uses the same chunk-based system,
although it differs in its storage method. The game switches between several
different storage implementations in each chunk, depending on how dense or
sparse the voxel data within the chunk is. For sparser data, the game stores
block information in a simple key-value hash map. As the number of voxels
increases, the game further breaks this information up, and creates several
smaller sections within the chunk. Finally, for very dense data, the game
stores a compressed version using Zlib compression\cite{veloren32}. This
suggests many options for data compression in my database, and also shows how
the database could be adapted to store sparser structures more efficiently if
the focus of the project ever needs to change.
Since this game is based not on Minecraft but on an independent project named
Cube World, the fact that it arrives at a similar data structure supports the
performance rationale for using such a structure. The benchmarks that they
publish suggest about an order-of-magnitude improvement over using a key-value
store.
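A minimal sketch of this density-adaptive strategy, in the spirit of Veloren's
approach rather than its actual code, reusing the hypothetical
\verb|Position| and \verb|BlockState| types from the earlier key-value
listing:
\begin{lstlisting}
// Chunk stores voxels sparsely in a hash map until an
// illustrative threshold, then promotes them to a dense flat
// array; very dense chunks could additionally be compressed
// with zlib before being written to disk.
type Chunk struct {
	sparse map[Position]BlockState // small voxel counts
	dense  []BlockState            // nil until promoted
}

const denseThreshold = 1024 // hypothetical cut-over point

func (c *Chunk) Set(p Position, b BlockState) {
	if c.dense == nil {
		if c.sparse == nil {
			c.sparse = make(map[Position]BlockState)
		}
		if len(c.sparse) < denseThreshold {
			c.sparse[p] = b
			return
		}
		c.promote()
	}
	c.dense[localIndex(p)] = b
}

// promote copies the sparse map into a flat 16x16x256 array.
func (c *Chunk) promote() {
	c.dense = make([]BlockState, 16*16*256)
	for p, b := range c.sparse {
		c.dense[localIndex(p)] = b
	}
	c.sparse = nil
}

// localIndex assumes chunk-local coordinates:
// 0 <= X,Z < 16 and 0 <= Y < 256.
func localIndex(p Position) int {
	return (p.Y*16+p.Z)*16 + p.X
}
\end{lstlisting}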
\subsection{Previous Special-Purpose Databases}
The design of my database was also inspired by the LSM tree and data-driven
design of Tigerbeetle\cite{tigerbeetleDesign}, which is also able to handle
concurrent operations on the same data. Another database,
CockroachDB\footnote{https://www.cockroachlabs.com/product/}, uses a key-value
mapping backend to store SQL-like tables and rows. Finally, the design of
caching layers in modern SQL caches such as Noria\cite{gjengset2018noria}
shows that it is possible to efficiently remember the complex queries found in
SQL, and replicate these in real-time.
\section{Methods}
\subsection{The Interface for the Database}
For developers to interact with the database, it is implemented as a library
that provides a simple application programming interface to read and write
data, consisting of the following operations (a sketch of such an interface in
Go follows the list). The performance considerations for each operation are
discussed in the subsections below.
\begin{itemize}
\item Read a single block
\item Write a single block
\item Change a range of blocks
\item Read a pre-defined ``chunk'' of blocks
\end{itemize}
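The sketch below mirrors these four operations as a Go interface. The names
and signatures are illustrative rather than the implementation's exact API;
it reuses the hypothetical \verb|Position| and \verb|BlockState| types
sketched earlier, and \verb|ChunkData| is a placeholder for a chunk's decoded
contents.
\begin{lstlisting}
// ChunkPos identifies one 16x16 chunk column in the world.
type ChunkPos struct{ X, Z int }

// ChunkData is a placeholder for a chunk's decoded contents.
type ChunkData struct{}

// VoxelStore mirrors the four public operations above.
type VoxelStore interface {
	// Read and write a single voxel.
	ReadBlock(p Position) (BlockState, error)
	WriteBlock(p Position, b BlockState) error
	// Set every voxel in an axis-aligned box to one state.
	FillRegion(min, max Position, b BlockState) error
	// Fetch a pre-defined 16x16x256 column of voxels at once.
	ReadChunk(pos ChunkPos) (ChunkData, error)
}
\end{lstlisting}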
\subsection{Reading and Writing a Single Voxel}
The process of updating the data for a single point in the world starts with
the voxel's position. Because the world is infinite on the horizontal $x$ and
$z$ axes, it is divided into a system of ``chunks'': fixed-size $16\times16$
columns of voxels, 256 voxels high. The size of these chunks is chosen so that
they are large enough to be efficiently cached, and many operations can occur
within the same chunk, but not so large that the hundred or so chunks sent to
the user upon joining the world cause a network slowdown. Given a point's $x$
and $z$ positions, the chunk that the voxel belongs to can be found with a
fast modulus operation, in constant time.
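A minimal sketch of this lookup, assuming 16-block-wide chunks and using the
\verb|ChunkPos| type from the interface sketch above:
\begin{lstlisting}
// chunkOf finds the chunk containing a voxel in constant
// time. An arithmetic shift by 4 divides by 16 with floor
// semantics, so negative world coordinates also land in the
// correct chunk (Go's / operator would truncate toward zero).
func chunkOf(p Position) ChunkPos {
	return ChunkPos{X: p.X >> 4, Z: p.Z >> 4}
}
\end{lstlisting}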
To fetch the data for that chunk, the database reads it from disk. The
database stores this information in combined files that I call ``unity files''
(shown in figure \ref{fig:unity}), which consist of a single file on disk in
which the encoded data for each chunk is stored with a start index and a size,
so that the \verb|seek| syscall can be used to efficiently query the data
while keeping only one file open. This scheme replaced a previous system that
stored chunk files separately, because the filesystem had a hard time
searching through the hundreds of thousands of chunks in larger worlds. The
start position and size are kept in an auxiliary hash map from every chunk's
position to its metadata within the unity file. This structure uses a minimal
amount of memory, and also allows a chunk to be fetched from disk in a
constant amount of time and a constant number of disk reads.
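The following sketch shows the shape of this lookup, assuming the layout in
figure \ref{fig:unity}; the names are illustrative rather than the
implementation's exact code, and the \verb|os| and \verb|fmt| packages are
assumed imported.
\begin{lstlisting}
// chunkEntry locates one chunk's bytes inside the unity file.
type chunkEntry struct {
	Start int64 // byte offset of the encoded chunk
	Size  int64 // encoded size in bytes
}

// UnityFile pairs one open file handle with an in-memory map
// from chunk position to each entry's offset and size.
type UnityFile struct {
	f       *os.File
	entries map[ChunkPos]chunkEntry
}

// readChunk fetches one chunk's bytes. ReadAt combines the
// seek and the read, so every fetch costs a constant number
// of disk reads regardless of world size.
func (u *UnityFile) readChunk(pos ChunkPos) ([]byte, error) {
	entry, ok := u.entries[pos]
	if !ok {
		return nil, fmt.Errorf("no chunk stored at %v", pos)
	}
	buf := make([]byte, entry.Size)
	if _, err := u.f.ReadAt(buf, entry.Start); err != nil {
		return nil, err
	}
	return buf, nil
}
\end{lstlisting}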
\begin{figure}
\centering
\includegraphics[width=8cm]{unity-file.drawio.png}
\caption{The Layout of a Unity File}
\label{fig:unity}
\end{figure}
Each chunk is further divided into sections: each chunk consists of 16 stacked
$16\times16\times16$ cubes of voxels, for a total of 4096 block states per
section. Using the voxel's $y$ position, the section for a block can be found
with another modulus. Once the section is found, a perfect hash function maps
the voxel's position to an array index within the section. Both of these steps
are done in constant time.
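A sketch of both steps follows; the exact bit layout of the hash is
illustrative, not necessarily the implementation's.
\begin{lstlisting}
// sectionIndex picks one of the 16 stacked 16x16x16 sections
// from a voxel's y coordinate (equivalent to y / 16 for
// non-negative y).
func sectionIndex(y int) int { return y >> 4 }

// blockIndex is a perfect hash of chunk-local coordinates:
// every (x, y, z) inside a section maps to a distinct slot
// in [0, 4096), with no collisions.
func blockIndex(p Position) int {
	x, y, z := p.X&15, p.Y&15, p.Z&15
	return y<<8 | z<<4 | x
}
\end{lstlisting}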
Every section additionally stores a look-up table that maps a \textit{palette
index} to the state of a block. When the value for a point is retrieved from
the section, the value returned is not the block's state, but an index into
this palette. The palette lookup is done in constant time, and when a block
that needs an additional palette state is added to the section, that state is
added in constant time as well. This palette is what makes efficiently
changing large regions of blocks in the world possible.
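A sketch of a section and its palette, under the same assumptions as the
previous listings:
\begin{lstlisting}
// Section pairs a fixed-size array of palette indices with a
// small palette mapping those indices to full block states.
type Section struct {
	blocks          [4096]uint16 // one palette index per voxel
	palette         []BlockState // palette index -> block state
	needsCompaction bool         // set by region fills (below)
}

// Get resolves a voxel in two constant-time steps: array
// lookup, then palette lookup.
func (s *Section) Get(p Position) BlockState {
	return s.palette[s.blocks[blockIndex(p)]]
}

// paletteIndexOf returns the index for b, appending a new
// palette entry when the state has not been seen. Palettes
// stay small, so the scan is cheap, and the append is
// amortized constant time.
func (s *Section) paletteIndexOf(b BlockState) uint16 {
	for i, state := range s.palette {
		if state == b {
			return uint16(i)
		}
	}
	s.palette = append(s.palette, b)
	return uint16(len(s.palette) - 1)
}
\end{lstlisting}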
Once the value of the point is found in the palette, the value can be returned
to the user. A visual diagram of this process can be found in figure
\ref{fig:lookup}.
\begin{figure}
\centering
\includegraphics[width=8cm]{block-search.drawio.png}
\caption{The process of looking up a single block}
\label{fig:lookup}
\end{figure}
Changing a region of blocks is also a common operation in the database, and
one that is not restricted to a specific range of chunks. It is implemented by
overwriting the palettes for a specific region: by pointing every palette
index at the same value, every block in the section is effectively set at
once. This does, however, create the need for an additional ``compaction''
step, in which the palette is shrunk to remove duplicate values and every
block within the section is updated to point to the correct index in the
palette. Compaction happens on the next write that inserts a block into the
section; because only this fixed-size section needs to be changed, the
operation remains constant-time.
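Continuing the hypothetical \verb|Section| type from above, a sketch of the
fill and of the deferred compaction:
\begin{lstlisting}
// Fill sets every voxel in the section to one state by
// rewriting the palette only: all existing indices now
// resolve to b, so the 4096-entry blocks array is untouched.
func (s *Section) Fill(b BlockState) {
	for i := range s.palette {
		s.palette[i] = b
	}
	s.needsCompaction = true
}

// compact deduplicates the palette and rewrites the blocks
// array to match. It touches only this fixed-size section,
// so the cost is constant with respect to world size; it is
// run on the next insert into a filled section.
func (s *Section) compact() {
	seen := make(map[BlockState]uint16)
	var compacted []BlockState
	for i, idx := range s.blocks {
		state := s.palette[idx]
		newIdx, ok := seen[state]
		if !ok {
			newIdx = uint16(len(compacted))
			compacted = append(compacted, state)
			seen[state] = newIdx
		}
		s.blocks[i] = newIdx
	}
	s.palette = compacted
	s.needsCompaction = false
}
\end{lstlisting}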
Finally, retrieving a single chunk is efficient, because the database already
stores chunks as discrete units and can serialize them directly to the client.
% \cite{vohra2016apache}.
\section{Evaluation Metrics}
\subsection{Reading Single Voxels}
Reads and writes of single voxels are the most common fundamental operations
in my database, and the database should handle them in the same amount of time
regardless of the size of the world. Both my implementation and the simpler
key-value store meet this criterion.
\subsection{Changing Regions of Voxels}
Changing regions of voxels should be possible in linear time. This is because
resetting or changing a region of voxels is important while drawing shapes of
various resolutions: lower-resolution shapes are less precise, and thus can be
written faster.
\subsection{Memory Requirements}
The memory requirement is set quite low, at 256MB, in order to force the
database to store most of its data on disk and to limit its memory usage to
important caching features. This limit was chosen with larger datasets in mind
that do not fit within the memory of a single machine; memory is much more
expensive than disk storage, and relying on it would restrict analysis to
smaller voxel grids.
\subsection{Reading Regions of Voxels}
The ability to retrieve large shapes from the database is important, because
exporting a shape requires an operation that can do so efficiently. This
operation must run in constant time because, as Gorte\cite{gorte2023analysis}
identifies, many researchers might want to work on the same dataset, and
exporting all of this data point by point would be inefficient for the
database to process. In the use-case of Minecraft, this allows the server to
support many more players at once by not sending every individual block to
each client. This requirement is not met by the key-value database, but is met
by my implementation, which sends the chunks as stored on disk.
\subsection{Reading Neighboring Blocks}
The last common operation in most voxel databases is reading points that
neighbor another point. This is important because many voxel shapes
approximate cubic shapes \cite{gorte2023analysis}, and in Minecraft, players
are constantly affecting voxels that are near each other.
\section{Results and Discussion}
Benchmarking on my laptop, inserting values at various spreads around the
voxel world, I obtained the results in figure \ref{fig:reads}, comparing an
in-memory implementation of SpatialDB, the disk-based implementation of
SpatialDB, and a memory-based key-value implementation:
\begin{figure}
\centering
\begin{tabular}{c | c | c | c}
Spread of Points & In-memory & Disk & KeyValue\\
\hline
128 & 4275 & 4146669 & 176.7\\
512 & 4184 & 3319162 & 190.6\\
2048 & 2613 & 422938 & 184.8\\
65536 & 2382 & 18814 & 186.1
\end{tabular}
\caption{Time (in ns) to operate on a single voxel, based on the size of the
world (spread)}
\label{fig:reads}
\end{figure}
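The sketch below shows the general shape of such a per-operation measurement
using Go's benchmarking framework. The names are hypothetical and this is not
my exact benchmark code; it assumes the \verb|VoxelStore| interface sketched
in the Methods section and the \verb|math/rand| and \verb|testing| packages.
\begin{lstlisting}
var store VoxelStore // any implementation of the earlier sketch

// BenchmarkVoxelReads times single-voxel reads at randomly
// sampled positions; spread controls how far apart the
// sampled voxels are, matching the table's first column.
func BenchmarkVoxelReads(b *testing.B) {
	const spread = 2048
	points := make([]Position, b.N)
	for i := range points {
		points[i] = Position{
			X: rand.Intn(spread) - spread/2,
			Y: rand.Intn(256),
			Z: rand.Intn(spread) - spread/2,
		}
	}
	b.ResetTimer() // exclude setup from the reported ns/op
	for i := 0; i < b.N; i++ {
		_, _ = store.ReadBlock(points[i])
	}
}
\end{lstlisting}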
These results show that scaling remains consistent between the in-memory
version and the key-value store, although my implementation is more than an
order of magnitude slower than the latter. This scaling is not matched by the
performance of the on-disk database. Originally, I thought these poor results
came from doing no caching of the chunk files, which would have made searches
much slower, but that still does not explain why performance improves for
larger worlds. This led me to implement a disk cache, and then the final
implementation, in which I combined all the data into one large file and
selectively read sections from that file; both had similar results. This leads
me to believe that as the tested points grow more spread out, since the world
is only so large, many points fall outside the loaded chunks and instantly
return empty.
This behavior could likely be addressed by a change in caching methods and by
remembering the data for more chunks, but that still does not address the slow
speed of accessing data in the first place. The slow speeds most likely come
from decoding the JSON data stored on disk, which is relatively large at about
4 megabytes in size. A custom encoding method could be designed to replace
this scheme, or the entire storage space in the chunks could be pre-allocated,
so that chunk data could be retrieved without decoding the entire chunk.
However, this would require a much more constrained data layout, and would
limit the implementation of different voxels. Additionally, compression would
reduce the amount of data sent from the disk to the application.
\section{Ethical Considerations}
\subsection{Considerations of Computing Resources}
Since a database sits at the core of most software systems, it is important
that it is designed to work on a wide variety of computers, in order to ensure
all parties are able to take advantage of the improvements. I designed my
database to run on entry-level commodity hardware, as well as alongside
existing application programs that can require far more resources.
Additionally, by focusing on disk storage, which is far cheaper than
equivalent capacities of memory, the database allows researchers or
individuals to run large datasets on a single machine.
My system targets far lower memory usage than existing commercial applications
\footnote{\url{https://docs.oracle.com/en/database/oracle/oracle-database/12.2/ntdbi/oracle-database-minimum-hardware-requirements.html}}
\footnote{\url{https://wiki.lustre.org/Lustre_Server_Requirements_Guidelines}}.
In the design of my application I had to take advantage of as much of the
computing hardware as possible, while making sure that the approachability and
accessibility of the application did not decrease as a result.
\subsection{Considerations of Complexity}
Another factor to consider in the implementation of my database is how complex
the existing systems are. Two of the most popular SQL databases, PostgreSQL
and MySQL, have 1.4 and 4.4 million lines of code respectively
\footnote{\url{https://news.ycombinator.com/item?id=24813239}}.
Because these systems are so complex, fewer people can effectively work with
and maintain them, effectively limiting this role to larger companies that can
afford teams of people to solve these problems for them. By avoiding the
significant complexity that comes with caching logic, and keeping a simple
implementation for the server, I allow more companies and developers to use
this database for their own needs and grow with it. In addition, many
decisions were made to help with debugging, including the choice of JSON
serialization for the chunk data, which lets users read the contents of files
more easily and recover potentially corrupted data.
\subsection{Considerations in Security}
Because databases are very complex, there is a risk that exposing a server
over the internet through the Minecraft game server leaves it open to attack.
While this is a large issue, an even more important implication is the
difficulty of configuring the database correctly: since these databases are
extremely complex, it is very hard to make sure they are configured securely.
There have been many high-profile data
breaches\footnote{\url{https://www.zdnet.com/article/hacker-ransoms-23k-mongodb-databases-and-threatens-to-contact-gdpr-authorities/}}
caused by a single misconfigured server, even at larger companies that have
dedicated security teams.
I mitigate this risk by implementing the database in a memory-safe programming
language, Go, which should remove the class of memory-unsafety bugs that
accounts for around 70\% of all bugs in the Chromium browser
engine\footnote{\url{https://www.chromium.org/Home/chromium-security/memory-safety/}},
which is written entirely in non-memory-safe C++.
However, there is still the possibility that information stored in the
database is exposed, whether because the database was not secured or because
of an application error. Here, my database follows the threat model of many
other databases, and leaves security up to the user implementing the
application. Features such as encryption would provide an additional layer of
security, but would also likely decrease performance and increase complexity,
which are harmful to security in their own ways. Ultimately, I rely on a set
of defaults that does not make any assumptions about the security of the
system.
\subsection{Considerations in Fairness}
In the implementation of databases, it can often be beneficial to make certain
operations faster at the expense of others that are not performed as often.
For instance, if I notice that researchers often write more to the database
than they read, and adjust the application accordingly, I can take advantage
of this assumption to speed up the most common operations. However, this can
be problematic if what I choose to sacrifice affects a certain group of users.
This tradeoff occurs so often in computer science that it is described in
terms of percentiles. For instance, the 50th-percentile latency is the time
under which half of all requests complete, while the 99th-percentile latency
describes the experience of the slowest 1\% of requests. The impossibility of
making such a decision without hurting anyone is written about by Google
\cite{dean2013tail}, who must make decisions like this at their scale.
My database keeps a consistent set of guarantees about the complexity of the
basic operations, and provides constant-time behavior for most of them.
\subsection{Considerations in Accessibility}
In creating this system, I also have to consider whether players will be
required to own a certain type of computer. Requiring a certain operating
system or a more powerful machine would cut off many of the people who were
playing the game before.
However, with the performance goals above, as well as an implementation in a
portable language, the program is available on as many systems as the Go
compiler supports.
\printbibliography
\end{document}

paper/oxycomps.sty (new file, 102 lines)

@@ -0,0 +1,102 @@
% A simple two-column LaTeX style for Occidental College's CS senior projects.
% Based on latex8.sty by Paolo.Ienne@di.epfl.ch
\usepackage{times} % use Times as the default font
% define bold 11pt Times font for second-order headings
\font\elvbf = ptmb scaled 1100
\usepackage[style=numeric,sorting=nyt]{biblatex} % format the bibliography nicely
\usepackage{xpatch} % used to patch \textcite
% change \textcite to do family-name (year)
\xpatchbibmacro{textcite}
{\printnames{labelname}}
{\printnames{labelname} (\printfield{year})}
{}
{}
% sort bibliography by last name
\DeclareNameAlias{default}{family-given}
\usepackage{amsfonts} % provides many math symbols/fonts
\usepackage{amsmath} % provides many math environments
\usepackage{amssymb} % provides many math symbols/fonts
\usepackage{caption} % fixes caption spacing issues
\usepackage[usenames,dvipsnames]{color} % allows for colored text
\usepackage{enumitem} % allows adjustment of list spacing
\usepackage{graphicx} % allows insertion of graphics
\usepackage{hyperref} % creates links within the page and to URLs
\usepackage{listings} % provides the lstlisting environment
\usepackage{url} % formats URLs properly
\usepackage{verbatim} % provides the comment environment
% set dimensions of columns, gap between columns, and paragraph indent
\setlength{\textheight}{8.875in}
\setlength{\textwidth}{6.875in}
\setlength{\columnsep}{0.3125in}
\setlength{\topmargin}{0in}
\setlength{\headheight}{0in}
\setlength{\headsep}{0in}
\setlength{\parindent}{1em}
\setlength{\oddsidemargin}{-.304in}
\setlength{\evensidemargin}{-.304in}
% remove the space between list items
\setlist{noitemsep}
% style code listings
\lstset{
basicstyle=\ttfamily\footnotesize,
breaklines=true,
showstringspaces=false
}
% style the title
\def\@maketitle{
\newpage
\begin{center}
{\Large \bf \@title \par}
% add two empty lines at the end of the title
\vspace*{2\baselineskip}
{
\large
\begin{tabular}[t]{c}
\@author
\end{tabular}
\par
}
% add small space at the end of the author name
\vspace*{.5em}
{
\ifx \@empty \@email
\else
\texttt{\@email}
\par
\vspace*{.25em}
\fi
\ifx \@empty \@affiliation
\else
\@affiliation
\fi
}
% add empty line at the end of the title block
\vspace*{\baselineskip}
\end{center}
}
% style the abstract
\def\abstract{%
\centerline{\large\bf Abstract}%
\vspace*{\baselineskip}%
}
% define email and affiliation
\def\email#1{\gdef\@email{#1}}
\gdef\@email{}
\def\affiliation#1{\gdef\@affiliation{#1}}
\gdef\@affiliation{}
% correct heading spacing and type
\def\section{\@startsection {section}{1}{\z@}
{14pt plus 2pt minus 2pt}{14pt plus 2pt minus 2pt} {\large\bf}}
\def\subsection{\@startsection {subsection}{2}{\z@}
{13pt plus 2pt minus 2pt}{13pt plus 2pt minus 2pt} {\elvbf}}

paper/references.bib (new file, 315 lines)

@@ -0,0 +1,315 @@
// Introduction
@misc{sqliteOnlyDatabase,
title={SQLite the only database you will ever need in most cases},
url={https://unixsheikh.com/articles/sqlite-the-only-database-you-will-ever-need-in-most-cases.html},
journal={https://unixsheikh.com/},
publisher={https://unixsheikh.com/},
author={Sheikh, Unix},
year={2021},
month={Apr},
}
@misc{ enwiki:1181180757,
author = "{Wikipedia contributors}",
title = "Modelviewcontroller --- {Wikipedia}{,} The Free Encyclopedia",
year = "2023",
howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Model%E2%80%93view%E2%80%93controller&oldid=1181180757}",
note = "[Online; accessed 13-December-2023]"
}
@online{googlePerfTeam,
author = {{Google Performance Team}},
title = {System Performance},
month = {May},
year = {2023},
url = {https://research.google/teams/system-performance/},
}
// Applications of voxels
@misc{ enwiki:1186283262,
author = "{Wikipedia contributors}",
title = "Voxel --- {Wikipedia}{,} The Free Encyclopedia",
year = "2023",
howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Voxel&oldid=1186283262}",
note = "[Online; accessed 13-December-2023]"
}
@article{norman2006beyond,
title={Beyond mind-reading: multi-voxel pattern analysis of fMRI data},
author={Norman, Kenneth A and Polyn, Sean M and Detre, Greg J and Haxby, James V},
journal={Trends in cognitive sciences},
volume={10},
number={9},
pages={424--430},
year={2006},
publisher={Elsevier}
}
@article{museth2013vdb,
title={VDB: High-resolution sparse volumes with dynamic topology},
author={Museth, Ken},
journal={ACM transactions on graphics (TOG)},
volume={32},
number={3},
pages={1--22},
year={2013},
publisher={ACM New York, NY, USA}
}
@article{li2020deep,
title={Deep learning for lidar point clouds in autonomous driving: A review},
author={Li, Ying and Ma, Lingfei and Zhong, Zilong and Liu, Fei and Chapman, Michael A and Cao, Dongpu and Li, Jonathan},
journal={IEEE Transactions on Neural Networks and Learning Systems},
volume={32},
number={8},
pages={3412--3432},
year={2020},
publisher={IEEE}
}
// Literature Review
@article{williams1992voxel,
title={Voxel databases: A paradigm for parallelism with spatial structure},
author={Williams, Roy D},
journal={Concurrency: Practice and Experience},
volume={4},
number={8},
pages={619--636},
year={1992},
publisher={Wiley Online Library}
}
@article{gorte2023analysis,
title={Analysis of very large voxel datasets},
author={Gorte, Ben},
journal={International Journal of Applied Earth Observation and Geoinformation},
volume={119},
pages={103316},
year={2023},
publisher={Elsevier}
}
@online{tigerbeetleDesign,
author = {{Tigerbeetle Developers}},
title = {Tigerbeetle Design Document},
month = {July},
year = {2020},
url = {https://github.com/tigerbeetledb/tigerbeetle/blob/main/docs/DESIGN.md},
}
@online{tigerbeetlePerf,
author = {{Tigerbeetle Developers}},
title = {Tigerbeetle Design Document},
month = {July},
year = {2020},
url = {https://github.com/tigerbeetledb/tigerbeetle/blob/main/docs/HISTORY.md},
}
@online{nomiSlowME,
author = {{Jokercortex}},
title = {Moron's Guide to Managing Mechanical Monstrosities},
month = {Feb},
year = {2020},
url = {https://github.com/Nomifactory/Guides/blob/latest/guides/AE2ForDummies.md},
}
@misc{btree,
author = "{Wikipedia contributors}",
title = "B-tree --- {Wikipedia}{,} The Free Encyclopedia",
year = "2023",
url = "https://en.wikipedia.org/w/index.php?title=B-tree&oldid=1146616935",
note = "[Online; accessed 13-May-2023]"
}
@misc{kvdatabase,
author = "{Wikipedia contributors}",
title = "Keyvalue database --- {Wikipedia}{,} The Free Encyclopedia",
year = "2023",
url = "https://en.wikipedia.org/w/index.php?title=Key%E2%80%93value_database&oldid=1135560734",
note = "[Online; accessed 13-May-2023]"
}
@online{latencyKnow,
author = "Jeff Dean",
title = "Latency Numbers Every Programmer Should Know",
year = "2018",
url = "https://gist.github.com/jboner/2841832",
note = "[Online; accessed 12-Dec-2023]"
}
@online{cockroachData,
author = {{CockroachDB Developers}},
title = {Structured data encoding in CockroachDB SQL},
year = {2017},
month = Mar,
url = {https://github.com/cockroachdb/cockroach/blob/master/docs/tech-notes/encoding.md},
}
@article{dong2021rocksdb,
title={Rocksdb: Evolution of development priorities in a key-value store serving large-scale applications},
author={Dong, Siying and Kryczka, Andrew and Jin, Yanqin and Stumm, Michael},
journal={ACM Transactions on Storage (TOS)},
volume={17},
number={4},
pages={1--32},
year={2021},
publisher={ACM New York, NY}
}
@misc{lsm,
author = "{Wikipedia contributors}",
title = "Log-structured merge-tree --- {Wikipedia}{,} The Free Encyclopedia",
year = "2023",
url = "https://en.wikipedia.org/w/index.php?title=Log-structured_merge-tree&oldid=1153046573",
note = "[Online; accessed 13-May-2023]"
}
@online{lsmUses,
author = {{Braden Groom}},
title = {Understanding LSM Trees: What Powers Write-Heavy Databases},
month = Jun,
year = {2020},
url = {https://yetanotherdevblog.com/lsm/},
}
@article{chang2008bigtable,
title={Bigtable: A distributed storage system for structured data},
author={Chang, Fay and Dean, Jeffrey and Ghemawat, Sanjay and Hsieh, Wilson C and Wallach, Deborah A and Burrows, Mike and Chandra, Tushar and Fikes, Andrew and Gruber, Robert E},
journal={ACM Transactions on Computer Systems (TOCS)},
volume={26},
number={2},
pages={1--26},
year={2008},
publisher={ACM New York, NY, USA}
}
@inproceedings{abadi2008column,
title={Column-stores vs. row-stores: how different are they really?},
author={Abadi, Daniel J and Madden, Samuel R and Hachem, Nabil},
booktitle={Proceedings of the 2008 ACM SIGMOD international conference on Management of data},
pages={967--980},
year={2008}
}
@article{athanassoulis2019optimal,
title={Optimal column layout for hybrid workloads},
author={Athanassoulis, Manos and B{\o}gh, Kenneth S and Idreos, Stratos},
journal={Proceedings of the VLDB Endowment},
volume={12},
number={13},
pages={2393--2407},
year={2019},
publisher={VLDB Endowment}
}
@inproceedings{armbrust2021lakehouse,
title={Lakehouse: a new generation of open platforms that unify data warehousing and advanced analytics},
author={Armbrust, Michael and Ghodsi, Ali and Xin, Reynold and Zaharia, Matei},
booktitle={Proceedings of CIDR},
volume={8},
year={2021}
}
@article{dean2013tail,
title={The tail at scale},
author={Dean, Jeffrey and Barroso, Luiz Andr{\'e}},
journal={Communications of the ACM},
volume={56},
number={2},
pages={74--80},
year={2013},
publisher={ACM New York, NY, USA}
}
@misc{githubSingleSQL,
title={Partitioning githubs relational databases to handle scale},
url={https://github.blog/2021-09-27-partitioning-githubs-relational-databases-scale/},
journal={The GitHub Blog},
publisher={GitHub},
author={Maurer, Thomas},
year={2021},
month={Sep},
}
@inproceedings{bronson2013tao,
title={$\{$TAO$\}$: Facebook's distributed data store for the social graph},
author={Bronson, Nathan and Amsden, Zach and Cabrera, George and Chakka, Prasad and Dimov, Peter and Ding, Hui and Ferris, Jack and Giardullo, Anthony and Kulkarni, Sachin and Li, Harry and others},
booktitle={2013 $\{$USENIX$\}$ Annual Technical Conference ($\{$USENIX$\}$$\{$ATC$\}$ 13)},
pages={49--60},
year={2013}
}
@inproceedings{chatterji2002complexity,
title={On the complexity of approximate query optimization},
author={Chatterji, Sourav and Evani, Sai Surya Kiran and Ganguly, Sumit and Yemmanuru, Mahesh Datt},
booktitle={Proceedings of the twenty-first ACM SIGMOD-SIGACT-SIGART symposium on Principles of database systems},
pages={282--292},
year={2002}
}
@inproceedings{gjengset2018noria,
title={Noria: dynamic, partially-stateful data-flow for high-performance web applications.},
author={Gjengset, Jon and Schwarzkopf, Malte and Behrens, Jonathan and Ara{\'u}jo, Lara Timb{\'o} and Ek, Martin and Kohler, Eddie and Kaashoek, M Frans and Morris, Robert Tappan},
booktitle={OSDI},
volume={18},
pages={213--231},
year={2018}
}
How storage works in database systems, and the evolution of how data is stored
@article{stonebraker2005goes,
title={What goes around comes around},
author={Stonebraker, Michael and Hellerstein, Joey},
journal={Readings in database systems},
volume={4},
pages={1},
year={2005}
}
@article{vohra2016apache,
title={Apache parquet},
author={Vohra, Deepak and Vohra, Deepak},
journal={Practical Hadoop Ecosystem: A Definitive Guide to Hadoop-Related Frameworks and Tools},
pages={325--335},
year={2016},
publisher={Springer}
}
@inproceedings{nishtala2013scaling,
title={Scaling memcache at facebook},
author={Nishtala, Rajesh and Fugal, Hans and Grimm, Steven and Kwiatkowski, Marc and Lee, Herman and Li, Harry C and McElroy, Ryan and Paleczny, Mike and Peek, Daniel and Saab, Paul and others},
booktitle={Presented as part of the 10th $\{$USENIX$\}$ Symposium on Networked Systems Design and Implementation ($\{$NSDI$\}$ 13)},
pages={385--398},
year={2013}
}
@inproceedings{zhou2021foundationdb,
title={Foundationdb: A distributed unbundled transactional key value store},
author={Zhou, Jingyu and Xu, Meng and Shraer, Alexander and Namasivayam, Bala and Miller, Alex and Tschannen, Evan and Atherton, Steve and Beamon, Andrew J and Sears, Rusty and Leach, John and others},
booktitle={Proceedings of the 2021 International Conference on Management of Data},
pages={2653--2666},
year={2021}
}
@article{lakshman2010cassandra,
title={Cassandra: a decentralized structured storage system},
author={Lakshman, Avinash and Malik, Prashant},
journal={ACM SIGOPS operating systems review},
volume={44},
number={2},
pages={35--40},
year={2010},
publisher={ACM New York, NY, USA}
}
@misc{veloren32,
title = "This Week In Veloren 32",
author = "AngelOnFira",
month = "September",
year = "2019",
url = "https://veloren.net/blog/devblog-32/"
}

paper/unity-file.drawio (new file, 53 lines)

@@ -0,0 +1,53 @@
<mxfile host="Electron" modified="2023-12-14T09:51:26.683Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/22.0.2 Chrome/114.0.5735.289 Electron/25.8.4 Safari/537.36" etag="iOiW5F6x8VUFkmnMflTj" version="22.0.2" type="device">
<diagram name="Page-1" id="TafIrdbnw2cWi4bqOyK2">
<mxGraphModel dx="1114" dy="999" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-1" value="" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="40" y="20" width="120" height="200" as="geometry" />
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-2" value="Chunk 1" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff2cc;strokeColor=#d6b656;" vertex="1" parent="1">
<mxGeometry x="50" y="50" width="100" height="40" as="geometry" />
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-3" value="Chunk 2" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff2cc;strokeColor=#d6b656;" vertex="1" parent="1">
<mxGeometry x="50" y="100" width="100" height="40" as="geometry" />
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-6" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;" edge="1" parent="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="100" y="210" as="sourcePoint" />
<mxPoint x="100" y="150" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-7" value="Metadata" style="swimlane;fontStyle=0;childLayout=stackLayout;horizontal=1;startSize=30;horizontalStack=0;resizeParent=1;resizeParentMax=0;resizeLast=0;collapsible=1;marginBottom=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="230" y="40" width="140" height="90" as="geometry" />
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-8" value="Start: 0, Size: 2" style="text;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;spacingLeft=4;spacingRight=4;overflow=hidden;points=[[0,0.5],[1,0.5]];portConstraint=eastwest;rotatable=0;whiteSpace=wrap;html=1;" vertex="1" parent="f65CT_Lw4DzFi_7RwwvQ-7">
<mxGeometry y="30" width="140" height="30" as="geometry" />
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-9" value="Start: 2, Size 3" style="text;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;spacingLeft=4;spacingRight=4;overflow=hidden;points=[[0,0.5],[1,0.5]];portConstraint=eastwest;rotatable=0;whiteSpace=wrap;html=1;" vertex="1" parent="f65CT_Lw4DzFi_7RwwvQ-7">
<mxGeometry y="60" width="140" height="30" as="geometry" />
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-11" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=1;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="f65CT_Lw4DzFi_7RwwvQ-8" target="f65CT_Lw4DzFi_7RwwvQ-2">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="190" y="85" />
<mxPoint x="190" y="50" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-12" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=1;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="f65CT_Lw4DzFi_7RwwvQ-9" target="f65CT_Lw4DzFi_7RwwvQ-3">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="190" y="115" />
<mxPoint x="190" y="100" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="f65CT_Lw4DzFi_7RwwvQ-14" value="Unity File" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
<mxGeometry x="70" y="20" width="60" height="30" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>

paper/unity-file.drawio.png (new binary image, 17 KiB; not shown)

View File

@@ -1,8 +1,8 @@
 package server
 import (
-	"git.nicholasnovak.io/nnovak/spatial-db/storage"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/storage"
+	"github.com/NickyBoy89/spatial-db/world"
 	log "github.com/sirupsen/logrus"
 )

View File

@@ -6,8 +6,8 @@ import (
 	"os"
 	"path/filepath"
-	"git.nicholasnovak.io/nnovak/spatial-db/storage"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/storage"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 type InMemoryServer struct {

View File

@@ -7,8 +7,8 @@ import (
 	"os"
 	"path/filepath"
-	"git.nicholasnovak.io/nnovak/spatial-db/storage"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/storage"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 const fileCacheSize = 8

View File

@@ -8,7 +8,7 @@ import (
 	"strings"
 	"sync"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 func ReadChunkFromFile(chunkFile *os.File) (world.ChunkData, error) {

View File

@@ -3,7 +3,7 @@ package storage
 import (
 	"errors"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 type StorageServer interface {

View File

@@ -7,7 +7,7 @@ import (
 	"io/fs"
 	"os"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 // A `UnityFile` is a collection of chunks, stored as a single file on disk

View File

@@ -7,7 +7,7 @@ import (
 	"reflect"
 	"testing"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 func TestCreateUnityFile(t *testing.T) {

View File

@@ -4,8 +4,8 @@ import (
 	"errors"
 	"testing"
-	"git.nicholasnovak.io/nnovak/spatial-db/storage"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/storage"
+	"github.com/NickyBoy89/spatial-db/world"
 )
 func readBlockTemplate(

View File

@@ -4,9 +4,9 @@ import (
 	"errors"
 	"strings"
-	"git.nicholasnovak.io/nnovak/spatial-db/server"
-	"git.nicholasnovak.io/nnovak/spatial-db/storage"
-	"git.nicholasnovak.io/nnovak/spatial-db/world"
+	"github.com/NickyBoy89/spatial-db/server"
+	"github.com/NickyBoy89/spatial-db/storage"
+	"github.com/NickyBoy89/spatial-db/world"
 	tea "github.com/charmbracelet/bubbletea"
 	"github.com/charmbracelet/lipgloss"
 	"github.com/spf13/cobra"