initial commit
This commit is contained in:
commit
d025c735fa
12 changed files with 46418 additions and 0 deletions
1
.envrc
Normal file
1
.envrc
Normal file
|
@ -0,0 +1 @@
|
||||||
|
use flake
|
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
model-out
|
||||||
|
.direnv
|
||||||
|
|
||||||
|
# Added by cargo
|
||||||
|
|
||||||
|
/target
|
43
Cargo.lock
generated
Normal file
43
Cargo.lock
generated
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 4
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cc"
|
||||||
|
version = "1.2.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7"
|
||||||
|
dependencies = [
|
||||||
|
"shlex",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfasttext-sys"
|
||||||
|
version = "0.7.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "982185af4edba23861639c25e46b36e077d2d60e553c20d1341c9fbf17fdb369"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fasttext"
|
||||||
|
version = "0.7.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fd26f3978ff7b22e594af9026912da644237fd7d360889d0c5a6ac8ec4f940c8"
|
||||||
|
dependencies = [
|
||||||
|
"cfasttext-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fasttext-classifier"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"fasttext",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "shlex"
|
||||||
|
version = "1.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
7
Cargo.toml
Normal file
7
Cargo.toml
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
[package]
|
||||||
|
name = "fasttext-classifier"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
fasttext = "0.7.8"
|
15404
data/cooking.stackexchange.id
Normal file
15404
data/cooking.stackexchange.id
Normal file
File diff suppressed because it is too large
Load diff
15404
data/cooking.stackexchange.txt
Normal file
15404
data/cooking.stackexchange.txt
Normal file
File diff suppressed because it is too large
Load diff
12404
data/cooking.train
Normal file
12404
data/cooking.train
Normal file
File diff suppressed because it is too large
Load diff
3000
data/cooking.valid
Normal file
3000
data/cooking.valid
Normal file
File diff suppressed because it is too large
Load diff
17
data/readme.txt
Normal file
17
data/readme.txt
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
The data in this archive is derived from the user-contributed content on the
|
||||||
|
Cooking Stack Exchange website (https://cooking.stackexchange.com/), used under
|
||||||
|
CC-BY-SA 3.0 (http://creativecommons.org/licenses/by-sa/3.0/).
|
||||||
|
|
||||||
|
The original data dump can be downloaded from:
|
||||||
|
https://archive.org/download/stackexchange/cooking.stackexchange.com.7z
|
||||||
|
and details about the dump obtained from:
|
||||||
|
https://archive.org/details/stackexchange
|
||||||
|
|
||||||
|
We distribute two files, under CC-BY-SA 3.0:
|
||||||
|
|
||||||
|
- cooking.stackexchange.txt, which contains all question titles and
|
||||||
|
their associated tags (one question per line, tags are prefixed by
|
||||||
|
the string "__label__");
|
||||||
|
|
||||||
|
- cooking.stackexchange.id, which contains the corresponding row IDs,
|
||||||
|
from the original data dump.
|
43
flake.lock
Normal file
43
flake.lock
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"crane": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1736101677,
|
||||||
|
"narHash": "sha256-iKOPq86AOWCohuzxwFy/MtC8PcSVGnrxBOvxpjpzrAY=",
|
||||||
|
"owner": "ipetkov",
|
||||||
|
"repo": "crane",
|
||||||
|
"rev": "61ba163d85e5adeddc7b3a69bb174034965965b2",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "ipetkov",
|
||||||
|
"repo": "crane",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1736012469,
|
||||||
|
"narHash": "sha256-/qlNWm/IEVVH7GfgAIyP6EsVZI6zjAx1cV5zNyrs+rI=",
|
||||||
|
"owner": "nixos",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "8f3e1f807051e32d8c95cd12b9b421623850a34d",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nixos",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"crane": "crane",
|
||||||
|
"nixpkgs": "nixpkgs"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
59
flake.nix
Normal file
59
flake.nix
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
{
|
||||||
|
description = "Fasttext experiments";
|
||||||
|
|
||||||
|
inputs = {
|
||||||
|
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
||||||
|
crane.url = "github:ipetkov/crane";
|
||||||
|
};
|
||||||
|
|
||||||
|
outputs =
|
||||||
|
{
|
||||||
|
self,
|
||||||
|
nixpkgs,
|
||||||
|
crane,
|
||||||
|
}:
|
||||||
|
let
|
||||||
|
systems = [
|
||||||
|
"x86_64-linux"
|
||||||
|
"aarch64-linux"
|
||||||
|
"aarch64-darwin"
|
||||||
|
"x86_64-darwin"
|
||||||
|
];
|
||||||
|
forAllSystems = nixpkgs.lib.genAttrs systems;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
packages = forAllSystems (
|
||||||
|
system:
|
||||||
|
let
|
||||||
|
pkgs = import nixpkgs { inherit system; };
|
||||||
|
craneLib = crane.mkLib pkgs;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
default = craneLib.buildPackage {
|
||||||
|
src = craneLib.cleanCargoSource ./.;
|
||||||
|
|
||||||
|
buildInputs = with pkgs; [
|
||||||
|
fasttext
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
);
|
||||||
|
devShells = forAllSystems (
|
||||||
|
system:
|
||||||
|
let
|
||||||
|
pkgs = import nixpkgs { inherit system; };
|
||||||
|
craneLib = crane.mkLib pkgs;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
default = craneLib.devShell {
|
||||||
|
inputsFrom = [ self.packages.${system}.default ];
|
||||||
|
|
||||||
|
packages = with pkgs; [
|
||||||
|
clippy
|
||||||
|
rust-analyzer
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
|
);
|
||||||
|
};
|
||||||
|
}
|
30
src/main.rs
Normal file
30
src/main.rs
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use fasttext::{Args, FastText, LossName, ModelName};
|
||||||
|
|
||||||
|
fn train_cooking_model() -> Result<(), String> {
|
||||||
|
let mut args = Args::new();
|
||||||
|
args.set_input("../data/cooking.train").unwrap();
|
||||||
|
args.set_model(ModelName::SUP);
|
||||||
|
args.set_lr(1.0);
|
||||||
|
args.set_epoch(25);
|
||||||
|
//args.set_loss(LossName::SOFTMAX);
|
||||||
|
let mut ft_model = FastText::new();
|
||||||
|
ft_model.train(&args).unwrap();
|
||||||
|
|
||||||
|
ft_model.save_model("../model-out/out.bin")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_cooking_model(filename: &Path) -> Vec<fasttext::Prediction> {
|
||||||
|
let mut text = FastText::new();
|
||||||
|
|
||||||
|
let _ = text.load_model(filename.to_str().unwrap());
|
||||||
|
text.predict("Safe temperatures to bake cookies at?", 3, 0.2)
|
||||||
|
.unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
//train_cooking_model().unwrap();
|
||||||
|
let result = test_cooking_model(Path::new("../model-out/out.bin"));
|
||||||
|
println!("{:?}", result);
|
||||||
|
}
|
Loading…
Reference in a new issue